A VCS repository archival tool
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

206 lines
6.3 KiB

  1. import abc
  2. import collections
  3. #import codearchiver.modules # In get_module_class
  4. import codearchiver.version
  5. import dataclasses
  6. import logging
  7. import queue
  8. import requests
  9. import time
  10. import typing
  11. logger = logging.getLogger(__name__)
  12. class InputURL:
  13. def __init__(self, url):
  14. if 0 < url.find('+') < url.find('://'):
  15. # '+' and '://' appear in the URL in this order and there is at least one character each before the + as well as between the two
  16. self._moduleScheme, self._url = url.split('+', 1)
  17. else:
  18. self._moduleScheme = None
  19. self._url = url
  20. self._response = None
  21. @property
  22. def url(self):
  23. return self._url
  24. @property
  25. def moduleScheme(self):
  26. return self._moduleScheme
  27. @property
  28. def content(self):
  29. if self._response is None:
  30. self._response = HttpClient().get(self.url)
  31. return self._response.text
  32. def __repr__(self):
  33. return f'{type(self).__module__}.{type(self).__name__}({self._url!r})'
  34. @dataclasses.dataclass
  35. class Result:
  36. '''Container for the result of a module'''
  37. id: str
  38. '''A unique ID for this result'''
  39. files: typing.List[str] = dataclasses.field(default_factory = list)
  40. '''List of filenames produced by the run'''
  41. submoduleResults: typing.List[typing.Tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
  42. '''List of related submodules and their results'''
  43. class HttpError(Exception):
  44. pass
  45. class HttpClient:
  46. defaultRetries: int = 3
  47. defaultUserAgent: str = f'codearchiver/{codearchiver.version.__version__}'
  48. def __init__(self, retries = None, userAgent = None):
  49. self._session = requests.Session()
  50. self._retries = retries if retries else self.defaultRetries
  51. self._userAgent = userAgent if userAgent else self.defaultUserAgent
  52. def request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
  53. mergedHeaders = {'User-Agent': self._userAgent}
  54. if headers:
  55. mergedHeaders.update(headers)
  56. headers = mergedHeaders
  57. for attempt in range(self._retries + 1):
  58. # The request is newly prepared on each retry because of potential cookie updates.
  59. req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
  60. logger.info(f'Retrieving {req.url}')
  61. logger.debug(f'... with headers: {headers!r}')
  62. if data:
  63. logger.debug(f'... with data: {data!r}')
  64. try:
  65. r = self._session.send(req, timeout = timeout)
  66. except requests.exceptions.RequestException as exc:
  67. if attempt < self._retries:
  68. retrying = ', retrying'
  69. level = logging.WARNING
  70. else:
  71. retrying = ''
  72. level = logging.ERROR
  73. logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
  74. else:
  75. if responseOkCallback is not None:
  76. success, msg = responseOkCallback(r)
  77. else:
  78. success, msg = (True, None)
  79. msg = f': {msg}' if msg else ''
  80. if success:
  81. logger.debug(f'{req.url} retrieved successfully{msg}')
  82. return r
  83. else:
  84. if attempt < self._retries:
  85. retrying = ', retrying'
  86. level = logging.WARNING
  87. else:
  88. retrying = ''
  89. level = logging.ERROR
  90. logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
  91. if attempt < self._retries:
  92. sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
  93. logger.info(f'Waiting {sleepTime:.0f} seconds')
  94. time.sleep(sleepTime)
  95. else:
  96. msg = f'{self._retries + 1} requests to {req.url} failed, giving up.'
  97. logger.fatal(msg)
  98. raise HttpError(msg)
  99. raise RuntimeError('Reached unreachable code')
  100. def get(self, *args, **kwargs):
  101. return self.request('GET', *args, **kwargs)
  102. def post(self, *args, **kwargs):
  103. return self.request('POST', *args, **kwargs)
  104. class Module:
  105. '''An abstract base class for a module.'''
  106. name: typing.Optional[str] = None
  107. '''The name of the module. Modules without a name are ignored, and names must be unique.'''
  108. @staticmethod
  109. def matches(inputUrl: InputURL) -> bool:
  110. '''Whether or not this module is for handling `inputUrl`.'''
  111. return False
  112. def __init__(self, inputUrl, id_ = None):
  113. self._inputUrl = inputUrl
  114. self._url = inputUrl.url
  115. self._id = id_
  116. self._httpClient = HttpClient()
  117. @abc.abstractmethod
  118. def process(self) -> Result:
  119. '''Perform the relevant retrieval(s)'''
  120. def __repr__(self):
  121. return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r})'
  122. def get_module_class(inputUrl: InputURL) -> typing.Type[Module]:
  123. '''Get the Module class most suitable for handling `inputUrl`.'''
  124. # Ensure that modules are imported
  125. # This can't be done at the top because the modules need to refer back to the Module class.
  126. import codearchiver.modules
  127. # Collect all the Module subclasses and names
  128. modules = set()
  129. modulesByName = {} # name: str -> List[Module]
  130. q = queue.Queue()
  131. q.put_nowait(Module)
  132. while not q.empty():
  133. class_ = q.get_nowait()
  134. for c in class_.__subclasses__():
  135. if c.name is not None:
  136. logger.debug(f'Found {c.name!r} module {c.__module__}.{c.__name__}')
  137. modules.add(c)
  138. if c.name not in modulesByName:
  139. modulesByName[c.name] = []
  140. modulesByName[c.name].append(c)
  141. else:
  142. logger.debug(f'Found nameless module {c.__module__}.{c.__name__}')
  143. q.put_nowait(c)
  144. # Verify that there are no module name collisions
  145. if any(len(x) > 1 for x in modulesByName.values()):
  146. raise RuntimeError(f'Found multiple modules with the same name')
  147. # Check if the URL references one of the modules directly
  148. if inputUrl.moduleScheme:
  149. if inputUrl.moduleScheme in modulesByName:
  150. module = modulesByName[inputUrl.moduleScheme][0]
  151. logger.info(f'Selecting module {module.__module__}.{module.__name__}')
  152. return module
  153. else:
  154. raise RuntimeError(f'No module with name {inputUrl.moduleScheme!r} exists')
  155. # Check if exactly one of the modules matches
  156. matches = [class_ for class_ in modules if class_.matches(inputUrl)]
  157. if len(matches) >= 2:
  158. logger.error('Multiple matching modules for input URL')
  159. logger.debug(f'Matching modules: {matches!r}')
  160. raise RuntimeError('Multiple matching modules for input URL')
  161. if matches:
  162. logger.info(f'Selecting module {matches[0].__module__}.{matches[0].__name__}')
  163. return matches[0]
  164. raise RuntimeError('No matching modules for input URL')
  165. def get_module_instance(inputUrl: InputURL, **kwargs) -> Module:
  166. '''Get an instance of the Module class most suitable for handling `inputUrl`.'''
  167. return get_module_class(inputUrl)(inputUrl, **kwargs)