A VCS repository archival tool
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

180 lines
5.4 KiB

  1. import abc
  2. import collections
  3. #import codearchiver.modules # In get_module_class
  4. import codearchiver.version
  5. import dataclasses
  6. import logging
  7. import queue
  8. import requests
  9. import time
  10. import typing
  11. logger = logging.getLogger(__name__)
  12. class InputURL:
  13. def __init__(self, url):
  14. self._url = url
  15. self._response = None
  16. @property
  17. def url(self):
  18. return self._url
  19. @property
  20. def content(self):
  21. if self._response is None:
  22. self._response = HttpClient().get(self.url)
  23. return self._response.text
  24. def __repr__(self):
  25. return f'{type(self).__module__}.{type(self).__name__}({self._url!r})'
  26. @dataclasses.dataclass
  27. class Result:
  28. '''Container for the result of a module'''
  29. id: str
  30. '''A unique ID for this result'''
  31. files: typing.List[str] = dataclasses.field(default_factory = list)
  32. '''List of filenames produced by the run'''
  33. submoduleResults: typing.List[typing.Tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
  34. '''List of related submodules and their results'''
  35. class HttpError(Exception):
  36. pass
  37. class HttpClient:
  38. defaultRetries: int = 3
  39. defaultUserAgent: str = f'codearchiver/{codearchiver.version.__version__}'
  40. def __init__(self, retries = None, userAgent = None):
  41. self._session = requests.Session()
  42. self._retries = retries if retries else self.defaultRetries
  43. self._userAgent = userAgent if userAgent else self.defaultUserAgent
  44. def request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
  45. mergedHeaders = {'User-Agent': self._userAgent}
  46. if headers:
  47. mergedHeaders.update(headers)
  48. headers = mergedHeaders
  49. for attempt in range(self._retries + 1):
  50. # The request is newly prepared on each retry because of potential cookie updates.
  51. req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
  52. logger.info(f'Retrieving {req.url}')
  53. logger.debug(f'... with headers: {headers!r}')
  54. if data:
  55. logger.debug(f'... with data: {data!r}')
  56. try:
  57. r = self._session.send(req, timeout = timeout)
  58. except requests.exceptions.RequestException as exc:
  59. if attempt < self._retries:
  60. retrying = ', retrying'
  61. level = logging.WARNING
  62. else:
  63. retrying = ''
  64. level = logging.ERROR
  65. logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
  66. else:
  67. if responseOkCallback is not None:
  68. success, msg = responseOkCallback(r)
  69. else:
  70. success, msg = (True, None)
  71. msg = f': {msg}' if msg else ''
  72. if success:
  73. logger.debug(f'{req.url} retrieved successfully{msg}')
  74. return r
  75. else:
  76. if attempt < self._retries:
  77. retrying = ', retrying'
  78. level = logging.WARNING
  79. else:
  80. retrying = ''
  81. level = logging.ERROR
  82. logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
  83. if attempt < self._retries:
  84. sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
  85. logger.info(f'Waiting {sleepTime:.0f} seconds')
  86. time.sleep(sleepTime)
  87. else:
  88. msg = f'{self._retries + 1} requests to {req.url} failed, giving up.'
  89. logger.fatal(msg)
  90. raise HttpError(msg)
  91. raise RuntimeError('Reached unreachable code')
  92. def get(self, *args, **kwargs):
  93. return self.request('GET', *args, **kwargs)
  94. def post(self, *args, **kwargs):
  95. return self.request('POST', *args, **kwargs)
  96. class Module:
  97. '''An abstract base class for a module.'''
  98. @staticmethod
  99. def matches(inputUrl: InputURL) -> bool:
  100. '''Whether or not this module is for handling `inputUrl`.'''
  101. return False
  102. def __init__(self, inputUrl, id_ = None):
  103. self._inputUrl = inputUrl
  104. self._url = inputUrl.url
  105. self._id = id_
  106. self._httpClient = HttpClient()
  107. @abc.abstractmethod
  108. def process(self) -> Result:
  109. '''Perform the relevant retrieval(s)'''
  110. def __repr__(self):
  111. return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r})'
  112. def get_module_class(inputUrl: InputURL) -> typing.Type['Module']:
  113. '''Get the Module class most suitable for handling `inputUrl`.'''
  114. # Ensure that modules are imported
  115. # This can't be done at the top because the modules need to refer back to the Module class.
  116. import codearchiver.modules
  117. # Collect all the Module subclasses and their inheritance level
  118. modules = {} # Module -> level:int
  119. q = queue.Queue()
  120. q.put_nowait((Module, 0))
  121. while not q.empty():
  122. class_, level = q.get_nowait()
  123. for c in class_.__subclasses__():
  124. logger.debug(f'Found module {c.__module__}.{c.__name__} at level {level + 1}')
  125. modules[c] = level + 1 # Implicitly only keeps the highest level, i.e. deepest inheritance
  126. q.put_nowait((c, level + 1))
  127. # Restructure into level->[modules] mapping
  128. levels = collections.defaultdict(list)
  129. for class_, level in modules.items():
  130. levels[level].append(class_)
  131. # Process in descending level order
  132. for level in reversed(levels):
  133. matches = [class_ for class_ in levels[level] if class_.matches(inputUrl)]
  134. if len(matches) >= 2:
  135. logger.warning('Multiple matching modules for input URL, using the first found')
  136. logger.debug(f'Matching modules at level {level}: {matches!r}')
  137. logger.debug(f'Modules: {levels!r}')
  138. if matches:
  139. logger.info(f'Selecting module {matches[0].__module__}.{matches[0].__name__}')
  140. return matches[0]
  141. def get_module_instance(inputUrl: InputURL, **kwargs) -> 'Module':
  142. '''Get an instance of the Module class most suitable for handling `inputUrl`.'''
  143. return get_module_class(inputUrl)(inputUrl, **kwargs)