A VCS repository archival tool

import abc
import collections
#import codearchiver.modules # In get_module_class
import codearchiver.version
import logging
import queue
import requests
import time
import typing


logger = logging.getLogger(__name__)

class InputURL:
	'''Wrapper around an input URL, fetching and caching its contents on demand'''

	def __init__(self, url):
		self._url = url
		self._response = None

	@property
	def url(self):
		return self._url

	@property
	def content(self):
		if self._response is None:
			self._response = HttpClient().get(self.url)
		return self._response.text


class Result(typing.NamedTuple):
	'''Container for the result of a module'''

	id: str
	'''A unique ID for this result'''

	files: typing.List[str] = []
	'''List of filenames produced by the run'''

	submodules: typing.List['Module'] = []
	'''List of related submodules that need to be run as well'''


class HttpError(Exception):
	'''Raised by HttpClient when a request keeps failing after exhausting all retries'''

class HttpClient:
	defaultRetries: int = 3
	defaultUserAgent: str = f'codearchiver/{codearchiver.version.__version__}'

	def __init__(self, retries = None, userAgent = None):
		self._session = requests.Session()
		self._retries = retries if retries else self.defaultRetries
		self._userAgent = userAgent if userAgent else self.defaultUserAgent

	def request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
		'''Make an HTTP request with retries; `responseOkCallback`, if provided, receives the response and must return a `(success, message)` tuple.'''

		mergedHeaders = {'User-Agent': self._userAgent}
		if headers:
			mergedHeaders.update(headers)
		headers = mergedHeaders

		for attempt in range(self._retries + 1):
			# The request is newly prepared on each retry because of potential cookie updates.
			req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
			logger.info(f'Retrieving {req.url}')
			logger.debug(f'... with headers: {headers!r}')
			if data:
				logger.debug(f'... with data: {data!r}')
			try:
				r = self._session.send(req, timeout = timeout)
			except requests.exceptions.RequestException as exc:
				if attempt < self._retries:
					retrying = ', retrying'
					level = logging.WARNING
				else:
					retrying = ''
					level = logging.ERROR
				logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
			else:
				if responseOkCallback is not None:
					success, msg = responseOkCallback(r)
				else:
					success, msg = (True, None)
				msg = f': {msg}' if msg else ''

				if success:
					logger.debug(f'{req.url} retrieved successfully{msg}')
					return r
				else:
					if attempt < self._retries:
						retrying = ', retrying'
						level = logging.WARNING
					else:
						retrying = ''
						level = logging.ERROR
					logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
			if attempt < self._retries:
				sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
				logger.info(f'Waiting {sleepTime:.0f} seconds')
				time.sleep(sleepTime)
			else:
				msg = f'{self._retries + 1} requests to {req.url} failed, giving up.'
				logger.fatal(msg)
				raise HttpError(msg)
		raise RuntimeError('Reached unreachable code')

	def get(self, *args, **kwargs):
		return self.request('GET', *args, **kwargs)

	def post(self, *args, **kwargs):
		return self.request('POST', *args, **kwargs)
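

# Usage sketch for HttpClient (illustrative addition, not part of the original module): it shows the
# expected responseOkCallback contract, i.e. the callback receives the response and returns a
# (success, message) tuple, and a falsy success triggers the retry/backoff logic above.
# The helper name and URL are hypothetical; the function is never called on import.
def _example_http_client_usage():
	def _status_ok(r):
		# Accept only HTTP 200; anything else is logged and retried
		if r.status_code == 200:
			return True, None
		return False, f'unexpected HTTP status {r.status_code}'

	client = HttpClient(retries = 2)
	return client.get('https://example.org/', responseOkCallback = _status_ok)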

class Module(abc.ABC): # ABCMeta is required for abc.abstractmethod below to take effect
	'''An abstract base class for a module.'''

	@staticmethod
	def matches(inputUrl: InputURL) -> bool:
		'''Whether or not this module is for handling `inputUrl`.'''
		return False

	def __init__(self, inputUrl):
		self._inputUrl = inputUrl
		self._httpClient = HttpClient()

	@abc.abstractmethod
	def process(self) -> Result:
		'''Perform the relevant retrieval(s)'''
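

# Sketch of a concrete module (illustrative addition, not a real codearchiver module). The class is
# defined inside a function so that merely importing this file does not register it as a Module
# subclass, which would otherwise make get_module_class below consider it.
def _example_module_sketch():
	class ExampleModule(Module):
		'''Hypothetical module for repositories hosted on example.org'''

		@staticmethod
		def matches(inputUrl: InputURL) -> bool:
			return inputUrl.url.startswith('https://example.org/')

		def process(self) -> Result:
			# A real module would retrieve the repository here and list the files it wrote.
			return Result(id = self._inputUrl.url, files = [], submodules = [])

	return ExampleModule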

def get_module_class(inputUrl: InputURL) -> typing.Optional[typing.Type['Module']]:
	'''Get the Module class most suitable for handling `inputUrl`, or None if no module matches.'''

	# Ensure that modules are imported
	# This can't be done at the top because the modules need to refer back to the Module class.
	import codearchiver.modules

	# Collect all the Module subclasses and their inheritance level
	modules = {} # Module -> level:int
	q = queue.Queue()
	q.put_nowait((Module, 0))
	while not q.empty():
		class_, level = q.get_nowait()
		for c in class_.__subclasses__():
			logger.debug(f'Found module {c.__module__}.{c.__name__} at level {level + 1}')
			modules[c] = level + 1 # Implicitly only keeps the highest level, i.e. deepest inheritance
			q.put_nowait((c, level + 1))

	# Restructure into level->[modules] mapping
	levels = collections.defaultdict(list)
	for class_, level in modules.items():
		levels[level].append(class_)

	# Process in descending level order, i.e. the most specific subclasses first
	for level in sorted(levels, reverse = True):
		matches = [class_ for class_ in levels[level] if class_.matches(inputUrl)]
		if len(matches) >= 2:
			logger.warning('Multiple matching modules for input URL, using the first found')
			logger.debug(f'Matching modules at level {level}: {matches!r}')
			logger.debug(f'Modules: {levels!r}')
		if matches:
			logger.info(f'Selecting module {matches[0].__module__}.{matches[0].__name__}')
			return matches[0]


def get_module_instance(inputUrl: InputURL, **kwargs) -> 'Module':
	'''Get an instance of the Module class most suitable for handling `inputUrl`.'''
	return get_module_class(inputUrl)(inputUrl, **kwargs)
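

# End-to-end usage sketch (illustrative addition; the URL is hypothetical). Note that
# get_module_class returns None when nothing in codearchiver.modules matches, in which case
# get_module_instance raises a TypeError, so real callers should handle that case.
def _example_archival_run():
	inputUrl = InputURL('https://example.org/user/repository.git')
	module = get_module_instance(inputUrl)
	result = module.process()
	return result.id, result.files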