A VCS repository archival tool
Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

172 linhas
5.2 KiB

  1. import abc
  2. import collections
  3. #import codearchiver.modules # In get_module_class
  4. import codearchiver.version
  5. import dataclasses
  6. import logging
  7. import queue
  8. import requests
  9. import time
  10. import typing
  11. logger = logging.getLogger(__name__)
  12. class InputURL:
  13. def __init__(self, url):
  14. self._url = url
  15. self._response = None
  16. @property
  17. def url(self):
  18. return self._url
  19. @property
  20. def content(self):
  21. if self._response is None:
  22. self._response = HttpClient().get(self.url)
  23. return self._response.text
  24. @dataclasses.dataclass
  25. class Result:
  26. '''Container for the result of a module'''
  27. id: str
  28. '''A unique ID for this result'''
  29. files: typing.List[str] = dataclasses.field(default_factory = list)
  30. '''List of filenames produced by the run'''
  31. submodules: typing.List['Module'] = dataclasses.field(default_factory = list)
  32. '''List of related submodules that need to be run as well'''
  33. class HttpError(Exception):
  34. pass
  35. class HttpClient:
  36. defaultRetries: int = 3
  37. defaultUserAgent: str = f'codearchiver/{codearchiver.version.__version__}'
  38. def __init__(self, retries = None, userAgent = None):
  39. self._session = requests.Session()
  40. self._retries = retries if retries else self.defaultRetries
  41. self._userAgent = userAgent if userAgent else self.defaultUserAgent
  42. def request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
  43. mergedHeaders = {'User-Agent': self._userAgent}
  44. if headers:
  45. mergedHeaders.update(headers)
  46. headers = mergedHeaders
  47. for attempt in range(self._retries + 1):
  48. # The request is newly prepared on each retry because of potential cookie updates.
  49. req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
  50. logger.info(f'Retrieving {req.url}')
  51. logger.debug(f'... with headers: {headers!r}')
  52. if data:
  53. logger.debug(f'... with data: {data!r}')
  54. try:
  55. r = self._session.send(req, timeout = timeout)
  56. except requests.exceptions.RequestException as exc:
  57. if attempt < self._retries:
  58. retrying = ', retrying'
  59. level = logging.WARNING
  60. else:
  61. retrying = ''
  62. level = logging.ERROR
  63. logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
  64. else:
  65. if responseOkCallback is not None:
  66. success, msg = responseOkCallback(r)
  67. else:
  68. success, msg = (True, None)
  69. msg = f': {msg}' if msg else ''
  70. if success:
  71. logger.debug(f'{req.url} retrieved successfully{msg}')
  72. return r
  73. else:
  74. if attempt < self._retries:
  75. retrying = ', retrying'
  76. level = logging.WARNING
  77. else:
  78. retrying = ''
  79. level = logging.ERROR
  80. logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
  81. if attempt < self._retries:
  82. sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
  83. logger.info(f'Waiting {sleepTime:.0f} seconds')
  84. time.sleep(sleepTime)
  85. else:
  86. msg = f'{self._retries + 1} requests to {req.url} failed, giving up.'
  87. logger.fatal(msg)
  88. raise HttpError(msg)
  89. raise RuntimeError('Reached unreachable code')
  90. def get(self, *args, **kwargs):
  91. return self.request('GET', *args, **kwargs)
  92. def post(self, *args, **kwargs):
  93. return self.request('POST', *args, **kwargs)
  94. class Module:
  95. '''An abstract base class for a module.'''
  96. @staticmethod
  97. def matches(inputUrl: InputURL) -> bool:
  98. '''Whether or not this module is for handling `inputUrl`.'''
  99. return False
  100. def __init__(self, inputUrl):
  101. self._inputUrl = inputUrl
  102. self._httpClient = HttpClient()
  103. @abc.abstractmethod
  104. def process(self) -> Result:
  105. '''Perform the relevant retrieval(s)'''
  106. def get_module_class(inputUrl: InputURL) -> typing.Type['Module']:
  107. '''Get the Module class most suitable for handling `inputUrl`.'''
  108. # Ensure that modules are imported
  109. # This can't be done at the top because the modules need to refer back to the Module class.
  110. import codearchiver.modules
  111. # Collect all the Module subclasses and their inheritance level
  112. modules = {} # Module -> level:int
  113. q = queue.Queue()
  114. q.put_nowait((Module, 0))
  115. while not q.empty():
  116. class_, level = q.get_nowait()
  117. for c in class_.__subclasses__():
  118. logger.debug(f'Found module {c.__module__}.{c.__name__} at level {level + 1}')
  119. modules[c] = level + 1 # Implicitly only keeps the highest level, i.e. deepest inheritance
  120. q.put_nowait((c, level + 1))
  121. # Restructure into level->[modules] mapping
  122. levels = collections.defaultdict(list)
  123. for class_, level in modules.items():
  124. levels[level].append(class_)
  125. # Process in descending level order
  126. for level in reversed(levels):
  127. matches = [class_ for class_ in levels[level] if class_.matches(inputUrl)]
  128. if len(matches) >= 2:
  129. logger.warning('Multiple matching modules for input URL, using the first found')
  130. logger.debug(f'Matching modules at level {level}: {matches!r}')
  131. logger.debug(f'Modules: {levels!r}')
  132. if matches:
  133. logger.info(f'Selecting module {matches[0].__module__}.{matches[0].__name__}')
  134. return matches[0]
  135. def get_module_instance(inputUrl: InputURL, **kwargs) -> 'Module':
  136. '''Get an instance of the Module class most suitable for handling `inputUrl`.'''
  137. return get_module_class(inputUrl)(inputUrl, **kwargs)