A VCS repository archival tool
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

176 строки
5.2 KiB

  1. import abc
  2. import collections
  3. #import codearchiver.modules # In get_module_class
  4. import codearchiver.version
  5. import dataclasses
  6. import logging
  7. import queue
  8. import requests
  9. import time
  10. import typing
  11. logger = logging.getLogger(__name__)
  12. class InputURL:
  13. def __init__(self, url):
  14. self._url = url
  15. self._response = None
  16. @property
  17. def url(self):
  18. return self._url
  19. @property
  20. def content(self):
  21. if self._response is None:
  22. self._response = HttpClient().get(self.url)
  23. return self._response.text
  24. def __repr__(self):
  25. return f'{type(self).__module__}.{type(self).__name__}({self._url!r})'
  26. @dataclasses.dataclass
  27. class Result:
  28. '''Container for the result of a module'''
  29. id: str
  30. '''A unique ID for this result'''
  31. files: typing.List[str] = dataclasses.field(default_factory = list)
  32. '''List of filenames produced by the run'''
  33. submoduleResults: typing.List[typing.Tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
  34. '''List of related submodules and their results'''
  35. class HttpError(Exception):
  36. pass
  37. class HttpClient:
  38. defaultRetries: int = 3
  39. defaultUserAgent: str = f'codearchiver/{codearchiver.version.__version__}'
  40. def __init__(self, retries = None, userAgent = None):
  41. self._session = requests.Session()
  42. self._retries = retries if retries else self.defaultRetries
  43. self._userAgent = userAgent if userAgent else self.defaultUserAgent
  44. def request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
  45. mergedHeaders = {'User-Agent': self._userAgent}
  46. if headers:
  47. mergedHeaders.update(headers)
  48. headers = mergedHeaders
  49. for attempt in range(self._retries + 1):
  50. # The request is newly prepared on each retry because of potential cookie updates.
  51. req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
  52. logger.info(f'Retrieving {req.url}')
  53. logger.debug(f'... with headers: {headers!r}')
  54. if data:
  55. logger.debug(f'... with data: {data!r}')
  56. try:
  57. r = self._session.send(req, timeout = timeout)
  58. except requests.exceptions.RequestException as exc:
  59. if attempt < self._retries:
  60. retrying = ', retrying'
  61. level = logging.WARNING
  62. else:
  63. retrying = ''
  64. level = logging.ERROR
  65. logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
  66. else:
  67. if responseOkCallback is not None:
  68. success, msg = responseOkCallback(r)
  69. else:
  70. success, msg = (True, None)
  71. msg = f': {msg}' if msg else ''
  72. if success:
  73. logger.debug(f'{req.url} retrieved successfully{msg}')
  74. return r
  75. else:
  76. if attempt < self._retries:
  77. retrying = ', retrying'
  78. level = logging.WARNING
  79. else:
  80. retrying = ''
  81. level = logging.ERROR
  82. logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
  83. if attempt < self._retries:
  84. sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
  85. logger.info(f'Waiting {sleepTime:.0f} seconds')
  86. time.sleep(sleepTime)
  87. else:
  88. msg = f'{self._retries + 1} requests to {req.url} failed, giving up.'
  89. logger.fatal(msg)
  90. raise HttpError(msg)
  91. raise RuntimeError('Reached unreachable code')
  92. def get(self, *args, **kwargs):
  93. return self.request('GET', *args, **kwargs)
  94. def post(self, *args, **kwargs):
  95. return self.request('POST', *args, **kwargs)
  96. class Module:
  97. '''An abstract base class for a module.'''
  98. name: typing.Optional[str] = None
  99. '''The name of the module. Modules without a name are ignored, and names must be unique.'''
  100. @staticmethod
  101. def matches(inputUrl: InputURL) -> bool:
  102. '''Whether or not this module is for handling `inputUrl`.'''
  103. return False
  104. def __init__(self, inputUrl, id_ = None):
  105. self._inputUrl = inputUrl
  106. self._url = inputUrl.url
  107. self._id = id_
  108. self._httpClient = HttpClient()
  109. @abc.abstractmethod
  110. def process(self) -> Result:
  111. '''Perform the relevant retrieval(s)'''
  112. def __repr__(self):
  113. return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r})'
  114. def get_module_class(inputUrl: InputURL) -> typing.Type[Module]:
  115. '''Get the Module class most suitable for handling `inputUrl`.'''
  116. # Ensure that modules are imported
  117. # This can't be done at the top because the modules need to refer back to the Module class.
  118. import codearchiver.modules
  119. # Collect all the Module subclasses
  120. modules = set()
  121. q = queue.Queue()
  122. q.put_nowait(Module)
  123. while not q.empty():
  124. class_ = q.get_nowait()
  125. for c in class_.__subclasses__():
  126. logger.debug(f'Found module {c.__module__}.{c.__name__}')
  127. modules.add(c)
  128. q.put_nowait(c)
  129. matches = [class_ for class_ in modules if class_.matches(inputUrl)]
  130. if len(matches) >= 2:
  131. logger.error('Multiple matching modules for input URL')
  132. logger.debug(f'Matching modules: {matches!r}')
  133. raise RuntimeError('Multiple matching modules for input URL')
  134. if matches:
  135. logger.info(f'Selecting module {matches[0].__module__}.{matches[0].__name__}')
  136. return matches[0]
  137. raise RuntimeError('No matching modules for input URL')
  138. def get_module_instance(inputUrl: InputURL, **kwargs) -> Module:
  139. '''Get an instance of the Module class most suitable for handling `inputUrl`.'''
  140. return get_module_class(inputUrl)(inputUrl, **kwargs)