A VCS repository archival tool
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

238 linhas
7.7 KiB

  1. import abc
  2. import collections
  3. #import codearchiver.modules # In get_module_class
  4. import codearchiver.version
  5. import dataclasses
  6. import logging
  7. import queue
  8. import requests
  9. import time
  10. import typing
  11. import weakref
# Module-level logger named after this module; the classes and functions below log through it.
logger = logging.getLogger(__name__)
  13. class InputURL:
  14. def __init__(self, url):
  15. if 0 < url.find('+') < url.find('://'):
  16. # '+' and '://' appear in the URL in this order and there is at least one character each before the + as well as between the two
  17. self._moduleScheme, self._url = url.split('+', 1)
  18. else:
  19. self._moduleScheme = None
  20. self._url = url
  21. self._response = None
  22. @property
  23. def url(self):
  24. return self._url
  25. @property
  26. def moduleScheme(self):
  27. return self._moduleScheme
  28. @property
  29. def content(self):
  30. if self._response is None:
  31. self._response = HttpClient().get(self.url)
  32. return self._response.text
  33. def __repr__(self):
  34. return f'{type(self).__module__}.{type(self).__name__}({self._url!r})'
  35. @dataclasses.dataclass
  36. class Result:
  37. '''Container for the result of a module'''
  38. id: str
  39. '''A unique ID for this result'''
  40. files: typing.List[str] = dataclasses.field(default_factory = list)
  41. '''List of filenames produced by the run'''
  42. submoduleResults: typing.List[typing.Tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
  43. '''List of related submodules and their results'''
  44. class HttpError(Exception):
  45. pass
  46. class HttpClient:
  47. defaultRetries: int = 3
  48. defaultUserAgent: str = f'codearchiver/{codearchiver.version.__version__}'
  49. def __init__(self, retries = None, userAgent = None):
  50. self._session = requests.Session()
  51. self._retries = retries if retries else self.defaultRetries
  52. self._userAgent = userAgent if userAgent else self.defaultUserAgent
  53. def request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
  54. mergedHeaders = {'User-Agent': self._userAgent}
  55. if headers:
  56. mergedHeaders.update(headers)
  57. headers = mergedHeaders
  58. for attempt in range(self._retries + 1):
  59. # The request is newly prepared on each retry because of potential cookie updates.
  60. req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
  61. logger.info(f'Retrieving {req.url}')
  62. logger.debug(f'... with headers: {headers!r}')
  63. if data:
  64. logger.debug(f'... with data: {data!r}')
  65. try:
  66. r = self._session.send(req, timeout = timeout)
  67. except requests.exceptions.RequestException as exc:
  68. if attempt < self._retries:
  69. retrying = ', retrying'
  70. level = logging.WARNING
  71. else:
  72. retrying = ''
  73. level = logging.ERROR
  74. logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
  75. else:
  76. if responseOkCallback is not None:
  77. success, msg = responseOkCallback(r)
  78. else:
  79. success, msg = (True, None)
  80. msg = f': {msg}' if msg else ''
  81. if success:
  82. logger.debug(f'{req.url} retrieved successfully{msg}')
  83. return r
  84. else:
  85. if attempt < self._retries:
  86. retrying = ', retrying'
  87. level = logging.WARNING
  88. else:
  89. retrying = ''
  90. level = logging.ERROR
  91. logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
  92. if attempt < self._retries:
  93. sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
  94. logger.info(f'Waiting {sleepTime:.0f} seconds')
  95. time.sleep(sleepTime)
  96. else:
  97. msg = f'{self._retries + 1} requests to {req.url} failed, giving up.'
  98. logger.fatal(msg)
  99. raise HttpError(msg)
  100. raise RuntimeError('Reached unreachable code')
  101. def get(self, *args, **kwargs):
  102. return self.request('GET', *args, **kwargs)
  103. def post(self, *args, **kwargs):
  104. return self.request('POST', *args, **kwargs)
  105. class ModuleMeta(type):
  106. __modulesByName = {} # name -> Module class
  107. def __new__(cls, *args, **kwargs):
  108. class_ = super().__new__(cls, *args, **kwargs)
  109. if class_.name is not None:
  110. if class_.name.strip('abcdefghijklmnopqrstuvwxyz_-') != '':
  111. raise RuntimeError(f'Invalid class name: {class_.name!r}')
  112. if class_.name in cls.__modulesByName:
  113. raise RuntimeError(f'Class name collision: {class_.name!r} is already known')
  114. cls.__modulesByName[class_.name] = weakref.ref(class_)
  115. logger.info(f'Found {class_.name!r} module {class_.__module__}.{class_.__name__}')
  116. else:
  117. logger.info(f'Found nameless module {class_.__module__}.{class_.__name__}')
  118. return class_
  119. @classmethod
  120. def get_module_by_name(cls, name):
  121. if classRef := cls.__modulesByName.get(name):
  122. class_ = classRef()
  123. if class_ is None:
  124. logger.info(f'Module {name!r} is gone, dropping')
  125. del cls.__modulesByName[name]
  126. return class_
  127. @classmethod
  128. def iter_modules(cls):
  129. # Housekeeping first: remove dead modules
  130. for name in list(cls.__modulesByName): # create a copy of the names list so the dict can be modified in the loop
  131. if cls.__modulesByName[name]() is None:
  132. logger.info(f'Module {name!r} is gone, dropping')
  133. del cls.__modulesByName[name]
  134. for name, classRef in cls.__modulesByName.items():
  135. class_ = classRef()
  136. if class_ is None:
  137. # Module class no longer exists, skip
  138. # Even though dead modules are removed above, it's possible that the code consuming this iterator drops/deletes modules.
  139. continue
  140. yield class_
  141. @classmethod
  142. def drop(cls, module):
  143. if module.name is not None and module.name in cls.__modulesByName:
  144. del cls.__modulesByName[module.name]
  145. logger.info(f'Module {module.name!r} dropped')
  146. def __del__(self, *args, **kwargs):
  147. if self.name is not None and self.name in type(self).__modulesByName:
  148. logger.info(f'Module {self.name!r} is being destroyed, dropping')
  149. del type(self).__modulesByName[self.name]
  150. # type has no __del__ method, no need to call it.
  151. class Module(metaclass = ModuleMeta):
  152. '''An abstract base class for a module.'''
  153. name: typing.Optional[str] = None
  154. '''The name of the module. Modules without a name are ignored. Names must be unique and may only contain a-z, underscores, and hyphens.'''
  155. @staticmethod
  156. def matches(inputUrl: InputURL) -> bool:
  157. '''Whether or not this module is for handling `inputUrl`.'''
  158. return False
  159. def __init__(self, inputUrl, id_ = None):
  160. self._inputUrl = inputUrl
  161. self._url = inputUrl.url
  162. self._id = id_
  163. self._httpClient = HttpClient()
  164. @abc.abstractmethod
  165. def process(self) -> Result:
  166. '''Perform the relevant retrieval(s)'''
  167. def __repr__(self):
  168. return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r})'
  169. def get_module_class(inputUrl: InputURL) -> typing.Type[Module]:
  170. '''Get the Module class most suitable for handling `inputUrl`.'''
  171. # Ensure that modules are imported
  172. # This can't be done at the top because the modules need to refer back to the Module class.
  173. import codearchiver.modules
  174. # Check if the URL references one of the modules directly
  175. if inputUrl.moduleScheme:
  176. if module := ModuleMeta.get_module_by_name(inputUrl.moduleScheme):
  177. logger.info(f'Selecting module {module.__module__}.{module.__name__}')
  178. return module
  179. else:
  180. raise RuntimeError(f'No module with name {inputUrl.moduleScheme!r} exists')
  181. # Check if exactly one of the modules matches
  182. matches = [class_ for class_ in ModuleMeta.iter_modules() if class_.matches(inputUrl)]
  183. if len(matches) >= 2:
  184. logger.error('Multiple matching modules for input URL')
  185. logger.debug(f'Matching modules: {matches!r}')
  186. raise RuntimeError('Multiple matching modules for input URL')
  187. if matches:
  188. logger.info(f'Selecting module {matches[0].__module__}.{matches[0].__name__}')
  189. return matches[0]
  190. raise RuntimeError('No matching modules for input URL')
  191. def get_module_instance(inputUrl: InputURL, **kwargs) -> Module:
  192. '''Get an instance of the Module class most suitable for handling `inputUrl`.'''
  193. return get_module_class(inputUrl)(inputUrl, **kwargs)