A VCS repository archival tool

426 lines
16 KiB

import abc
#import codearchiver.modules # In get_module_class
import codearchiver.storage
import codearchiver.version
import collections
import contextlib
import dataclasses
import datetime
import functools
import logging
import os
import queue
import requests
import time
import typing
import weakref


_logger = logging.getLogger(__name__)


class InputURL:
    '''
    An input URL

    This primarily exists so multiple modules can access the content behind the URL for checks in `Module.matches` without fetching it multiple times.
    It also handles the module name prefix in the scheme part of the URL. Note that `InputURL.url` is then the part without the module name.
    '''

    def __init__(self, url: str):
        if 0 < url.find('+') < url.find('://'):
            # '+' and '://' appear in the URL in this order, with at least one character before the '+' and at least one between the two
            self._moduleScheme, self._url = url.split('+', 1)
        else:
            self._moduleScheme = None
            self._url = url
        self._response = None

    @property
    def url(self) -> str:
        '''URL without the module scheme prefix (if any)'''
        return self._url

    @property
    def moduleScheme(self) -> typing.Optional[str]:
        '''Module scheme prefix (if one is included, else `None`)'''
        return self._moduleScheme

    @property
    def content(self) -> str:
        '''HTTP response body upon fetching the URL with GET'''
        if self._response is None:
            self._response = HttpClient().get(self.url)
        return self._response.text

    def __repr__(self):
        return f'{type(self).__module__}.{type(self).__name__}({self._url!r})'
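
# Usage sketch (not part of the upstream file): how the module scheme prefix is
# split off. The URL is hypothetical; `content` is only fetched on first access.
#
# >>> iu = InputURL('git+https://example.org/user/repo.git')
# >>> iu.moduleScheme
# 'git'
# >>> iu.url
# 'https://example.org/user/repo.git'
# >>> InputURL('https://example.org/user/repo.git').moduleScheme is None
# True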


@dataclasses.dataclass
class Result:
    '''Container for the result of a module'''

    id: str
    '''A unique ID for this result'''

    files: list[tuple[str, typing.Optional['Index']]] = dataclasses.field(default_factory = list)
    '''List of filenames produced by the run, optionally with an index'''

    submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
    '''List of related submodules and their results'''


class IndexValidationError(ValueError):
    pass


@dataclasses.dataclass
class IndexField:
    key: str
    required: bool
    repeatable: bool


class Index(list[tuple[str, str]]):
    '''An index (key-value mapping, possibly with repeated keys) of a file produced by a module'''

    fields: tuple[IndexField] = (
        IndexField('codearchiver version', required = True, repeatable = False),
        IndexField('Module', required = True, repeatable = False),
        IndexField('ID', required = True, repeatable = False),
        IndexField('Input URL', required = True, repeatable = False),
        IndexField('Filename', required = True, repeatable = False),
    )
    '''The fields for this index'''

    _allFieldsCache: typing.Optional[tuple[IndexField]] = None

    def append(self, *args):
        if len(args) == 1:
            args = args[0]
        return super().append(args)

    # This should be a @classmethod, too, but that's deprecated since Python 3.11.
    @property
    def _allFields(self):
        '''All fields known by this index, own ones and all from superclasses'''
        if type(self)._allFieldsCache is None:
            fields = []
            for cls in reversed(type(self).mro()):
                fields.extend(getattr(cls, 'fields', []))
            type(self)._allFieldsCache = tuple(fields)
        return type(self)._allFieldsCache

    def validate(self):
        '''Check that all keys and values in the index conform to the specification'''
        keyCounts = collections.Counter(key for key, _ in self)
        keys = set(keyCounts)
        permittedKeys = set(field.key for field in self._allFields)
        unrecognisedKeys = keys - permittedKeys
        if unrecognisedKeys:
            raise IndexValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}')
        requiredKeys = set(field.key for field in self._allFields if field.required)
        missingRequiredKeys = requiredKeys - keys
        if missingRequiredKeys:
            raise IndexValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}')
        repeatableKeys = set(field.key for field in self._allFields if field.repeatable)
        repeatedKeys = set(key for key, count in keyCounts.items() if count > 1)
        repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys
        if repeatedUnrepeatableKeys:
            raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')

    def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool:
        '''
        Check whether the criteria match this index

        Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the index.
        Multiple criteria may use the same key to perform an AND search.
        The index is a match if all criteria match.
        '''

        criteria = criteria.copy()
        _logger.debug(f'Searching index for {criteria!r}')
        keysOfInterest = set(key for key, _ in criteria)
        for key, value in self:
            if key not in keysOfInterest:
                continue
            _logger.debug(f'Potentially interesting entry: {key!r} = {value!r}')
            matched = [] # Indices to remove from remaining criteria
            for i, (keyCriterion, valueCriterion) in enumerate(criteria):
                if keyCriterion != key:
                    continue
                if isinstance(valueCriterion, str) and valueCriterion == value:
                    _logger.debug('Str match')
                    matched.append(i)
                elif isinstance(valueCriterion, tuple) and value in valueCriterion:
                    _logger.debug('Tuple match')
                    matched.append(i)
            for i in reversed(matched):
                _logger.debug(f'Matched remaining criterion {i}: {criteria[i]}')
                del criteria[i]
            if not criteria:
                break
        _logger.debug(f'Remaining unmatched criteria: {criteria!r}')
        return not bool(criteria)

    def serialise(self) -> str:
        '''Convert the index to a string suitable for e.g. a simple text file storage'''
        self.validate()
        return ''.join(f'{key}: {value}\n' for key, value in self)

    @classmethod
    def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True):
        '''Import a serialised index from a filename or file-like object'''
        if isinstance(f, (str, bytes, os.PathLike)):
            cm = open(f, 'r')
        else:
            cm = contextlib.nullcontext(f)
        with cm as fp:
            o = cls((key, value[:-1]) for key, value in map(functools.partial(str.split, sep = ': '), fp))
        if validate:
            o.validate()
        return o
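
# Usage sketch (not part of the upstream file): building, validating, and
# round-tripping an index. All values below are made up for illustration.
#
# >>> import io
# >>> idx = Index()
# >>> idx.append('codearchiver version', codearchiver.version.__version__)
# >>> idx.append('Module', 'git')
# >>> idx.append('ID', 'git_example_20240101T000000Z')
# >>> idx.append('Input URL', 'https://example.org/user/repo.git')
# >>> idx.append(('Filename', 'repo.tar'))  # a single (key, value) tuple works, too
# >>> idx.validate()  # would raise IndexValidationError if a required key were missing
# >>> idx.matches([('Module', 'git'), ('Filename', ('repo.tar', 'repo.zip'))])
# True
# >>> Index.deserialise(io.StringIO(idx.serialise())) == idx
# True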


class HttpError(Exception):
    '''An HTTP request failed too many times.'''


class HttpClient:
    '''A thin HTTP client wrapper around Requests with exponential-backoff retries and a default user agent for all requests.'''

    defaultRetries: int = 3
    '''Default number of retries on errors unless overridden when creating the HttpClient object'''

    defaultUserAgent: str = f'codearchiver/{codearchiver.version.__version__}'
    '''Default user agent unless overridden on instantiation or via the headers kwarg'''

    def __init__(self, retries: typing.Optional[int] = None, userAgent: typing.Optional[str] = None):
        self._session = requests.Session()
        self._retries = retries if retries else self.defaultRetries
        self._userAgent = userAgent if userAgent else self.defaultUserAgent

    def request(self,
                method,
                url,
                params = None,
                data = None,
                headers: typing.Optional[dict[str, str]] = None,
                timeout: int = 10,
                responseOkCallback: typing.Optional[typing.Callable[[requests.Response], tuple[bool, typing.Optional[str]]]] = None,
                ) -> requests.Response:
        '''
        Make an HTTP request

        For the details on `method`, `url`, `params`, and `data`, refer to the Requests documentation on the constructor of `requests.Request`.
        For details on `timeout`, see `requests.adapters.HTTPAdapter.send`.
        `headers` can be used to specify any HTTP headers. Note that this is case-sensitive. To override the user agent, include a value for the `User-Agent` key here.
        `responseOkCallback` can be used to control whether a response is considered acceptable or not. By default, all HTTP responses are considered fine. If specified, this callable must return a boolean marking whether the response is successful and an error message string. The string is used for logging purposes when the success flag is `False`; it should be `None` if the first return value is `True`.
        '''

        mergedHeaders = {'User-Agent': self._userAgent}
        if headers:
            mergedHeaders.update(headers)
        headers = mergedHeaders
        for attempt in range(self._retries + 1):
            # The request is newly prepared on each retry because of potential cookie updates.
            req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
            _logger.info(f'Retrieving {req.url}')
            _logger.debug(f'... with headers: {headers!r}')
            if data:
                _logger.debug(f'... with data: {data!r}')
            try:
                r = self._session.send(req, timeout = timeout)
            except requests.exceptions.RequestException as exc:
                if attempt < self._retries:
                    retrying = ', retrying'
                    level = logging.WARNING
                else:
                    retrying = ''
                    level = logging.ERROR
                _logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
            else:
                if responseOkCallback is not None:
                    success, msg = responseOkCallback(r)
                else:
                    success, msg = (True, None)
                msg = f': {msg}' if msg else ''
                if success:
                    _logger.debug(f'{req.url} retrieved successfully{msg}')
                    return r
                else:
                    if attempt < self._retries:
                        retrying = ', retrying'
                        level = logging.WARNING
                    else:
                        retrying = ''
                        level = logging.ERROR
                    _logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
            if attempt < self._retries:
                sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after the first attempt, 2 after the second, 4 after the third, etc.
                _logger.info(f'Waiting {sleepTime:.0f} seconds')
                time.sleep(sleepTime)
            else:
                msg = f'{self._retries + 1} requests to {req.url} failed, giving up.'
                _logger.fatal(msg)
                raise HttpError(msg)
        raise RuntimeError('Reached unreachable code')

    def get(self, *args, **kwargs):
        '''Make a GET request. This is equivalent to calling `.request('GET', ...)`.'''
        return self.request('GET', *args, **kwargs)

    def post(self, *args, **kwargs):
        '''Make a POST request. This is equivalent to calling `.request('POST', ...)`.'''
        return self.request('POST', *args, **kwargs)
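
# Usage sketch (not part of the upstream file): a GET with retries and a custom
# acceptance callback. The URL and user agent are illustrative only.
#
# >>> def ok(r):
# ...     return (True, None) if r.status_code == 200 else (False, f'status {r.status_code}')
# >>> client = HttpClient(retries = 2, userAgent = 'my-archiver/0.1 (contact@example.org)')
# >>> r = client.get('https://example.org/', responseOkCallback = ok)
#
# After retries + 1 unsuccessful attempts, this raises HttpError instead of returning.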


class ModuleMeta(abc.ABCMeta):
    '''Metaclass of modules. This is used to keep track of which modules exist and to select them. It also enforces module name restrictions and prevents name collisions.'''

    __modulesByName: dict[str, typing.Type['Module']] = {}

    def __new__(cls, *args, **kwargs):
        class_ = super().__new__(cls, *args, **kwargs)
        if class_.name is not None:
            if class_.name.strip('abcdefghijklmnopqrstuvwxyz-') != '':
                raise RuntimeError(f'Invalid class name: {class_.name!r}')
            if class_.name in cls.__modulesByName:
                raise RuntimeError(f'Class name collision: {class_.name!r} is already known')
            cls.__modulesByName[class_.name] = weakref.ref(class_)
            _logger.info(f'Found {class_.name!r} module {class_.__module__}.{class_.__name__}')
        else:
            _logger.info(f'Found nameless module {class_.__module__}.{class_.__name__}')
        return class_

    @classmethod
    def get_module_by_name(cls, name: str) -> typing.Optional[typing.Type['Module']]:
        '''Get a module by name if one exists'''
        if classRef := cls.__modulesByName.get(name):
            class_ = classRef()
            if class_ is None:
                _logger.info(f'Module {name!r} is gone, dropping')
                del cls.__modulesByName[name]
            return class_

    @classmethod
    def iter_modules(cls) -> typing.Iterator[typing.Type['Module']]:
        '''Iterate over all known modules'''
        # Housekeeping first: remove dead modules
        for name in list(cls.__modulesByName): # create a copy of the names list so the dict can be modified in the loop
            if cls.__modulesByName[name]() is None:
                _logger.info(f'Module {name!r} is gone, dropping')
                del cls.__modulesByName[name]
        for name, classRef in cls.__modulesByName.items():
            class_ = classRef()
            if class_ is None:
                # Module class no longer exists, skip
                # Even though dead modules are removed above, it's possible that the code consuming this iterator drops/deletes modules.
                continue
            yield class_

    @classmethod
    def drop(cls, module: 'Module'):
        '''
        Remove a module from the list of known modules

        If a Module subclass is destroyed after `del MyModule`, it is also eventually removed from the list. However, as that relies on garbage collection, it should not be depended on, and modules should be dropped with this method explicitly.
        '''

        if module.name is not None and module.name in cls.__modulesByName:
            del cls.__modulesByName[module.name]
            _logger.info(f'Module {module.name!r} dropped')

    def __del__(self, *args, **kwargs):
        if self.name is not None and self.name in type(self).__modulesByName:
            _logger.info(f'Module {self.name!r} is being destroyed, dropping')
            del type(self).__modulesByName[self.name]
        # type has no __del__ method, so there is no need to call it.


class Module(metaclass = ModuleMeta):
    '''An abstract base class for a module.'''

    name: typing.Optional[str] = None
    '''The name of the module. Modules without a name are ignored. Names must be unique and may only contain a-z and hyphens.'''

    IndexClass: typing.Optional[typing.Type[Index]] = None
    '''The Index class corresponding to this module, if any.'''

    @staticmethod
    def matches(inputUrl: InputURL) -> bool:
        '''Whether or not this module is for handling `inputUrl`.'''
        return False

    def __init__(self, inputUrl: InputURL, storage: typing.Optional[codearchiver.storage.Storage] = None, id_: typing.Optional[str] = None):
        self._inputUrl = inputUrl
        self._url = inputUrl.url
        self._storage = storage
        self._id = id_
        if self._id is None and type(self).name is not None:
            self._id = f'{type(self).name}_{self._url.replace("/", "_")}_{datetime.datetime.utcnow():%Y%m%dT%H%M%SZ}'
        self._httpClient = HttpClient()

    @abc.abstractmethod
    def process(self) -> Result:
        '''Perform the relevant retrieval(s)'''

    def create_index(self, filename: str) -> Index:
        '''Create a basic Index instance appropriate for this module'''
        if type(self).IndexClass is None or type(self).name is None:
            raise RuntimeError('Module lacks an IndexClass or a name; cannot create index')
        idx = type(self).IndexClass()
        idx.append('codearchiver version', codearchiver.version.__version__)
        idx.append('Module', type(self).name)
        idx.append('ID', self._id)
        idx.append('Input URL', self._url)
        idx.append('Filename', filename)
        return idx

    def __repr__(self):
        return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r})'
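
# Sketch of a minimal concrete module (hypothetical, not part of the upstream file).
# Defining the subclass is enough to register it with ModuleMeta under its name.
#
# >>> class ExampleModule(Module):
# ...     name = 'example'
# ...     IndexClass = Index
# ...     @staticmethod
# ...     def matches(inputUrl: InputURL) -> bool:
# ...         return inputUrl.url.startswith('https://example.org/')
# ...     def process(self) -> Result:
# ...         filename = f'{self._id}.txt'
# ...         with open(filename, 'w') as fp:
# ...             fp.write(self._httpClient.get(self._url).text)
# ...         return Result(id = self._id, files = [(filename, self.create_index(filename))])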


def get_module_class(inputUrl: InputURL) -> typing.Type[Module]:
    '''Get the Module class most suitable for handling `inputUrl`.'''

    # Ensure that modules are imported
    # This can't be done at the top because the modules need to refer back to the Module class.
    import codearchiver.modules

    # Check if the URL references one of the modules directly
    if inputUrl.moduleScheme:
        if module := ModuleMeta.get_module_by_name(inputUrl.moduleScheme):
            _logger.info(f'Selecting module {module.__module__}.{module.__name__}')
            return module
        else:
            raise RuntimeError(f'No module with name {inputUrl.moduleScheme!r} exists')

    # Check if exactly one of the modules matches
    matches = [class_ for class_ in ModuleMeta.iter_modules() if class_.matches(inputUrl)]
    if len(matches) >= 2:
        _logger.error('Multiple matching modules for input URL')
        _logger.debug(f'Matching modules: {matches!r}')
        raise RuntimeError('Multiple matching modules for input URL')
    if matches:
        _logger.info(f'Selecting module {matches[0].__module__}.{matches[0].__name__}')
        return matches[0]
    raise RuntimeError('No matching modules for input URL')


def get_module_instance(inputUrl: InputURL, **kwargs) -> Module:
    '''Get an instance of the Module class most suitable for handling `inputUrl`.'''
    return get_module_class(inputUrl)(inputUrl, **kwargs)
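
# End-to-end sketch (hypothetical, not part of the upstream file): selecting and running
# a module for an input URL, using the ExampleModule sketched above. Keyword arguments
# are forwarded to the module constructor.
#
# >>> iu = InputURL('example+https://example.org/user/repo')
# >>> module = get_module_instance(iu, storage = None)
# >>> result = module.process()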