diff --git a/codearchiver/core.py b/codearchiver/core.py index 5b18684..ba85ce0 100644 --- a/codearchiver/core.py +++ b/codearchiver/core.py @@ -15,7 +15,14 @@ logger = logging.getLogger(__name__) class InputURL: - def __init__(self, url): + ''' + An input URL + + This primarily exists so multiple modules can access the content behind the URL for checks in `Module.matches` without fetching multiple times. + It also handles the module name prefix in the scheme part of the URL. Note that `InputURL.url` is then the part without the module name. + ''' + + def __init__(self, url: str): if 0 < url.find('+') < url.find('://'): # '+' and '://' appear in the URL in this order and there is at least one character each before the + as well as between the two self._moduleScheme, self._url = url.split('+', 1) @@ -25,15 +32,18 @@ class InputURL: self._response = None @property - def url(self): + def url(self) -> str: + '''URL without the module scheme prefix (if any)''' return self._url @property - def moduleScheme(self): + def moduleScheme(self) -> typing.Optional[str]: + '''Module scheme prefix (if one is included, else `None`)''' return self._moduleScheme @property - def content(self): + def content(self) -> str: + '''HTTP response body upon fetching the URL with GET''' if self._response is None: self._response = HttpClient().get(self.url) return self._response.text @@ -57,19 +67,41 @@ class Result: class HttpError(Exception): - pass + '''An HTTP request failed too many times.''' class HttpClient: + '''A thin wrapper HTTP client around Requests with exponential backoff retries and a default user agent for all requests.''' + defaultRetries: int = 3 + '''Default number of retries on errors unless overridden on creating the HttpClient object''' + defaultUserAgent: str = f'codearchiver/{codearchiver.version.__version__}' + '''Default user agent unless overridden on instantiation or by overriding via the headers kwarg''' - def __init__(self, retries = None, userAgent = None): + def 
__init__(self, retries: typing.Optional[int] = None, userAgent: typing.Optional[str] = None): self._session = requests.Session() self._retries = retries if retries else self.defaultRetries self._userAgent = userAgent if userAgent else self.defaultUserAgent - def request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None): + def request(self, + method, + url, + params = None, + data = None, + headers: typing.Optional[typing.Dict[str, str]] = None, + timeout: int = 10, + responseOkCallback: typing.Optional[typing.Callable[[requests.Response], typing.Tuple[bool, typing.Optional[str]]]] = None, + ) -> requests.Response: + ''' + Make an HTTP request + + For the details on `method`, `url`, `params`, and `data`, refer to the Requests documentation on the constructor of `requests.Request`. + For details on `timeout`, see `requests.adapters.HTTPAdapter.send`. + `headers` can be used to specify any HTTP headers. Note that this is case-sensitive. To override the user agent, include a value for the `User-Agent` key here. + `responseOkCallback` can be used to control whether a response is considered acceptable or not. By default, all HTTP responses are considered fine. If specified, this callable must produce a boolean marking whether the response is successful and an error message string. The string is used for logging purposes when the success flag is `False`; it should be `None` if the first return value is `True`. + ''' + mergedHeaders = {'User-Agent': self._userAgent} if headers: mergedHeaders.update(headers) @@ -120,14 +152,18 @@ class HttpClient: raise RuntimeError('Reached unreachable code') def get(self, *args, **kwargs): + '''Make a GET request. This is equivalent to calling `.request('GET', ...)`.''' return self.request('GET', *args, **kwargs) def post(self, *args, **kwargs): + '''Make a POST request. 
This is equivalent to calling `.request('POST', ...)`.''' return self.request('POST', *args, **kwargs) class ModuleMeta(type): - __modulesByName = {} # name -> Module class + '''Metaclass of modules. This is used to keep track of which modules exist and to select them. It also enforces module name restrictions and prevents name collisions.''' + + __modulesByName: typing.Dict[str, typing.Type['Module']] = {} def __new__(cls, *args, **kwargs): class_ = super().__new__(cls, *args, **kwargs) @@ -143,7 +179,9 @@ class ModuleMeta(type): return class_ @classmethod - def get_module_by_name(cls, name): + def get_module_by_name(cls, name: str) -> typing.Optional[typing.Type['Module']]: + '''Get a module by name if one exists''' + if classRef := cls.__modulesByName.get(name): class_ = classRef() if class_ is None: @@ -152,7 +190,9 @@ class ModuleMeta(type): return class_ @classmethod - def iter_modules(cls): + def iter_modules(cls) -> typing.Iterator[typing.Type['Module']]: + '''Iterate over all known modules''' + # Housekeeping first: remove dead modules for name in list(cls.__modulesByName): # create a copy of the names list so the dict can be modified in the loop if cls.__modulesByName[name]() is None: @@ -168,7 +208,13 @@ class ModuleMeta(type): yield class_ @classmethod - def drop(cls, module): + def drop(cls, module: 'Module'): + ''' + Remove a module from the list of known modules + + If a Module subclass is destroyed after `del MyModule`, it is also eventually removed from the list. However, as that relies on garbage collection, it should not be depended on and modules should be dropped with this method explicitly. 
+ ''' + if module.name is not None and module.name in cls.__modulesByName: del cls.__modulesByName[module.name] logger.info(f'Module {module.name!r} dropped') @@ -191,7 +237,7 @@ class Module(metaclass = ModuleMeta): '''Whether or not this module is for handling `inputUrl`.''' return False - def __init__(self, inputUrl, id_ = None): + def __init__(self, inputUrl: InputURL, id_: typing.Optional[str] = None): self._inputUrl = inputUrl self._url = inputUrl.url self._id = id_