Documentation of the core

3 年之前 · 2a2c9373d0
--- a/codearchiver/core.py
+++ b/codearchiver/core.py
@@ -15,7 +15,14 @@ logger = logging.getLogger(__name__)


 class InputURL:
 	def __init__(self, url):
 	'''
 	An input URL

 	This primarily exists so multiple modules can access the content behind the URL for checks in `Module.matches` without fetching multiple times.
 	It also handles the module name prefix in the scheme part of the URL. Note that `InputURL.url` is then the part without the module name.
 	'''

 	def __init__(self, url: str):
 		if 0 < url.find('+') < url.find('://'):
 			# '+' and '://' appear in the URL in this order and there is at least one character each before the + as well as between the two
 			self._moduleScheme, self._url = url.split('+', 1)
@@ -25,15 +32,18 @@ class InputURL:
 		self._response = None

 	@property
 	def url(self) -> str:
 		'''
 		URL without the module scheme prefix (if any)

 		Read-only view of the value stored in `self._url`.
 		'''

 		return self._url

 	@property
 	def moduleScheme(self) -> typing.Optional[str]:
 		'''
 		Module scheme prefix (if one is included, else `None`)

 		Read-only view of the value stored in `self._moduleScheme`.
 		'''

 		return self._moduleScheme

 	@property
 	def content(self) -> str:
 		'''
 		HTTP response body upon fetching the URL with GET

 		The request is made lazily on first access using a default-constructed `HttpClient`. The response is kept in `self._response`, so later accesses return the text of that same response without fetching again.
 		'''

 		if self._response is None:
 			# First access: fetch once and remember the response object.
 			self._response = HttpClient().get(self.url)
 		return self._response.text
@@ -57,19 +67,41 @@ class Result:


 class HttpError(Exception):
 	'''An HTTP request failed too many times.'''


 class HttpClient:
 	'''A thin wrapper HTTP client around Requests with exponential backoff retries and a default user agent for all requests.'''

 	defaultRetries: int = 3
 	'''Default number of retries on errors unless overridden on creating the HttpClient object'''

 	defaultUserAgent: str = f'codearchiver/{codearchiver.version.__version__}'
 	'''Default user agent unless overridden on instantiation or by overriding via the headers kwarg'''

 	def __init__(self, retries: typing.Optional[int] = None, userAgent: typing.Optional[str] = None):
 		'''
 		Create a client backed by its own `requests.Session`

 		`retries` and `userAgent` fall back to the class attributes `defaultRetries` and `defaultUserAgent`, respectively, when they are `None` or otherwise falsy.
 		'''

 		# NOTE(review): because of the truthiness test, `retries = 0` and `userAgent = ''` also select the defaults; confirm whether that is intended.
 		self._session = requests.Session()
 		self._retries = retries if retries else self.defaultRetries
 		self._userAgent = userAgent if userAgent else self.defaultUserAgent

 	def request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
 	def request(self,
 	            method,
 	            url,
 	            params = None,
 	            data = None,
 	            headers: typing.Optional[typing.Dict[str, str]] = None,
 	            timeout: int = 10,
 	            responseOkCallback: typing.Optional[typing.Callable[[requests.Response], typing.Tuple[bool, typing.Optional[str]]]] = None,
 	           ) -> requests.Response:
 		'''
 		Make an HTTP request

 		For the details on `method`, `url`, `params`, and `data`, refer to the Requests documentation on the constructor of `requests.Request`.
 		For details on `timeout`, see `requests.adapters.HTTPAdapter.send`.
 		`headers` can be used to specify any HTTP headers. Note that this is case-sensitive. To override the user agent, include a value for the `User-Agent` key here.
 		`responseOkCallback` can be used to control whether a response is considered acceptable or not. By default, all HTTP responses are considered fine. If specified, this callable must produce a boolean marking whether the response is successful and an error message string. The string is used for logging purposes when the success flag is `False`; it should be `None` if the first return value is `True`.
 		'''

 		mergedHeaders = {'User-Agent': self._userAgent}
 		if headers:
 			mergedHeaders.update(headers)
@@ -120,14 +152,18 @@ class HttpClient:
 		raise RuntimeError('Reached unreachable code')

 	def get(self, *args, **kwargs):
 		'''
 		Make a GET request

 		Shorthand for `.request('GET', ...)`: all positional and keyword arguments are forwarded unchanged, and whatever `request` returns is returned.
 		'''

 		response = self.request('GET', *args, **kwargs)
 		return response

 	def post(self, *args, **kwargs):
 		'''
 		Make a POST request

 		Shorthand for `.request('POST', ...)`: all positional and keyword arguments are forwarded unchanged, and whatever `request` returns is returned.
 		'''

 		response = self.request('POST', *args, **kwargs)
 		return response


 class ModuleMeta(type):
 	__modulesByName = {}  # name -> Module class
 	'''Metaclass of modules. This is used to keep track of which modules exist and to select them. It also enforces module name restrictions and prevents name collisions.'''

 	__modulesByName: typing.Dict[str, typing.Type['Module']] = {}

 	def __new__(cls, *args, **kwargs):
 		class_ = super().__new__(cls, *args, **kwargs)
@@ -143,7 +179,9 @@ class ModuleMeta(type):
 		return class_

 	@classmethod
 	def get_module_by_name(cls, name):
 	def get_module_by_name(cls, name: str) -> typing.Optional[typing.Type['Module']]:
 		'''Get a module by name if one exists'''

 		if classRef := cls.__modulesByName.get(name):
 			class_ = classRef()
 			if class_ is None:
@@ -152,7 +190,9 @@ class ModuleMeta(type):
 			return class_

 	@classmethod
 	def iter_modules(cls):
 	def iter_modules(cls) -> typing.Iterator[typing.Type['Module']]:
 		'''Iterate over all known modules'''

 		# Housekeeping first: remove dead modules
 		for name in list(cls.__modulesByName): # create a copy of the names list so the dict can be modified in the loop
 			if cls.__modulesByName[name]() is None:
@@ -168,7 +208,13 @@ class ModuleMeta(type):
 			yield class_

 	@classmethod
 	def drop(cls, module):
 	def drop(cls, module: 'Module'):
 		'''
 		Remove a module from the list of known modules

 		If a Module subclass is destroyed after `del MyModule`, it is also eventually removed from the list. However, as that relies on garbage collection, it should not be depended on and modules should be dropped with this method explicitly.
 		'''

 		if module.name is not None and module.name in cls.__modulesByName:
 			del cls.__modulesByName[module.name]
 			logger.info(f'Module {module.name!r} dropped')
@@ -191,7 +237,7 @@ class Module(metaclass = ModuleMeta):
 		'''Whether or not this module is for handling `inputUrl`.'''
 		return False

 	def __init__(self, inputUrl, id_ = None):
 	def __init__(self, inputUrl: InputURL, id_: typing.Optional[str] = None):
 		self._inputUrl = inputUrl
 		self._url = inputUrl.url
 		self._id = id_