A framework for quick web archiving

from qwarc.const import *
import aiohttp
import asyncio
import functools
import io
import logging
import os
import pkg_resources
import platform
import time
import typing
import zlib


PAGESIZE = os.sysconf('SC_PAGE_SIZE')

def get_rss():
	'''Get the current RSS of this process in bytes'''

	with open('/proc/self/statm', 'r') as fp:
		return int(fp.readline().split()[1]) * PAGESIZE


def get_disk_free():
	'''Get the current free disk space on the relevant partition in bytes'''

	st = os.statvfs('.')
	return st.f_bavail * st.f_frsize

def uses_too_much_memory(limit):
	'''
	Check whether the process is using too much memory

	For performance reasons, this actually only checks the memory usage on every 100th call.
	'''

	uses_too_much_memory.callCounter += 1
	# Only check every hundredth call
	if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
		return True
	return False
uses_too_much_memory.callCounter = 0

def too_little_disk_space(limit):
	'''
	Check whether the free disk space is below the limit

	For performance reasons, this actually only checks the free disk space on every 100th call.
	'''

	too_little_disk_space.callCounter += 1
	if too_little_disk_space.callCounter % 100 == 0:
		too_little_disk_space.currentResult = (get_disk_free() < limit)
	return too_little_disk_space.currentResult
too_little_disk_space.callCounter = 0
too_little_disk_space.currentResult = False

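# Usage sketch (illustrative; the limits are arbitrary example values, not qwarc
# defaults): poll the throttled checks from a processing loop and stop gracefully.
#   while work_remaining:  # hypothetical loop condition
#       if uses_too_much_memory(2 * 1024 ** 3) or too_little_disk_space(10 * 1024 ** 3):
#           break  # shut down before running out of memory or disk space
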
# https://stackoverflow.com/a/4665027
def find_all(aStr, sub):
	'''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''

	start = 0
	while True:
		start = aStr.find(sub, start)
		if start == -1:
			return
		yield start
		start += len(sub)

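# Examples (doctest-style, for illustration):
#   >>> list(find_all('aabca', 'a'))
#   [0, 1, 4]
#   >>> list(find_all('aaaa', 'aa'))  # non-overlapping: the match at position 1 is skipped
#   [0, 2]
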
def str_get_between(aStr, a, b):
	'''Get the string between the first occurrence of a in aStr and the first occurrence of b after it, or None if there is no such string.'''

	aPos = aStr.find(a)
	if aPos == -1:
		return None
	offset = aPos + len(a)
	bPos = aStr.find(b, offset)
	if bPos == -1:
		return None
	return aStr[offset:bPos]

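# Examples (doctest-style, for illustration):
#   >>> str_get_between('<title>Example</title>', '<title>', '</title>')
#   'Example'
#   >>> str_get_between('abc', 'a', 'x') is None  # no occurrence of b after a
#   True
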
def maybe_str_get_between(x, a, b):
	'''Like str_get_between, but returns None if x evaluates to False, and converts x to a str before matching.'''

	if x:
		return str_get_between(str(x), a, b)

def str_get_all_between(aStr, a, b):
	'''Generator yielding every string between occurrences of a in aStr and the following occurrence of b.'''

	#TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
	# Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos + len(b).
	for aOffset in find_all(aStr, a):
		offset = aOffset + len(a)
		bPos = aStr.find(b, offset)
		if bPos != -1:
			yield aStr[offset:bPos]


def maybe_str_get_all_between(x, a, b):
	'''Like str_get_all_between, but yields no elements if x evaluates to False, and converts x to a str before matching.'''

	if x:
		yield from str_get_all_between(str(x), a, b)

def generate_range_items(start, stop, step):
	'''
	Generator for items of `step` size between `start` and `stop` (inclusive)

	Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
	`b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
	Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).

	Examples:
	- generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
	- generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
	- generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
	'''

	for i in range(start, stop + 1, step):
		yield f'{i}-{min(i + step - 1, stop)}'

async def handle_response_default(url, attempt, response, exc):
	'''
	The default response handler, which behaves as follows:
	- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
	- If there was any exception and it is an asyncio.TimeoutError or an aiohttp.ClientError, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
	- If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified, or return otherwise.
	- Otherwise, treat it as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- All responses are written to WARC by default.

	Note that this handler does not limit the number of retries on errors.

	Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
	At least one of response and exc is not None.
	Returns: (one of the ACTION_* constants from qwarc.const, bool signifying whether to write to WARC or not)
	The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC.
	'''

	#TODO: Document that `attempt` is reset on redirects

	if response is None:
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (401, 403, 404, 405, 410):
		return ACTION_IGNORE, True
	if exc is not None:
		if isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
			await asyncio.sleep(5)
		return ACTION_RETRY, False # Don't write to WARC since there might be an incomplete response
	if response.status in (200, 204, 206, 304):
		return ACTION_SUCCESS, True
	if response.status in (301, 302, 303, 307, 308):
		return ACTION_FOLLOW_OR_SUCCESS, True
	await asyncio.sleep(5)
	return ACTION_RETRY, True

async def handle_response_ignore_redirects(url, attempt, response, exc):
	'''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves like handle_response_default otherwise.'''

	action, writeToWarc = await handle_response_default(url, attempt, response, exc)
	if action == ACTION_FOLLOW_OR_SUCCESS:
		action = ACTION_SUCCESS
	return action, writeToWarc

def handle_response_limit_error_retries(maxRetries, handler = handle_response_default):
	'''A response handler that limits the number of retries on errors. It behaves like handler otherwise, which defaults to handle_response_default.

	Technically, this is a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))

	If you use the same limit many times, you should keep the return value (the response handler) of this function and reuse it to avoid creating a new function on every call.
	'''

	async def _handler(url, attempt, response, exc):
		action, writeToWarc = await handler(url, attempt, response, exc)
		if action == ACTION_RETRY and attempt > maxRetries:
			action = ACTION_RETRIES_EXCEEDED
		return action, writeToWarc
	return _handler

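# Usage sketch (hypothetical spec code): compose the handlers to get one that
# treats redirects as successes and gives up after five retries, then reuse it.
#   limitedHandler = handle_response_limit_error_retries(5, handler = handle_response_ignore_redirects)
#   # ... then fetch(url, responseHandler = limitedHandler) on every relevant retrieval
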
def _get_dependency_versions(*pkgs):
	'''Generator yielding (package name, version) tuples for the given packages and all of their transitive dependencies.'''

	pending = set(pkgs)
	have = set(pkgs)
	while pending:
		key = pending.pop()
		try:
			dist = pkg_resources.get_distribution(key)
		except pkg_resources.DistributionNotFound:
			logging.error(f'Unable to get distribution {key}')
			continue
		yield dist.key, dist.version
		for requirement in dist.requires():
			if requirement.key not in have:
				pending.add(requirement.key)
				have.add(requirement.key)

@functools.lru_cache(maxsize = 1)
def get_software_info(specFile, specDependencies):
	'''Return a dict describing the platform, the Python implementation, and the versions of qwarc's and the spec file's dependencies.'''

	# Based on crocoite.utils, authored by PromyLOPh in commit 6ccd72ab on 2018-12-08 under MIT licence
	baseDependencyPackageVersions = list(_get_dependency_versions(__package__))
	baseDependencyPackages = set(x[0] for x in baseDependencyPackageVersions)
	specDependencyPackageVersions = list(_get_dependency_versions(*specDependencies.packages))
	return {
		'platform': platform.platform(),
		'python': {
			'implementation': platform.python_implementation(),
			'version': platform.python_version(),
			'build': platform.python_build(),
		},
		'self': [{"package": package, "version": version} for package, version in baseDependencyPackageVersions],
		'spec': [{"package": package, "version": version} for package, version in specDependencyPackageVersions if package not in baseDependencyPackages],
	}

class LogFormatter(logging.Formatter):
	def __init__(self):
		super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
		self.converter = time.gmtime

	def format(self, record):
		if not hasattr(record, 'itemString'):
			if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
				record.itemString = f'{record.itemType}:{record.itemValue}'
			else:
				record.itemString = 'None'
		return super().format(record)

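# Usage sketch (illustrative): install the formatter on a stderr handler; the
# itemType/itemValue fields are picked up from the `extra` dict if present.
#   handler = logging.StreamHandler()
#   handler.setFormatter(LogFormatter())
#   logging.getLogger().addHandler(handler)
#   logging.getLogger().setLevel(logging.INFO)
#   logging.info('retrieved', extra = {'itemType': 'url', 'itemValue': 'https://example.org/'})
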
class SpecDependencies(typing.NamedTuple):
	packages: tuple = ()
	files: tuple = ()
	extra: typing.Any = None

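# Usage sketch (hypothetical; the exact attribute name a spec file must expose
# is an assumption here, not confirmed by this module):
#   specDependencies = SpecDependencies(packages = ('beautifulsoup4',), files = ('urls.txt',))
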
class ReadonlyFileView:
	'''
	A poor read-only view for a file object. It hides the writing methods and passes everything else through to the underlying file object. Note that this does *not* actually prevent modification at all.
	'''

	def __init__(self, fp):
		self._fp = fp

	def __getattr__(self, key):
		if key in ('write', 'writelines', 'truncate'):
			raise AttributeError
		if key == 'writable':
			return False
		return getattr(self._fp, key)

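# Usage sketch (illustrative; the file name is hypothetical):
#   view = ReadonlyFileView(open('example.warc.gz', 'rb'))
#   view.read(4)      # passes through to the underlying file object
#   view.writable     # False
#   view.write(b'x')  # raises AttributeError
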
def iter_file(f, length = None, blockSize = 1048576):
	'''Read `length` bytes from `f` in chunks of `blockSize` bytes. If `length` is `None`, read until EOF.'''

	read = 0
	while True:
		buf = f.read(blockSize)
		if not buf: # EOF
			if length and read < length:
				raise RuntimeError('Reached EOF before reading enough data')
			break
		if length and read + len(buf) > length:
			initialBufLen = len(buf)
			buf = buf[0 : length - read]
			f.seek(len(buf) - initialBufLen, io.SEEK_CUR) # Seek back over the bytes that were read beyond `length`
		read += len(buf)
		yield buf
		if length and read >= length:
			if read > length: # This should never happen due to the truncation above.
				raise RuntimeError('Overread')
			break

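# Example (doctest-style, for illustration): read exactly five bytes in two-byte blocks.
#   >>> import io
#   >>> list(iter_file(io.BytesIO(b'abcdefgh'), length = 5, blockSize = 2))
#   [b'ab', b'cd', b'e']
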
def read_http_headers(f, copy = None):
	'''Read the HTTP headers of a request or response from the binary file object `f`, optionally writing the consumed data to `copy`. Returns a dict mapping the lowercased header names to their values.'''

	headers = {}

	# Status line or request line
	line = f.readline()
	if copy:
		copy.write(line)

	line = f.readline()
	if copy:
		copy.write(line)
	while line and line not in (b'\r\n', b'\r', b'\n'):
		# Split into header name and value
		name, value = line.split(b':', 1)
		name = name.strip(b' \t')
		#TODO name validation

		# Read next line
		line = f.readline()
		if copy:
			copy.write(line)

		# Handle continuation lines
		continuation = line[0:1] in (b' ', b'\t')
		if continuation:
			value = [value] # Keep the value from the header line itself
			while continuation:
				value.append(line)
				line = f.readline()
				if copy:
					copy.write(line)
				continuation = line[0:1] in (b' ', b'\t')
			value = b''.join(value)

		# Decode and store
		try:
			name = name.decode('utf-8')
		except UnicodeDecodeError:
			name = name.decode('iso-8859-1')
		try:
			value = value.decode('utf-8')
		except UnicodeDecodeError:
			value = value.decode('iso-8859-1')
		headers[name.lower()] = value

		# `line` is already the next line, if any
	return headers

def read_http_body(f, length, headers):
	'''Generator yielding the HTTP body read from the binary file object `f`, dechunking it if the headers indicate chunked transfer encoding; `length` is the expected body length, which is ignored for chunked bodies.'''

	if 'chunked' in map(str.strip, headers.get('transfer-encoding', '').split(',')):
		while True:
			chunkLine = f.readline()
			# The chunk size may be followed by chunk extensions after a semicolon
			if b';' in chunkLine:
				chunkLength = chunkLine.split(b';', 1)[0].strip()
			else:
				chunkLength = chunkLine.strip()
			chunkLength = int(chunkLength, base = 16)
			if chunkLength == 0:
				break
			yield from iter_file(f, length = chunkLength)
			assert f.read(2) == b'\r\n' # Chunk terminator

		# Consume trailer
		line = f.readline()
		while line and line not in (b'\r\n', b'\r', b'\n'):
			line = f.readline()
	else:
		yield from iter_file(f, length = length)

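# Usage sketch (illustrative): parse a canned chunked response from a buffer.
#   >>> import io
#   >>> raw = io.BytesIO(b'HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n')
#   >>> headers = read_http_headers(raw)
#   >>> b''.join(read_http_body(raw, length = None, headers = headers))
#   b'Wikipedia'
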
class GzipWrapper:
	'''A context manager wrapping a file object, compressing everything written to it into a single gzip member.'''

	def __init__(self, f):
		self._file = f
		self._compressor = None

	def __enter__(self):
		self._compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS) # wbits = 16 + MAX_WBITS produces a gzip container
		return self

	def write(self, data):
		buf = self._compressor.compress(data)
		self._file.write(buf)

	def __exit__(self, excType, excVal, excTb):
		buf = self._compressor.flush()
		self._file.write(buf)
		self._file.flush()
		self._compressor = None

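# Usage sketch (illustrative; the output file name is hypothetical):
#   with open('example.warc.gz', 'wb') as f:
#       with GzipWrapper(f) as g:
#           g.write(b'WARC record data')
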