The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

539 lines
19 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff.
  3. # Operating modes:
  4. # warc-tiny colour FILES -- coloured output of the WARCs for easier reading
  5. # warc-tiny dump-responses [-m|--meta] FILES -- dump the HTTP response bodies to stdout
  6. # With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:<https://example.org/>: foobar'
  7. # The record offset may be -1 if it is not known.
  8. # The filename is wrapped in angled brackets if it contains a colon; the target URI is always wrapped in angled brackets (since it virtually always contains a colon).
  9. # warc-tiny scrape [-u|--urls] FILES -- extract all links and page requisites from the records; produces lines of filename, record offset, record URI, link type, inline flag, and URL as JSONL
  10. # With --urls, only the URL is printed.
  11. # wpull's scrapers are used for the extraction.
  12. # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
  13. import base64
  14. import enum
  15. import gzip
  16. import hashlib
  17. import json
  18. import sys
  19. import tempfile
  20. import zlib
  21. try:
  22. import wpull.body
  23. import wpull.document.htmlparse.lxml_
  24. try:
  25. import wpull.protocol.http.request as wpull_protocol_http_request # wpull 2.x
  26. except ImportError:
  27. import wpull.http.request as wpull_protocol_http_request # wpull 1.x
  28. import wpull.scraper.base
  29. import wpull.scraper.css
  30. import wpull.scraper.html
  31. import wpull.scraper.javascript
  32. import wpull.scraper.sitemap
  33. except ImportError:
  34. wpull = None
  35. def GzipDecompressor():
  36. return zlib.decompressobj(16 + zlib.MAX_WBITS)
  37. class DummyDecompressor:
  38. def decompress(self, data):
  39. return data
  40. class Event:
  41. pass
  42. class NewFile(Event):
  43. def __init__(self, filename):
  44. self._filename = filename
  45. @property
  46. def filename(self):
  47. return self._filename
  48. class BeginOfRecord(Event):
  49. def __init__(self, warcHeaders, rawData):
  50. self._warcHeaders = warcHeaders
  51. self._rawData = rawData
  52. @property
  53. def warcHeaders(self):
  54. return self._warcHeaders
  55. @property
  56. def rawData(self):
  57. return self._rawData
  58. class HTTPHeaders(Event):
  59. def __init__(self, headers):
  60. self._headers = headers
  61. @property
  62. def headers(self):
  63. return self._headers
  64. class _DataChunk(Event):
  65. def __init__(self, data):
  66. self._data = data
  67. @property
  68. def data(self):
  69. return self._data
  70. def __repr__(self):
  71. return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')
  72. class WARCBlockChunk(_DataChunk):
  73. def __init__(self, data, isHttpHeader = None):
  74. super().__init__(data)
  75. self._isHttpHeader = isHttpHeader
  76. @property
  77. def isHttpHeader(self):
  78. # True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
  79. return self._isHttpHeader
  80. class RawHTTPBodyChunk(_DataChunk):
  81. '''
  82. Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
  83. This is like HTTPBodyChunk but without transfer encoding stripping.
  84. '''
  85. class HTTPBodyChunk(_DataChunk):
  86. '''
  87. Representing a part of the HTTP body with transfer encoding stripped.
  88. '''
  89. class EndOfRecord(Event):
  90. pass
  91. def iter_warc(f):
  92. # Yields Events
  93. # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
  94. with gzip.open(f, 'rb') as fp:
  95. buf = b''
  96. while True:
  97. # Read WARC header
  98. while b'\r\n\r\n' not in buf:
  99. try:
  100. buf = buf + fp.read(4096)
  101. except EOFError:
  102. break
  103. if not buf:
  104. break
  105. if not buf:
  106. break
  107. warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
  108. assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n')
  109. assert b'\r\nContent-Length:' in warcHeaderBuf
  110. warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
  111. warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
  112. warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
  113. warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
  114. yield BeginOfRecord(warcHeaders, warcHeaderBuf)
  115. recordID = next(x[1] for x in warcHeaders if x[0] == b'WARC-Record-ID')
  116. # Read WARC block (and skip CRLFCRLF at the end of the record)
  117. if len(buf) < warcContentLength + 4:
  118. try:
  119. buf = buf + fp.read(warcContentLength + 4 - len(buf))
  120. except EOFError:
  121. pass
  122. if len(buf) < warcContentLength + 4:
  123. print('Error: truncated WARC', file = sys.stderr)
  124. break
  125. warcContent = buf[:warcContentLength]
  126. buf = buf[warcContentLength + 4:]
  127. # Decode HTTP body if appropriate
  128. if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
  129. httpType = 'request'
  130. elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
  131. httpType = 'response'
  132. else:
  133. httpType = None
  134. if httpType is not None:
  135. if b'\r\n\r\n' in warcContent:
  136. httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)
  137. # Parse headers and extract transfer encoding
  138. httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
  139. chunked = False
  140. gzipped = False
  141. if b'\r\ntransfer-encoding' in httpHeaders.lower():
  142. transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
  143. transferEncodings = set(map(bytes.strip, transferEncoding.split(b',')))
  144. chunked = b'chunked' in transferEncodings
  145. gzipped = b'gzip' in transferEncodings
  146. yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
  147. yield HTTPHeaders(httpHeaderLines)
  148. yield WARCBlockChunk(httpBody, isHttpHeader = False)
  149. yield RawHTTPBodyChunk(httpBody)
  150. # Decode body
  151. if gzipped:
  152. httpDecompressor = GzipDecompressor()
  153. else:
  154. httpDecompressor = DummyDecompressor()
  155. if chunked:
  156. pos = 0
  157. while True:
  158. try:
  159. chunkLineEnd = httpBody.index(b'\r\n', pos)
  160. except ValueError:
  161. print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
  162. break
  163. chunkLine = httpBody[pos:chunkLineEnd]
  164. if b';' in chunkLine:
  165. chunkLength = chunkLine[:chunkLine.index(b';')].strip()
  166. else:
  167. chunkLength = chunkLine.strip()
  168. if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
  169. print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
  170. break
  171. chunkLength = int(chunkLength, base = 16)
  172. if chunkLength == 0:
  173. break
  174. chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
  175. yield HTTPBodyChunk(chunk)
  176. pos = chunkLineEnd + 2 + chunkLength + 2
  177. else:
  178. yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
  179. else:
  180. print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr)
  181. yield WARCBlockChunk(warcContent)
  182. else:
  183. yield WARCBlockChunk(warcContent)
  184. yield EndOfRecord()
  185. class ProcessMode:
  186. @classmethod
  187. def split_args(cls, args):
  188. '''Split args into arguments to be passed into __init__ and filenames'''
  189. return (), args
  190. def process_event(self, event):
  191. raise NotImplementedError
  192. class Digest:
  193. def __init__(self, digest):
  194. self._digest = digest
  195. def format(self, digest = None):
  196. raise NotImplementedError
  197. def equals(self, digest):
  198. return self._digest == digest
  199. class Base32Digest(Digest):
  200. def format(self, digest = None):
  201. return base64.b32encode(digest if digest else self._digest)
  202. class HexDigest(Digest):
  203. def format(self, digest = None):
  204. return (digest if digest else self._digest).hex()
  205. class VerifyMode(ProcessMode):
  206. def __init__(self):
  207. self._blockDigester = None
  208. self._recordedBlockDigest = None
  209. self._payloadDigester = None
  210. self._brokenPayloadDigester = None
  211. self._recordedPayloadDigest = None
  212. self._printedBrokenPayloadWarning = False
  213. def parse_digest(self, digest):
  214. if not digest.startswith(b'sha1:'):
  215. print('Warning: don\'t understand hash format: {!r}'.format(digest), file = sys.stderr)
  216. return None
  217. if len(digest) == 37 and digest.rstrip(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') == b'sha1:': # 5 for 'sha1:' + 32 for base-32 hash
  218. return Base32Digest(base64.b32decode(digest[5:]))
  219. if len(digest) == 45 and digest.rstrip(b'0123456789abcdef') == b'sha1:':
  220. return HexDigest(bytes.fromhex(digest[5:].decode('ascii')))
  221. return None
  222. def process_event(self, event):
  223. if type(event) is NewFile:
  224. self._printedBrokenPayloadWarning = False
  225. elif type(event) is BeginOfRecord:
  226. if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
  227. self._blockDigester = hashlib.sha1()
  228. self._recordedBlockDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest'))
  229. else:
  230. self._blockDigester = None
  231. self._recordedBlockDigest = None
  232. if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
  233. self._payloadDigester = hashlib.sha1()
  234. self._brokenPayloadDigester = hashlib.sha1()
  235. self._recordedPayloadDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest'))
  236. else:
  237. self._payloadDigester = None
  238. self._brokenPayloadDigester = None
  239. self._recordedPayloadDigest = None
  240. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
  241. self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  242. elif type(event) is WARCBlockChunk:
  243. if self._blockDigester:
  244. self._blockDigester.update(event.data)
  245. elif type(event) is HTTPBodyChunk:
  246. if self._payloadDigester:
  247. self._payloadDigester.update(event.data)
  248. elif type(event) is RawHTTPBodyChunk:
  249. if self._brokenPayloadDigester:
  250. self._brokenPayloadDigester.update(event.data)
  251. elif type(event) is EndOfRecord:
  252. if self._blockDigester and self._recordedBlockDigest:
  253. if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
  254. print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr)
  255. if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
  256. if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
  257. if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
  258. if not self._printedBrokenPayloadWarning:
  259. print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding', file = sys.stderr)
  260. self._printedBrokenPayloadWarning = True
  261. else:
  262. print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr)
  263. class DumpResponsesMode(ProcessMode):
  264. @classmethod
  265. def split_args(cls, args):
  266. if args[0] == '-m' or args[0] == '--meta':
  267. return (True,), args[1:]
  268. return (False,), args
  269. def __init__(self, withMeta):
  270. self._printEOR = False
  271. self._isResponse = False
  272. self._withMeta = withMeta
  273. if withMeta:
  274. self._recordID = None
  275. self._targetURI = None
  276. self._buffer = b''
  277. def _write(self, data):
  278. if not self._withMeta:
  279. sys.stdout.buffer.write(data)
  280. return
  281. buf = self._buffer + data
  282. lines = buf.split(b'\n')
  283. self._buffer = lines.pop() # Since there's an explicit `_write(b'\r\n')` at the end of the record, this implicitly resets the buffer as well
  284. for line in lines:
  285. sys.stdout.buffer.write(':'.join((self._filename, '-1', self._recordID, '<' + self._targetURI + '>', '')).encode('utf-8'))
  286. sys.stdout.buffer.write(line)
  287. sys.stdout.buffer.write(b'\n')
  288. def process_event(self, event):
  289. if type(event) is NewFile:
  290. self._filename = event.filename
  291. if ':' in self._filename:
  292. self._filename = '<' + self._filename + '>'
  293. elif type(event) is BeginOfRecord:
  294. warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
  295. warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  296. self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
  297. self._printEOR = False
  298. if self._withMeta:
  299. # Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
  300. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
  301. self._targetURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
  302. self._buffer = b''
  303. elif type(event) is HTTPBodyChunk:
  304. if self._isResponse:
  305. self._printEOR = True
  306. self._write(event.data)
  307. elif type(event) is EndOfRecord:
  308. if self._printEOR:
  309. self._write(b'\r\n')
  310. class COLOURS:
  311. RESET = b'\x1b[0m'
  312. GREEN = b'\x1b[0;32m'
  313. LIGHTGREEN = b'\x1b[1;32m'
  314. PURPLE = b'\x1b[0;35m'
  315. LIGHTPURPLE = b'\x1b[1;35m'
  316. RED = b'\x1b[0;31m'
  317. INVERTED = b'\x1b[7m'
  318. class ColourMode(ProcessMode):
  319. def __init__(self):
  320. self._hadHttpStatusLine = False
  321. def _replace_esc(self, data):
  322. return data.replace(b'\x1b', COLOURS.INVERTED + b'ESC' + COLOURS.RESET)
  323. def _print_line(self, line, colour, withLF = True, colourOnlyBeforeColon = False):
  324. if colourOnlyBeforeColon:
  325. if b':' in line:
  326. offset = line.index(b':')
  327. else:
  328. offset = 0
  329. else:
  330. offset = len(line)
  331. if offset > 0:
  332. sys.stdout.buffer.write(colour)
  333. sys.stdout.buffer.write(self._replace_esc(line[:offset]))
  334. sys.stdout.buffer.write(COLOURS.RESET)
  335. sys.stdout.buffer.write(line[offset:])
  336. if withLF:
  337. sys.stdout.buffer.write(b'\n')
  338. def _print_data(self, data, colour, colourOnlyBeforeColon):
  339. later = False
  340. for line in data.split(b'\r\n'):
  341. if later:
  342. sys.stdout.buffer.write(b'\n')
  343. self._print_line(line, colour, withLF = False, colourOnlyBeforeColon = colourOnlyBeforeColon)
  344. later = True
  345. def process_event(self, event):
  346. if type(event) is BeginOfRecord:
  347. firstNewline = event.rawData.index(b'\r\n')
  348. self._print_line(event.rawData[:firstNewline], COLOURS.LIGHTGREEN)
  349. self._print_data(event.rawData[firstNewline + 2:], COLOURS.GREEN, True)
  350. sys.stdout.buffer.write(b'\n\n') # separator between header and block
  351. self._hadHttpStatusLine = False
  352. elif type(event) is WARCBlockChunk:
  353. if event.isHttpHeader is True:
  354. if not self._hadHttpStatusLine:
  355. firstNewline = event.data.index(b'\r\n')
  356. self._print_line(event.data[:firstNewline], COLOURS.LIGHTPURPLE)
  357. offset = firstNewline + 2
  358. self._hadHttpStatusLine = True
  359. else:
  360. offset = 0
  361. self._print_data(event.data[offset:], COLOURS.PURPLE, True)
  362. elif event.isHttpHeader is False:
  363. self._print_data(event.data, COLOURS.RED, False)
  364. elif event.isHttpHeader is None:
  365. sys.stdout.buffer.write(self._replace_esc(event.data))
  366. elif type(event) is EndOfRecord:
  367. sys.stdout.buffer.write(b'\n\n')
  368. class ScrapeMode(ProcessMode):
  369. @classmethod
  370. def split_args(cls, args):
  371. if args[0] == '-u' or args[0] == '--urls':
  372. return (True,), args[1:]
  373. return (False,), args
  374. def __init__(self, urlsOnly):
  375. self._urlsOnly = urlsOnly
  376. assert wpull is not None, 'Scrape mode requires wpull and lxml'
  377. htmlParser = wpull.document.htmlparse.lxml_.HTMLParser()
  378. elementWalker = wpull.scraper.html.ElementWalker()
  379. scrapers = []
  380. scrapers.append(wpull.scraper.html.HTMLScraper(htmlParser, elementWalker))
  381. scrapers.append(wpull.scraper.css.CSSScraper())
  382. elementWalker.css_scraper = scrapers[-1]
  383. scrapers.append(wpull.scraper.javascript.JavaScriptScraper())
  384. elementWalker.javascript_scraper = scrapers[-1]
  385. scrapers.append(wpull.scraper.sitemap.SitemapScraper(htmlParser))
  386. self._scraper = wpull.scraper.base.DemuxDocumentScraper(scrapers)
  387. self._isResponse = None
  388. self._body = None
  389. self._recordURI = None
  390. self._statusCode = None
  391. self._statusReason = None
  392. if not self._urlsOnly:
  393. self._filename = None
  394. self._recordID = None
  395. def process_event(self, event):
  396. if type(event) is NewFile and not self._urlsOnly:
  397. self._filename = event.filename
  398. elif type(event) is BeginOfRecord:
  399. warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
  400. warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  401. self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
  402. if self._isResponse:
  403. self._body = wpull.body.Body(file = tempfile.SpooledTemporaryFile(max_size = 10485760)) # Up to 10 MiB in memory
  404. self._printEOR = False
  405. if not self._urlsOnly:
  406. # Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
  407. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
  408. self._recordURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
  409. elif type(event) is HTTPHeaders and self._isResponse:
  410. assert len(event.headers[0]) == 1 and event.headers[0][0].startswith(b'HTTP/'), 'malformed HTTP response'
  411. _, statusCode, reason = event.headers[0][0].decode('ascii').split(' ', 2)
  412. self._statusCode = int(statusCode)
  413. self._statusReason = reason
  414. elif type(event) is HTTPBodyChunk and self._isResponse:
  415. self._body.write(event.data)
  416. elif type(event) is EndOfRecord and self._isResponse:
  417. request = wpull_protocol_http_request.Request(self._recordURI)
  418. response = wpull_protocol_http_request.Response(self._statusCode, self._statusReason)
  419. response.body = self._body
  420. response.body.seek(0)
  421. for scraper, scrapeResult in self._scraper.scrape_info(request, response).items():
  422. if not scrapeResult:
  423. continue
  424. for linkContext in scrapeResult.link_contexts:
  425. if self._urlsOnly:
  426. print(linkContext.link)
  427. continue
  428. o = {
  429. 'filename': self._filename,
  430. 'recordOffset': None,
  431. 'recordID': self._recordID,
  432. 'recordURI': self._recordURI,
  433. 'linkType': linkContext.link_type.value if isinstance(linkContext.link_type, enum.Enum) else linkContext.link_type,
  434. 'inline': bool(linkContext.inline), # Needs manual casting; https://github.com/ArchiveTeam/wpull/issues/458
  435. 'linked': bool(linkContext.linked),
  436. 'url': linkContext.link,
  437. }
  438. print(json.dumps(o))
  439. def main():
  440. processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode, 'scrape': ScrapeMode}
  441. assert len(sys.argv) - 1 >= 2
  442. mode = sys.argv[1]
  443. assert mode in processorMap
  444. processorArgs, files = processorMap[mode].split_args(sys.argv[2:])
  445. assert files
  446. processor = processorMap[mode](*processorArgs)
  447. try:
  448. for f in files:
  449. print('Info: processing {}'.format(f), file = sys.stderr)
  450. processor.process_event(NewFile(f))
  451. for event in iter_warc(f):
  452. processor.process_event(event)
  453. except BrokenPipeError:
  454. return
  455. if __name__ == '__main__':
  456. main()