The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

393 lines
13 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff.
  3. # Operating modes:
  4. # warc-tiny colour FILES -- coloured output of the WARCs for easier reading
  5. # warc-tiny dump-responses FILES -- dump the HTTP response bodies to stdout
  6. # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
  7. import base64
  8. import gzip
  9. import hashlib
  10. import sys
  11. import zlib
  12. def GzipDecompressor():
  13. return zlib.decompressobj(16 + zlib.MAX_WBITS)
  14. class DummyDecompressor:
  15. def decompress(self, data):
  16. return data
  17. class Event:
  18. pass
  19. class NewFile(Event):
  20. def __init__(self, filename):
  21. self._filename = filename
  22. @property
  23. def filename(self):
  24. return self._filename
  25. class BeginOfRecord(Event):
  26. def __init__(self, warcHeaders, rawData):
  27. self._warcHeaders = warcHeaders
  28. self._rawData = rawData
  29. @property
  30. def warcHeaders(self):
  31. return self._warcHeaders
  32. @property
  33. def rawData(self):
  34. return self._rawData
  35. class _DataChunk(Event):
  36. def __init__(self, data):
  37. self._data = data
  38. @property
  39. def data(self):
  40. return self._data
  41. def __repr__(self):
  42. return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')
  43. class WARCBlockChunk(_DataChunk):
  44. def __init__(self, data, isHttpHeader = None):
  45. super().__init__(data)
  46. self._isHttpHeader = isHttpHeader
  47. @property
  48. def isHttpHeader(self):
  49. # True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
  50. return self._isHttpHeader
  51. class RawHTTPBodyChunk(_DataChunk):
  52. '''
  53. Because many tools misunderstood the WARC specifications, the Payload-Digest was often implemented without stripping transfer encoding.
  54. This is like HTTPBodyChunk but without transfer encoding stripping.
  55. '''
  56. class HTTPBodyChunk(_DataChunk):
  57. '''
  58. Representing a part of the HTTP body with transfer encoding stripped.
  59. '''
  60. class EndOfRecord(Event):
  61. pass
  62. def iter_warc(f):
  63. # Yields Events
  64. # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
  65. with gzip.open(f, 'rb') as fp:
  66. buf = b''
  67. while True:
  68. # Read WARC header
  69. while b'\r\n\r\n' not in buf:
  70. try:
  71. buf = buf + fp.read(4096)
  72. except EOFError:
  73. break
  74. if not buf:
  75. break
  76. if not buf:
  77. break
  78. warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
  79. assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n')
  80. assert b'\r\nContent-Length:' in warcHeaderBuf
  81. warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
  82. warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
  83. warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
  84. warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
  85. yield BeginOfRecord(warcHeaders, warcHeaderBuf)
  86. recordID = next(x[1] for x in warcHeaders if x[0] == b'WARC-Record-ID')
  87. # Read WARC block (and skip CRLFCRLF at the end of the record)
  88. if len(buf) < warcContentLength + 4:
  89. try:
  90. buf = buf + fp.read(warcContentLength + 4 - len(buf))
  91. except EOFError:
  92. pass
  93. if len(buf) < warcContentLength + 4:
  94. print('Error: truncated WARC', file = sys.stderr)
  95. break
  96. warcContent = buf[:warcContentLength]
  97. buf = buf[warcContentLength + 4:]
  98. # Decode HTTP body if appropriate
  99. if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
  100. httpType = 'request'
  101. elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
  102. httpType = 'response'
  103. else:
  104. httpType = None
  105. if httpType is not None:
  106. if b'\r\n\r\n' in warcContent:
  107. httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)
  108. # Parse headers and extract transfer encoding
  109. httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
  110. chunked = False
  111. gzipped = False
  112. if b'\r\ntransfer-encoding' in httpHeaders.lower():
  113. transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
  114. transferEncodings = set(map(bytes.strip, transferEncoding.split(b',')))
  115. chunked = b'chunked' in transferEncodings
  116. gzipped = b'gzip' in transferEncodings
  117. yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
  118. yield WARCBlockChunk(httpBody, isHttpHeader = False)
  119. yield RawHTTPBodyChunk(httpBody)
  120. # Decode body
  121. if gzipped:
  122. httpDecompressor = GzipDecompressor()
  123. else:
  124. httpDecompressor = DummyDecompressor()
  125. if chunked:
  126. while True:
  127. try:
  128. chunkLineEnd = httpBody.index(b'\r\n')
  129. except ValueError:
  130. print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
  131. break
  132. chunkLine = httpBody[:chunkLineEnd]
  133. if b';' in chunkLine:
  134. chunkLength = chunkLine[:chunkLine.index(b';')].strip()
  135. else:
  136. chunkLength = chunkLine.strip()
  137. if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
  138. print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
  139. break
  140. chunkLength = int(chunkLength, base = 16)
  141. if chunkLength == 0:
  142. break
  143. chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
  144. yield HTTPBodyChunk(chunk)
  145. httpBody = httpBody[chunkLineEnd + 2 + chunkLength + 2:]
  146. else:
  147. yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
  148. else:
  149. print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr)
  150. yield WARCBlockChunk(warcContent)
  151. else:
  152. yield WARCBlockChunk(warcContent)
  153. yield EndOfRecord()
  154. class ProcessMode:
  155. @classmethod
  156. def split_args(cls, args):
  157. '''Split args into arguments to be passed into __init__ and filenames'''
  158. return (), args
  159. def process_event(self, event):
  160. raise NotImplementedError
  161. class Digest:
  162. def __init__(self, digest):
  163. self._digest = digest
  164. def format(self, digest = None):
  165. raise NotImplementedError
  166. def equals(self, digest):
  167. return self._digest == digest
  168. class Base32Digest(Digest):
  169. def format(self, digest = None):
  170. return base64.b32encode(digest if digest else self._digest)
  171. class HexDigest(Digest):
  172. def format(self, digest = None):
  173. return (digest if digest else self._digest).hex()
  174. class VerifyMode(ProcessMode):
  175. def __init__(self):
  176. self._blockDigester = None
  177. self._recordedBlockDigest = None
  178. self._payloadDigester = None
  179. self._brokenPayloadDigester = None
  180. self._recordedPayloadDigest = None
  181. self._printedBrokenPayloadWarning = False
  182. def parse_digest(self, digest):
  183. if not digest.startswith(b'sha1:'):
  184. print('Warning: don\'t understand hash format: {!r}'.format(digest), file = sys.stderr)
  185. return None
  186. if len(digest) == 37 and digest.rstrip(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') == b'sha1:': # 5 for 'sha1:' + 32 for base-32 hash
  187. return Base32Digest(base64.b32decode(digest[5:]))
  188. if len(digest) == 45 and digest.rstrip(b'0123456789abcdef') == b'sha1:':
  189. return HexDigest(bytes.fromhex(digest[5:].decode('ascii')))
  190. return None
  191. def process_event(self, event):
  192. if type(event) is NewFile:
  193. self._printedBrokenPayloadWarning = False
  194. elif type(event) is BeginOfRecord:
  195. if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
  196. self._blockDigester = hashlib.sha1()
  197. self._recordedBlockDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest'))
  198. else:
  199. self._blockDigester = None
  200. self._recordedBlockDigest = None
  201. if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
  202. self._payloadDigester = hashlib.sha1()
  203. self._brokenPayloadDigester = hashlib.sha1()
  204. self._recordedPayloadDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest'))
  205. else:
  206. self._payloadDigester = None
  207. self._brokenPayloadDigester = None
  208. self._recordedPayloadDigest = None
  209. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
  210. self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  211. elif type(event) is WARCBlockChunk:
  212. if self._blockDigester:
  213. self._blockDigester.update(event.data)
  214. elif type(event) is HTTPBodyChunk:
  215. if self._payloadDigester:
  216. self._payloadDigester.update(event.data)
  217. elif type(event) is RawHTTPBodyChunk:
  218. if self._brokenPayloadDigester:
  219. self._brokenPayloadDigester.update(event.data)
  220. elif type(event) is EndOfRecord:
  221. if self._blockDigester and self._recordedBlockDigest:
  222. if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
  223. print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr)
  224. if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
  225. if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
  226. if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
  227. if not self._printedBrokenPayloadWarning:
  228. print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding', file = sys.stderr)
  229. self._printedBrokenPayloadWarning = True
  230. else:
  231. print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr)
  232. class DumpResponsesMode(ProcessMode):
  233. def __init__(self):
  234. self._printEOR = False
  235. self._isResponse = False
  236. def process_event(self, event):
  237. if type(event) is BeginOfRecord:
  238. warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
  239. warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  240. self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
  241. self._printEOR = False
  242. elif type(event) is HTTPBodyChunk:
  243. if self._isResponse:
  244. self._printEOR = True
  245. sys.stdout.buffer.write(event.data)
  246. elif type(event) is EndOfRecord:
  247. if self._printEOR:
  248. sys.stdout.buffer.write(b'\r\n')
  249. class COLOURS:
  250. RESET = b'\x1b[0m'
  251. GREEN = b'\x1b[0;32m'
  252. LIGHTGREEN = b'\x1b[1;32m'
  253. PURPLE = b'\x1b[0;35m'
  254. LIGHTPURPLE = b'\x1b[1;35m'
  255. RED = b'\x1b[0;31m'
  256. INVERTED = b'\x1b[7m'
  257. class ColourMode(ProcessMode):
  258. def __init__(self):
  259. self._hadHttpStatusLine = False
  260. def _replace_esc(self, data):
  261. return data.replace(b'\x1b', COLOURS.INVERTED + b'ESC' + COLOURS.RESET)
  262. def _print_line(self, line, colour, withLF = True, colourOnlyBeforeColon = False):
  263. if colourOnlyBeforeColon:
  264. if b':' in line:
  265. offset = line.index(b':')
  266. else:
  267. offset = 0
  268. else:
  269. offset = len(line)
  270. if offset > 0:
  271. sys.stdout.buffer.write(colour)
  272. sys.stdout.buffer.write(self._replace_esc(line[:offset]))
  273. sys.stdout.buffer.write(COLOURS.RESET)
  274. sys.stdout.buffer.write(line[offset:])
  275. if withLF:
  276. sys.stdout.buffer.write(b'\n')
  277. def _print_data(self, data, colour, colourOnlyBeforeColon):
  278. later = False
  279. for line in data.split(b'\r\n'):
  280. if later:
  281. sys.stdout.buffer.write(b'\n')
  282. self._print_line(line, colour, withLF = False, colourOnlyBeforeColon = colourOnlyBeforeColon)
  283. later = True
  284. def process_event(self, event):
  285. if type(event) is BeginOfRecord:
  286. firstNewline = event.rawData.index(b'\r\n')
  287. self._print_line(event.rawData[:firstNewline], COLOURS.LIGHTGREEN)
  288. self._print_data(event.rawData[firstNewline + 2:], COLOURS.GREEN, True)
  289. sys.stdout.buffer.write(b'\n\n') # separator between header and block
  290. self._hadHttpStatusLine = False
  291. elif type(event) is WARCBlockChunk:
  292. if event.isHttpHeader is True:
  293. if not self._hadHttpStatusLine:
  294. firstNewline = event.data.index(b'\r\n')
  295. self._print_line(event.data[:firstNewline], COLOURS.LIGHTPURPLE)
  296. offset = firstNewline + 2
  297. self._hadHttpStatusLine = True
  298. else:
  299. offset = 0
  300. self._print_data(event.data[offset:], COLOURS.PURPLE, True)
  301. elif event.isHttpHeader is False:
  302. self._print_data(event.data, COLOURS.RED, False)
  303. elif event.isHttpHeader is None:
  304. sys.stdout.buffer.write(self._replace_esc(event.data))
  305. elif type(event) is EndOfRecord:
  306. sys.stdout.buffer.write(b'\n\n')
  307. def main():
  308. processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode}
  309. assert len(sys.argv) - 1 >= 2
  310. mode = sys.argv[1]
  311. assert mode in processorMap
  312. processorArgs, files = processorMap[mode].split_args(sys.argv[2:])
  313. assert files
  314. processor = processorMap[mode](*processorArgs)
  315. try:
  316. for f in files:
  317. print('Info: processing {}'.format(f), file = sys.stderr)
  318. processor.process_event(NewFile(f))
  319. for event in iter_warc(f):
  320. processor.process_event(event)
  321. except BrokenPipeError:
  322. return
  323. if __name__ == '__main__':
  324. main()