The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

600 lines
21 KiB

  1. #!/usr/bin/env python3
  2. # Tiny tool for WARC stuff.
  3. # Operating modes:
  4. # warc-tiny colour FILES -- coloured output of the WARCs for easier reading
  5. # warc-tiny dump-responses [-m|--meta] FILES -- dump the HTTP response bodies to stdout
  6. # With --meta, prefix every line with the filename, record offset, record ID, and target URI; e.g. 'file.warc.gz:123:<urn:uuid:41b76f1f-f946-4723-91f8-cee6491e92f3>:<https://example.org/>: foobar'
  7. # The record offset may be -1 if it is not known.
  8. # The filename is wrapped in angled brackets if it contains a colon; the target URI is always wrapped in angled brackets (since it virtually always contains a colon).
  9. # warc-tiny scrape [-u|--urls] FILES -- extract all links and page requisites from the records; produces lines of filename, record offset, record URI, link type, inline flag, and URL as JSONL
  10. # With --urls, only the URL is printed.
  11. # wpull's scrapers are used for the extraction.
  12. # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
  13. import base64
  14. import contextlib
  15. import enum
  16. import gzip
  17. import hashlib
  18. import json
  19. import sys
  20. import tempfile
  21. import zlib
  22. try:
  23. import wpull.body
  24. import wpull.document.htmlparse.lxml_
  25. try:
  26. import wpull.protocol.http.request as wpull_protocol_http_request # wpull 2.x
  27. except ImportError:
  28. import wpull.http.request as wpull_protocol_http_request # wpull 1.x
  29. import wpull.scraper.base
  30. import wpull.scraper.css
  31. import wpull.scraper.html
  32. import wpull.scraper.javascript
  33. import wpull.scraper.sitemap
  34. except ImportError:
  35. wpull = None
  36. def GzipDecompressor():
  37. return zlib.decompressobj(16 + zlib.MAX_WBITS)
class DummyDecompressor:
    '''Pass-through object offering the same decompress() method as a real decompressor, for bodies that are not compressed.'''

    def decompress(self, data):
        '''Return data unchanged.'''
        return data
class Event:
    '''Base class of everything yielded by iter_warc and passed to ProcessMode.process_event.'''
    pass
  43. class FileEvent(Event):
  44. def __init__(self, filename):
  45. self._filename = filename
  46. @property
  47. def filename(self):
  48. return self._filename
class NewFile(FileEvent):
    '''Sent to the processor by main() before the events of a file.'''
    pass
  51. class BeginOfRecord(Event):
  52. def __init__(self, warcHeaders, rawData):
  53. self._warcHeaders = warcHeaders
  54. self._rawData = rawData
  55. @property
  56. def warcHeaders(self):
  57. return self._warcHeaders
  58. @property
  59. def rawData(self):
  60. return self._rawData
  61. class HTTPHeaders(Event):
  62. def __init__(self, headers):
  63. self._headers = headers
  64. @property
  65. def headers(self):
  66. return self._headers
  67. class _DataChunk(Event):
  68. def __init__(self, data):
  69. self._data = data
  70. @property
  71. def data(self):
  72. return self._data
  73. def __repr__(self):
  74. return '{}({!r}{})'.format(type(self).__name__, self._data[:50], '...' if len(self._data) > 50 else '')
  75. class WARCBlockChunk(_DataChunk):
  76. def __init__(self, data, isHttpHeader = None):
  77. super().__init__(data)
  78. self._isHttpHeader = isHttpHeader
  79. @property
  80. def isHttpHeader(self):
  81. # True: the chunk represents (part of) the HTTP header; False: the chunk represents (part of) the HTTP body; None: the chunk is not part of an HTTP record
  82. return self._isHttpHeader
class RawHTTPBodyChunk(_DataChunk):
    '''
    Representing a part of the HTTP body exactly as recorded, i.e. like HTTPBodyChunk but without transfer encoding stripping.

    Because many tools misunderstood the WARC specifications, the Payload-Digest was often calculated without stripping the transfer encoding.
    This event makes it possible to recognise such digests.
    '''
class HTTPBodyChunk(_DataChunk):
    '''
    Representing a part of the HTTP body with transfer encoding stripped.

    iter_warc emits one of these per chunk for chunked bodies and a single one for the whole body otherwise.
    '''
class EndOfRecord(Event):
    '''Emitted by iter_warc after all other events of a record.'''
    pass
class WARCParsingIssue(enum.Enum):
    '''Kinds of problems iter_warc reports through WARCParsingIssueEvent.'''
    # The input ended before the record block and its trailing CRLF CRLF were complete
    TRUNCATED_FILE = enum.auto()
    # An HTTP request/response record without CRLF CRLF or with broken chunked encoding
    MALFORMED_HTTP_RECORD = enum.auto()
    # The input contained no data at all
    EMPTY_FILE = enum.auto()
  98. class WARCParsingIssueEvent(Event):
  99. def __init__(self, issue, message = None):
  100. self.issue = issue
  101. self.message = message
class EndOfFile(FileEvent):
    '''Sent to the processor by main() after all events of a file.'''
    pass
  104. @contextlib.contextmanager
  105. def open_warc(f):
  106. if hasattr(f, 'read'):
  107. yield f
  108. else:
  109. with open(f, 'rb') as fp:
  110. yield fp
  111. def iter_warc(f):
  112. # Yields Events
  113. # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
  114. with open_warc(f) as fp:
  115. buf = b''
  116. isEmpty = True
  117. while True:
  118. # Read WARC header
  119. while b'\r\n\r\n' not in buf:
  120. try:
  121. d = fp.read(16777216)
  122. except EOFError:
  123. break
  124. if not d:
  125. break
  126. buf += d
  127. if not buf:
  128. if isEmpty:
  129. print('Error: empty file', file = sys.stderr)
  130. yield WARCParsingIssueEvent(WARCParsingIssue.EMPTY_FILE)
  131. break
  132. isEmpty = False
  133. assert b'\r\n\r\n' in buf
  134. warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
  135. assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n')
  136. assert b'\r\nContent-Length:' in warcHeaderBuf
  137. warcHeaders = tuple(tuple(map(bytes.strip, x.split(b':', 1))) for x in warcHeaderBuf.split(b'\r\n'))
  138. warcContentType = next(x[1] for x in warcHeaders if x[0] == b'Content-Type')
  139. warcContentLength = int(next(x[1] for x in warcHeaders if x[0] == b'Content-Length'))
  140. warcType = next(x[1] for x in warcHeaders if x[0] == b'WARC-Type')
  141. yield BeginOfRecord(warcHeaders, warcHeaderBuf)
  142. recordID = next(x[1] for x in warcHeaders if x[0] == b'WARC-Record-ID')
  143. # Read WARC block (and skip CRLFCRLF at the end of the record)
  144. if len(buf) < warcContentLength + 4:
  145. try:
  146. buf = buf + fp.read(warcContentLength + 4 - len(buf))
  147. except EOFError:
  148. pass
  149. if len(buf) < warcContentLength + 4:
  150. print('Error: truncated WARC', file = sys.stderr)
  151. yield WARCParsingIssueEvent(WARCParsingIssue.TRUNCATED_FILE)
  152. break
  153. warcContent = buf[:warcContentLength]
  154. buf = buf[warcContentLength + 4:]
  155. # Decode HTTP body if appropriate
  156. if warcContentType in (b'application/http;msgtype=request', b'application/http; msgtype=request') and warcType == b'request':
  157. httpType = 'request'
  158. elif warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response':
  159. httpType = 'response'
  160. else:
  161. httpType = None
  162. if httpType is not None:
  163. if b'\r\n\r\n' in warcContent:
  164. httpHeaders, httpBody = warcContent.split(b'\r\n\r\n', 1)
  165. # Parse headers and extract transfer encoding
  166. httpHeaderLines = [tuple(map(bytes.strip, x.split(b':', 1))) for x in httpHeaders.split(b'\r\n')]
  167. chunked = False
  168. gzipped = False
  169. if b'\r\ntransfer-encoding' in httpHeaders.lower():
  170. transferEncoding = next(x[1] for x in httpHeaderLines if x[0].lower() == b'transfer-encoding')
  171. transferEncodings = set(map(bytes.strip, transferEncoding.split(b',')))
  172. chunked = b'chunked' in transferEncodings
  173. gzipped = b'gzip' in transferEncodings
  174. yield WARCBlockChunk(httpHeaders + b'\r\n\r\n', isHttpHeader = True)
  175. yield HTTPHeaders(httpHeaderLines)
  176. yield WARCBlockChunk(httpBody, isHttpHeader = False)
  177. yield RawHTTPBodyChunk(httpBody)
  178. # Decode body
  179. if gzipped:
  180. httpDecompressor = GzipDecompressor()
  181. else:
  182. httpDecompressor = DummyDecompressor()
  183. if chunked:
  184. pos = 0
  185. while True:
  186. try:
  187. chunkLineEnd = httpBody.index(b'\r\n', pos)
  188. except ValueError:
  189. message = 'could not find chunk line end in record {}'.format(recordID)
  190. print('Error: {}, skipping'.format(message), file = sys.stderr)
  191. yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message)
  192. break
  193. chunkLine = httpBody[pos:chunkLineEnd]
  194. if b';' in chunkLine:
  195. chunkLength = chunkLine[:chunkLine.index(b';')].strip()
  196. else:
  197. chunkLength = chunkLine.strip()
  198. if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
  199. message = 'malformed chunk length {!r} in record {}'.format(chunkLength, recordID)
  200. print('Error: {}, skipping'.format(message), file = sys.stderr)
  201. yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message)
  202. break
  203. chunkLength = int(chunkLength, base = 16)
  204. if chunkLength == 0:
  205. break
  206. chunk = httpDecompressor.decompress(httpBody[chunkLineEnd + 2 : chunkLineEnd + 2 + chunkLength])
  207. yield HTTPBodyChunk(chunk)
  208. pos = chunkLineEnd + 2 + chunkLength + 2
  209. else:
  210. yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
  211. else:
  212. message = 'malformed HTTP request or response in record {}'.format(recordID)
  213. print('Warning: {}, skipping'.format(message), file = sys.stderr)
  214. yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message)
  215. yield WARCBlockChunk(warcContent)
  216. else:
  217. yield WARCBlockChunk(warcContent)
  218. yield EndOfRecord()
  219. class ProcessMode:
  220. @classmethod
  221. def split_args(cls, args):
  222. '''Split args into arguments to be passed into __init__ and filenames'''
  223. return (), args
  224. def process_event(self, event):
  225. raise NotImplementedError
  226. class Digest:
  227. def __init__(self, digest):
  228. self._digest = digest
  229. def format(self, digest = None):
  230. raise NotImplementedError
  231. def equals(self, digest):
  232. return self._digest == digest
  233. class Base32Digest(Digest):
  234. def format(self, digest = None):
  235. return base64.b32encode(digest if digest else self._digest)
  236. class HexDigest(Digest):
  237. def format(self, digest = None):
  238. return (digest if digest else self._digest).hex()
class VerificationError(Exception):
    '''Raised by VerifyMode at the end of a file in which a digest mismatch or parsing issue was encountered.'''
    pass
  241. class VerifyMode(ProcessMode):
  242. def __init__(self):
  243. self._blockDigester = None
  244. self._recordedBlockDigest = None
  245. self._payloadDigester = None
  246. self._brokenPayloadDigester = None
  247. self._recordedPayloadDigest = None
  248. self._printedBrokenPayloadWarning = False
  249. self._verificationFailed = False
  250. def parse_digest(self, digest):
  251. if not digest.startswith(b'sha1:'):
  252. print('Warning: don\'t understand hash format: {!r}'.format(digest), file = sys.stderr)
  253. return None
  254. if len(digest) == 37 and digest.rstrip(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') == b'sha1:': # 5 for 'sha1:' + 32 for base-32 hash
  255. return Base32Digest(base64.b32decode(digest[5:]))
  256. if len(digest) == 45 and digest.rstrip(b'0123456789abcdef') == b'sha1:':
  257. return HexDigest(bytes.fromhex(digest[5:].decode('ascii')))
  258. return None
  259. def process_event(self, event):
  260. if type(event) is NewFile:
  261. self._printedBrokenPayloadWarning = False
  262. self._verificationFailed = False
  263. elif type(event) is BeginOfRecord:
  264. if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
  265. self._blockDigester = hashlib.sha1()
  266. self._recordedBlockDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Block-Digest'))
  267. else:
  268. self._blockDigester = None
  269. self._recordedBlockDigest = None
  270. if any(x[0] == b'WARC-Payload-Digest' for x in event.warcHeaders):
  271. self._payloadDigester = hashlib.sha1()
  272. self._brokenPayloadDigester = hashlib.sha1()
  273. self._recordedPayloadDigest = self.parse_digest(next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Payload-Digest'))
  274. else:
  275. self._payloadDigester = None
  276. self._brokenPayloadDigester = None
  277. self._recordedPayloadDigest = None
  278. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID')
  279. self._recordType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  280. elif type(event) is WARCBlockChunk:
  281. if self._blockDigester:
  282. self._blockDigester.update(event.data)
  283. elif type(event) is HTTPBodyChunk:
  284. if self._payloadDigester:
  285. self._payloadDigester.update(event.data)
  286. elif type(event) is RawHTTPBodyChunk:
  287. if self._brokenPayloadDigester:
  288. self._brokenPayloadDigester.update(event.data)
  289. elif type(event) is WARCParsingIssueEvent:
  290. self._verificationFailed = True
  291. elif type(event) is EndOfRecord:
  292. if self._blockDigester and self._recordedBlockDigest:
  293. if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
  294. print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr)
  295. self._verificationFailed = True
  296. if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
  297. if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
  298. if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
  299. if not self._printedBrokenPayloadWarning:
  300. print('Warning: WARC uses incorrect payload digests without stripping the transfer encoding', file = sys.stderr)
  301. self._printedBrokenPayloadWarning = True
  302. else:
  303. print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr)
  304. self._verificationFailed = True
  305. elif type(event) is EndOfFile and self._verificationFailed:
  306. raise VerificationError('one or more errors encountered while verifying {}'.format(event.filename))
  307. class DumpResponsesMode(ProcessMode):
  308. @classmethod
  309. def split_args(cls, args):
  310. if args[0] == '-m' or args[0] == '--meta':
  311. return (True,), args[1:]
  312. return (False,), args
  313. def __init__(self, withMeta):
  314. self._printEOR = False
  315. self._isResponse = False
  316. self._withMeta = withMeta
  317. if withMeta:
  318. self._recordID = None
  319. self._targetURI = None
  320. self._buffer = b''
  321. def _write(self, data):
  322. if not self._withMeta:
  323. sys.stdout.buffer.write(data)
  324. return
  325. buf = self._buffer + data
  326. lines = buf.split(b'\n')
  327. self._buffer = lines.pop() # Since there's an explicit `_write(b'\r\n')` at the end of the record, this implicitly resets the buffer as well
  328. for line in lines:
  329. sys.stdout.buffer.write(':'.join((self._filename, '-1', self._recordID, '<' + self._targetURI + '>', '')).encode('utf-8'))
  330. sys.stdout.buffer.write(line)
  331. sys.stdout.buffer.write(b'\n')
  332. def process_event(self, event):
  333. if type(event) is NewFile:
  334. self._filename = event.filename
  335. if ':' in self._filename:
  336. self._filename = '<' + self._filename + '>'
  337. elif type(event) is BeginOfRecord:
  338. warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
  339. warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  340. self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
  341. self._printEOR = False
  342. if self._withMeta:
  343. # Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
  344. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
  345. self._targetURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
  346. self._buffer = b''
  347. elif type(event) is HTTPBodyChunk:
  348. if self._isResponse:
  349. self._printEOR = True
  350. self._write(event.data)
  351. elif type(event) is EndOfRecord:
  352. if self._printEOR:
  353. self._write(b'\r\n')
class COLOURS:
    '''Terminal escape sequences (as bytes) used by ColourMode to colour its output.'''
    # Ends any colouring started by one of the sequences below
    RESET = b'\x1b[0m'
    GREEN = b'\x1b[0;32m'
    LIGHTGREEN = b'\x1b[1;32m'
    PURPLE = b'\x1b[0;35m'
    LIGHTPURPLE = b'\x1b[1;35m'
    RED = b'\x1b[0;31m'
    # Used to make escape characters from the data visible
    INVERTED = b'\x1b[7m'
  362. class ColourMode(ProcessMode):
  363. def __init__(self):
  364. self._hadHttpStatusLine = False
  365. def _replace_esc(self, data):
  366. return data.replace(b'\x1b', COLOURS.INVERTED + b'ESC' + COLOURS.RESET)
  367. def _print_line(self, line, colour, withLF = True, colourOnlyBeforeColon = False):
  368. if colourOnlyBeforeColon:
  369. if b':' in line:
  370. offset = line.index(b':')
  371. else:
  372. offset = 0
  373. else:
  374. offset = len(line)
  375. if offset > 0:
  376. sys.stdout.buffer.write(colour)
  377. sys.stdout.buffer.write(self._replace_esc(line[:offset]))
  378. sys.stdout.buffer.write(COLOURS.RESET)
  379. sys.stdout.buffer.write(line[offset:])
  380. if withLF:
  381. sys.stdout.buffer.write(b'\n')
  382. def _print_data(self, data, colour, colourOnlyBeforeColon):
  383. later = False
  384. for line in data.split(b'\r\n'):
  385. if later:
  386. sys.stdout.buffer.write(b'\n')
  387. self._print_line(line, colour, withLF = False, colourOnlyBeforeColon = colourOnlyBeforeColon)
  388. later = True
  389. def process_event(self, event):
  390. if type(event) is BeginOfRecord:
  391. firstNewline = event.rawData.index(b'\r\n')
  392. self._print_line(event.rawData[:firstNewline], COLOURS.LIGHTGREEN)
  393. self._print_data(event.rawData[firstNewline + 2:], COLOURS.GREEN, True)
  394. sys.stdout.buffer.write(b'\n\n') # separator between header and block
  395. self._hadHttpStatusLine = False
  396. elif type(event) is WARCBlockChunk:
  397. if event.isHttpHeader is True:
  398. if not self._hadHttpStatusLine:
  399. firstNewline = event.data.index(b'\r\n')
  400. self._print_line(event.data[:firstNewline], COLOURS.LIGHTPURPLE)
  401. offset = firstNewline + 2
  402. self._hadHttpStatusLine = True
  403. else:
  404. offset = 0
  405. self._print_data(event.data[offset:], COLOURS.PURPLE, True)
  406. elif event.isHttpHeader is False:
  407. self._print_data(event.data, COLOURS.RED, False)
  408. elif event.isHttpHeader is None:
  409. sys.stdout.buffer.write(self._replace_esc(event.data))
  410. elif type(event) is EndOfRecord:
  411. sys.stdout.buffer.write(b'\n\n')
  412. class ScrapeMode(ProcessMode):
  413. @classmethod
  414. def split_args(cls, args):
  415. if args[0] == '-u' or args[0] == '--urls':
  416. return (True,), args[1:]
  417. return (False,), args
  418. def __init__(self, urlsOnly):
  419. self._urlsOnly = urlsOnly
  420. assert wpull is not None, 'Scrape mode requires wpull and lxml'
  421. htmlParser = wpull.document.htmlparse.lxml_.HTMLParser()
  422. elementWalker = wpull.scraper.html.ElementWalker()
  423. scrapers = []
  424. scrapers.append(wpull.scraper.html.HTMLScraper(htmlParser, elementWalker))
  425. scrapers.append(wpull.scraper.css.CSSScraper())
  426. elementWalker.css_scraper = scrapers[-1]
  427. scrapers.append(wpull.scraper.javascript.JavaScriptScraper())
  428. elementWalker.javascript_scraper = scrapers[-1]
  429. scrapers.append(wpull.scraper.sitemap.SitemapScraper(htmlParser))
  430. self._scraper = wpull.scraper.base.DemuxDocumentScraper(scrapers)
  431. self._isResponse = None
  432. self._body = None
  433. self._recordURI = None
  434. self._statusCode = None
  435. self._statusReason = None
  436. if not self._urlsOnly:
  437. self._filename = None
  438. self._recordID = None
  439. def process_event(self, event):
  440. if type(event) is NewFile and not self._urlsOnly:
  441. self._filename = event.filename
  442. elif type(event) is BeginOfRecord:
  443. warcContentType = next(x[1] for x in event.warcHeaders if x[0] == b'Content-Type')
  444. warcType = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Type')
  445. self._isResponse = warcContentType in (b'application/http;msgtype=response', b'application/http; msgtype=response') and warcType == b'response'
  446. if self._isResponse:
  447. self._body = wpull.body.Body(file = tempfile.SpooledTemporaryFile(max_size = 10485760)) # Up to 10 MiB in memory
  448. self._printEOR = False
  449. if not self._urlsOnly:
  450. # Both of these are URIs, and per RFC 3986, those can only contain ASCII characters.
  451. self._recordID = next(x[1] for x in event.warcHeaders if x[0] == b'WARC-Record-ID').decode('ascii')
  452. self._recordURI = next((x[1] for x in event.warcHeaders if x[0] == b'WARC-Target-URI'), b'').decode('ascii')
  453. elif type(event) is HTTPHeaders and self._isResponse:
  454. assert len(event.headers[0]) == 1 and event.headers[0][0].startswith(b'HTTP/'), 'malformed HTTP response'
  455. _, statusCode, reason = event.headers[0][0].decode('ascii').split(' ', 2)
  456. self._statusCode = int(statusCode)
  457. self._statusReason = reason
  458. elif type(event) is HTTPBodyChunk and self._isResponse:
  459. self._body.write(event.data)
  460. elif type(event) is EndOfRecord and self._isResponse:
  461. request = wpull_protocol_http_request.Request(self._recordURI)
  462. response = wpull_protocol_http_request.Response(self._statusCode, self._statusReason)
  463. response.body = self._body
  464. response.body.seek(0)
  465. for scraper, scrapeResult in self._scraper.scrape_info(request, response).items():
  466. if not scrapeResult:
  467. continue
  468. for linkContext in scrapeResult.link_contexts:
  469. if self._urlsOnly:
  470. print(linkContext.link)
  471. continue
  472. o = {
  473. 'filename': self._filename,
  474. 'recordOffset': None,
  475. 'recordID': self._recordID,
  476. 'recordURI': self._recordURI,
  477. 'linkType': linkContext.link_type.value if isinstance(linkContext.link_type, enum.Enum) else linkContext.link_type,
  478. 'inline': bool(linkContext.inline), # Needs manual casting; https://github.com/ArchiveTeam/wpull/issues/458
  479. 'linked': bool(linkContext.linked),
  480. 'url': linkContext.link,
  481. }
  482. print(json.dumps(o))
  483. def main():
  484. processorMap = {'verify': VerifyMode, 'dump-responses': DumpResponsesMode, 'colour': ColourMode, 'scrape': ScrapeMode}
  485. assert len(sys.argv) - 1 >= 2
  486. mode = sys.argv[1]
  487. assert mode in processorMap
  488. processorArgs, files = processorMap[mode].split_args(sys.argv[2:])
  489. assert files
  490. processor = processorMap[mode](*processorArgs)
  491. try:
  492. for f in files:
  493. if f.endswith('.warc.gz') or f.endswith('.warc.zst'):
  494. print(f'Warning: warc-tiny does not support decompressing WARCs like {f}. Please use zcat/zstdcat/zstdwarccat and pipe the decompressed stream into warc-tiny instead.', file = sys.stderr)
  495. print('Info: processing {}'.format(f), file = sys.stderr)
  496. processor.process_event(NewFile(f))
  497. if f == '-':
  498. f = sys.stdin.buffer
  499. for event in iter_warc(f):
  500. processor.process_event(event)
  501. processor.process_event(EndOfFile(f))
  502. except BrokenPipeError:
  503. return
# Run the command-line interface only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()