warc-tiny: read WARC input as-is instead of through gzip

Summary of the change carried by this patch:
  * Adds open_warc(f), a context manager that yields f unchanged when it has
    a 'read' attribute and otherwise opens f in binary mode ('rb').
  * iter_warc() uses open_warc(f) in place of gzip.open(f, 'rb').
  * main() prints a warning to stderr for file names ending in .warc.gz or
    .warc.zst, then continues processing the file.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Blank context lines were placed so that each
hunk matches its @@ line counts; indentation is assumed to be tabs -- confirm
against the target file before applying.

diff --git a/warc-tiny b/warc-tiny
index 6543d08..85ede06 100755
--- a/warc-tiny
+++ b/warc-tiny
@@ -13,6 +13,7 @@
 # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests
 
 import base64
+import contextlib
 import enum
 import gzip
 import hashlib
@@ -122,11 +123,20 @@ class EndOfRecord(Event):
 	pass
 
 
+@contextlib.contextmanager
+def open_warc(f):
+	if hasattr(f, 'read'):
+		yield f
+	else:
+		with open(f, 'rb') as fp:
+			yield fp
+
+
 def iter_warc(f):
 	# Yields Events
 	# BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
 
-	with gzip.open(f, 'rb') as fp:
+	with open_warc(f) as fp:
 		buf = b''
 		while True:
 			# Read WARC header
@@ -526,6 +536,8 @@ def main():
 
 	try:
 		for f in files:
+			if f.endswith('.warc.gz') or f.endswith('.warc.zst'):
+				print(f'Warning: warc-tiny does not support decompressing WARCs like {f}. Please use zcat/zstdcat/zstdwarccat and pipe the decompressed stream into warc-tiny instead.', file = sys.stderr)
 			print('Info: processing {}'.format(f), file = sys.stderr)
 			processor.process_event(NewFile(f))
 			if f == '-':