|
|
@@ -13,6 +13,7 @@ |
|
|
|
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests |
|
|
|
|
|
|
|
import base64 |
|
|
|
import contextlib |
|
|
|
import enum |
|
|
|
import gzip |
|
|
|
import hashlib |
|
|
@@ -122,11 +123,20 @@ class EndOfRecord(Event): |
|
|
|
pass |
|
|
|
|
|
|
|
|
|
|
|
@contextlib.contextmanager
def open_warc(f):
    '''Context manager yielding a binary file object for a WARC source.

    f is either an object exposing a `read` attribute, which is yielded
    unchanged and is NOT closed on exit (it stays owned by the caller), or
    anything else, which is passed to `open(f, 'rb')`; that file is closed
    when the `with` block is left.
    '''
    if not hasattr(f, 'read'):
        # Not file-like: treat it as a path and manage the handle ourselves.
        with open(f, 'rb') as fp:
            yield fp
        return
    # Already file-like: hand it through untouched.
    yield f
|
|
|
|
|
|
|
|
|
|
|
def iter_warc(f): |
|
|
|
# Yields Events |
|
|
|
# BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either. |
|
|
|
|
|
|
|
with gzip.open(f, 'rb') as fp: |
|
|
|
with open_warc(f) as fp: |
|
|
|
buf = b'' |
|
|
|
while True: |
|
|
|
# Read WARC header |
|
|
@@ -526,6 +536,8 @@ def main(): |
|
|
|
|
|
|
|
try: |
|
|
|
for f in files: |
|
|
|
if f.endswith('.warc.gz') or f.endswith('.warc.zst'): |
|
|
|
print(f'Warning: warc-tiny does not support decompressing WARCs like {f}. Please use zcat/zstdcat/zstdwarccat and pipe the decompressed stream into warc-tiny instead.', file = sys.stderr) |
|
|
|
print('Info: processing {}'.format(f), file = sys.stderr) |
|
|
|
processor.process_event(NewFile(f)) |
|
|
|
if f == '-': |
|
|
|