Browse Source

Require decompressed WARCs with warc-tiny

master
JustAnotherArchivist 2 years ago
parent
commit
74485c399b
1 changed file with 13 additions and 1 deletion
  1. +13
    -1
      warc-tiny

+ 13
- 1
warc-tiny View File

@@ -13,6 +13,7 @@
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests # warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests


import base64 import base64
import contextlib
import enum import enum
import gzip import gzip
import hashlib import hashlib
@@ -122,11 +123,20 @@ class EndOfRecord(Event):
pass pass




@contextlib.contextmanager
def open_warc(f):
    # Context manager yielding a readable object for f.
    # If f already has a `read` attribute, it is treated as an open stream and yielded unchanged; it is NOT closed on exit, since this function did not open it.
    # Otherwise, f is passed to open() in binary read mode, and that file object is closed when the context exits.
    if hasattr(f, 'read'):
        yield f
    else:
        with open(f, 'rb') as fp:
            yield fp


def iter_warc(f): def iter_warc(f):
# Yields Events # Yields Events
# BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either. # BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.


with gzip.open(f, 'rb') as fp:
with open_warc(f) as fp:
buf = b'' buf = b''
while True: while True:
# Read WARC header # Read WARC header
@@ -526,6 +536,8 @@ def main():


try: try:
for f in files: for f in files:
if f.endswith('.warc.gz') or f.endswith('.warc.zst'):
print(f'Warning: warc-tiny does not support decompressing WARCs like {f}. Please use zcat/zstdcat/zstdwarccat and pipe the decompressed stream into warc-tiny instead.', file = sys.stderr)
print('Info: processing {}'.format(f), file = sys.stderr) print('Info: processing {}'.format(f), file = sys.stderr)
processor.process_event(NewFile(f)) processor.process_event(NewFile(f))
if f == '-': if f == '-':


Loading…
Cancel
Save