Browse Source

Require decompressed WARCs with warc-tiny

master
JustAnotherArchivist 2 years ago
parent
commit
74485c399b
1 changed file with 13 additions and 1 deletion
  1. +13
    -1
      warc-tiny

+ 13
- 1
warc-tiny View File

@@ -13,6 +13,7 @@
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests

import base64
import contextlib
import enum
import gzip
import hashlib
@@ -122,11 +123,20 @@ class EndOfRecord(Event):
pass


@contextlib.contextmanager
def open_warc(f):
    """Context manager yielding a binary stream for a WARC source.

    If *f* has a ``read`` attribute it is treated as an already-open
    stream: it is yielded unchanged and is NOT closed on exit, so the
    caller keeps ownership of it. Otherwise *f* is passed to
    ``open(f, 'rb')`` and the resulting file object is closed when the
    ``with`` block is left.

    No decompression is performed; the stream is yielded as-is.
    """
    if hasattr(f, 'read'):
        # Caller-owned stream: hand it through without taking ownership.
        yield f
    else:
        with open(f, 'rb') as fp:
            yield fp


def iter_warc(f):
# Yields Events
# BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.

with gzip.open(f, 'rb') as fp:
with open_warc(f) as fp:
buf = b''
while True:
# Read WARC header
@@ -526,6 +536,8 @@ def main():

try:
for f in files:
if f.endswith('.warc.gz') or f.endswith('.warc.zst'):
print(f'Warning: warc-tiny does not support decompressing WARCs like {f}. Please use zcat/zstdcat/zstdwarccat and pipe the decompressed stream into warc-tiny instead.', file = sys.stderr)
print('Info: processing {}'.format(f), file = sys.stderr)
processor.process_event(NewFile(f))
if f == '-':


Loading…
Cancel
Save