Przeglądaj źródła

Require decompressed WARCs with warc-tiny

master
JustAnotherArchivist 2 lat temu
rodzic
commit
74485c399b
1 zmienionych plików z 13 dodań i 1 usunięć
  1. +13
    -1
      warc-tiny

+ 13
- 1
warc-tiny Wyświetl plik

@@ -13,6 +13,7 @@
# warc-tiny verify FILES -- verify the integrity of a WARC by comparing the digests

import base64
import contextlib
import enum
import gzip
import hashlib
@@ -122,11 +123,20 @@ class EndOfRecord(Event):
pass


@contextlib.contextmanager
def open_warc(f):
    """Context manager yielding a binary file object to read WARC data from.

    f: either an object that has a ``read`` attribute, or a value accepted by
       the built-in ``open`` (e.g. a filename).

    An object with ``read`` is yielded unchanged and is NOT closed on exit.
    Anything else is opened with mode ``'rb'`` and closed when the ``with``
    block ends.
    """
    if hasattr(f, 'read'):
        # Already file-like: hand it through as-is and leave closing to the caller.
        yield f
    else:
        with open(f, 'rb') as fp:
            yield fp


def iter_warc(f):
# Yields Events
# BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.

with gzip.open(f, 'rb') as fp:
with open_warc(f) as fp:
buf = b''
while True:
# Read WARC header
@@ -526,6 +536,8 @@ def main():

try:
for f in files:
if f.endswith('.warc.gz') or f.endswith('.warc.zst'):
print(f'Warning: warc-tiny does not support decompressing WARCs like {f}. Please use zcat/zstdcat/zstdwarccat and pipe the decompressed stream into warc-tiny instead.', file = sys.stderr)
print('Info: processing {}'.format(f), file = sys.stderr)
processor.process_event(NewFile(f))
if f == '-':


Ładowanie…
Anuluj
Zapisz