Require decompressed WARCs with warc-tiny

2 years ago · 74485c399b
--- a/+ 13
+++ b/+ 13
@@ -13,6 +13,7 @@
 #  warc-tiny verify FILES  --  verify the integrity of a WARC by comparing the digests
 import base64
 import contextlib
 import enum
 import gzip
 import hashlib
@@ -122,11 +123,20 @@ class EndOfRecord(Event):
 	pass
@contextlib.contextmanager
 def open_warc(f):
 	# Context manager that yields a readable binary stream for f.
 	# If f already has a 'read' attribute, it is yielded unchanged and is not closed here.
 	# Otherwise, f is passed to open() in 'rb' mode, and that file is closed when the context exits.
 	# No decompression is performed in either case; the stream is handed over as-is.
 	if hasattr(f, 'read'):
 		yield f
 	else:
 		with open(f, 'rb') as fp:
 			yield fp
 def iter_warc(f):
 	# Yields Events
 	# BeginOfRecord's rawData does not include the CRLF CRLF at the end of the headers, and WARCBlockChunk does not contain the CRLF CRLF after the block either.
 	with gzip.open(f, 'rb') as fp:
 	with open_warc(f) as fp:
 		buf = b''
 		while True:
 			# Read WARC header
@@ -526,6 +536,8 @@ def main():
 	try:
 		for f in files:
 			if f.endswith('.warc.gz') or f.endswith('.warc.zst'):
 				print(f'Warning: warc-tiny does not support decompressing WARCs like {f}. Please use zcat/zstdcat/zstdwarccat and pipe the decompressed stream into warc-tiny instead.', file = sys.stderr)
 			print('Info: processing {}'.format(f), file = sys.stderr)
 			processor.process_event(NewFile(f))
 			if f == '-':