diff --git a/warc-tiny b/warc-tiny index 6ae34c3..24a8c1f 100755 --- a/warc-tiny +++ b/warc-tiny @@ -130,6 +130,7 @@ class EndOfRecord(Event): class WARCParsingIssue(enum.Enum): TRUNCATED_FILE = enum.auto() MALFORMED_HTTP_RECORD = enum.auto() + EMPTY_FILE = enum.auto() class WARCParsingIssueEvent(Event): @@ -157,6 +158,7 @@ def iter_warc(f): with open_warc(f) as fp: buf = b'' + isEmpty = True while True: # Read WARC header while b'\r\n\r\n' not in buf: @@ -168,7 +170,11 @@ def iter_warc(f): break buf += d if not buf: + if isEmpty: + print('Error: empty file', file = sys.stderr) + yield WARCParsingIssueEvent(WARCParsingIssue.EMPTY_FILE) break + isEmpty = False assert b'\r\n\r\n' in buf warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1) assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n')