|
|
@@ -130,6 +130,7 @@ class EndOfRecord(Event): |
|
|
|
class WARCParsingIssue(enum.Enum):
    """Kinds of problems that can be reported while parsing a WARC file.

    Members are numbered with ``enum.auto()``, so their values follow the
    declaration order below; keep that order stable if the values are ever
    persisted or compared numerically.
    """

    # NOTE(review): presumably reported when the input ends in the middle of
    # a record -- the code that emits it is not visible here; confirm.
    TRUNCATED_FILE = enum.auto()

    # NOTE(review): presumably reported when a record's HTTP payload cannot
    # be parsed -- the code that emits it is not visible here; confirm.
    MALFORMED_HTTP_RECORD = enum.auto()

    # Reported by iter_warc (wrapped in a WARCParsingIssueEvent) when the
    # very first read attempt yields no bytes at all, i.e. before any record
    # has been seen.
    EMPTY_FILE = enum.auto()
|
|
|
|
|
|
|
|
|
|
|
class WARCParsingIssueEvent(Event): |
|
|
@@ -157,6 +158,7 @@ def iter_warc(f): |
|
|
|
|
|
|
|
with open_warc(f) as fp: |
|
|
|
buf = b'' |
|
|
|
isEmpty = True |
|
|
|
while True: |
|
|
|
# Read WARC header |
|
|
|
while b'\r\n\r\n' not in buf: |
|
|
@@ -168,7 +170,11 @@ def iter_warc(f): |
|
|
|
break |
|
|
|
buf += d |
|
|
|
if not buf: |
|
|
|
if isEmpty: |
|
|
|
print('Error: empty file', file = sys.stderr) |
|
|
|
yield WARCParsingIssueEvent(WARCParsingIssue.EMPTY_FILE) |
|
|
|
break |
|
|
|
isEmpty = False |
|
|
|
assert b'\r\n\r\n' in buf |
|
|
|
warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1) |
|
|
|
assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n') |
|
|
|