Browse Source

Fix empty files being considered valid WARCs

master
JustAnotherArchivist 9 months ago
parent
commit
4ff212eb20
1 changed file with 6 additions and 0 deletions
  1. +6
    -0
      warc-tiny

+ 6
- 0
warc-tiny View File

@@ -130,6 +130,7 @@ class EndOfRecord(Event):
class WARCParsingIssue(enum.Enum):
    """Kinds of problems that can be reported while parsing a WARC file.

    Members get automatically assigned values via ``enum.auto()``; compare
    against the members themselves rather than relying on the numbers.
    """

    # NOTE(review): presumably the input ended in the middle of a record --
    # confirm against the parser code that yields this.
    TRUNCATED_FILE = enum.auto()
    # NOTE(review): presumably a record's HTTP payload could not be parsed --
    # confirm against the parser code that yields this.
    MALFORMED_HTTP_RECORD = enum.auto()
    # The input contained no data at all: iter_warc yields this when the very
    # first read returns nothing, so an empty file is not treated as valid.
    EMPTY_FILE = enum.auto()


class WARCParsingIssueEvent(Event):
@@ -157,6 +158,7 @@ def iter_warc(f):

with open_warc(f) as fp:
buf = b''
isEmpty = True
while True:
# Read WARC header
while b'\r\n\r\n' not in buf:
@@ -168,7 +170,11 @@ def iter_warc(f):
break
buf += d
if not buf:
if isEmpty:
print('Error: empty file', file = sys.stderr)
yield WARCParsingIssueEvent(WARCParsingIssue.EMPTY_FILE)
break
isEmpty = False
assert b'\r\n\r\n' in buf
warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1)
assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n')


Loading…
Cancel
Save