|
|
@@ -142,13 +142,15 @@ def iter_warc(f): |
|
|
|
# Read WARC header |
|
|
|
while b'\r\n\r\n' not in buf: |
|
|
|
try: |
|
|
|
buf = buf + fp.read(16777216) |
|
|
|
d = fp.read(16777216) |
|
|
|
except EOFError: |
|
|
|
break |
|
|
|
if not buf: |
|
|
|
if not d: |
|
|
|
break |
|
|
|
buf += d |
|
|
|
if not buf: |
|
|
|
break |
|
|
|
assert b'\r\n\r\n' in buf |
|
|
|
warcHeaderBuf, buf = buf.split(b'\r\n\r\n', 1) |
|
|
|
assert warcHeaderBuf.startswith(b'WARC/1.0\r\n') or warcHeaderBuf.startswith(b'WARC/1.1\r\n') |
|
|
|
assert b'\r\nContent-Length:' in warcHeaderBuf |
|
|
|