Browse Source

Raise an error when verification fails

master
JustAnotherArchivist 9 months ago
parent
commit
828dae2597
1 changed files with 43 additions and 4 deletions
  1. +43
    -4
      warc-tiny

+ 43
- 4
warc-tiny View File

@@ -51,7 +51,7 @@ class Event:
pass


class NewFile(Event):
class FileEvent(Event):
def __init__(self, filename):
self._filename = filename

@@ -60,6 +60,10 @@ class NewFile(Event):
return self._filename


class NewFile(FileEvent):
    """File event marking a new input file.

    Adds nothing beyond FileEvent. VerifyMode.process_event reacts to it by
    resetting its per-file flags.
    """


class BeginOfRecord(Event):
def __init__(self, warcHeaders, rawData):
self._warcHeaders = warcHeaders
@@ -123,6 +127,21 @@ class EndOfRecord(Event):
pass


class WARCParsingIssue(enum.Enum):
    """Categories of problems that iter_warc reports via WARCParsingIssueEvent."""

    # The buffered data is shorter than the record's content length plus its
    # 4-byte trailer.
    TRUNCATED_FILE = enum.auto()

    # The HTTP part of a record could not be parsed: a chunk line end is
    # missing, a chunk length is not hexadecimal, or the request/response as a
    # whole is malformed.
    MALFORMED_HTTP_RECORD = enum.auto()


class WARCParsingIssueEvent(Event):
    """Event reporting a problem found while parsing a WARC file.

    Parameters:
        issue: the WARCParsingIssue member describing the kind of problem.
        message: optional human-readable detail; None when the issue kind
            needs no further explanation.

    The values are kept in private attributes and exposed through read-only
    properties, matching the other event classes in this file (e.g. the
    ``filename`` accessor backed by ``_filename``).
    """

    def __init__(self, issue, message = None):
        self._issue = issue
        self._message = message

    @property
    def issue(self):
        """The kind of parsing problem that was encountered."""
        return self._issue

    @property
    def message(self):
        """Optional detail text for the problem, or None."""
        return self._message


class EndOfFile(FileEvent):
    """File event marking the end of an input file.

    Adds nothing beyond FileEvent. VerifyMode.process_event uses it as the
    point at which failures recorded for the file are turned into a
    VerificationError.
    """


@contextlib.contextmanager
def open_warc(f):
if hasattr(f, 'read'):
@@ -169,6 +188,7 @@ def iter_warc(f):
pass
if len(buf) < warcContentLength + 4:
print('Error: truncated WARC', file = sys.stderr)
yield WARCParsingIssueEvent(WARCParsingIssue.TRUNCATED_FILE)
break
warcContent = buf[:warcContentLength]
buf = buf[warcContentLength + 4:]
@@ -210,7 +230,9 @@ def iter_warc(f):
try:
chunkLineEnd = httpBody.index(b'\r\n', pos)
except ValueError:
print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr)
message = 'could not find chunk line end in record {}'.format(recordID)
print('Error: {}, skipping'.format(message), file = sys.stderr)
yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message)
break
chunkLine = httpBody[pos:chunkLineEnd]
if b';' in chunkLine:
@@ -218,7 +240,9 @@ def iter_warc(f):
else:
chunkLength = chunkLine.strip()
if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'':
print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr)
message = 'malformed chunk length {!r} in record {}'.format(chunkLength, recordID)
print('Error: {}, skipping'.format(message), file = sys.stderr)
yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message)
break
chunkLength = int(chunkLength, base = 16)
if chunkLength == 0:
@@ -229,7 +253,9 @@ def iter_warc(f):
else:
yield HTTPBodyChunk(httpDecompressor.decompress(httpBody))
else:
print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr)
message = 'malformed HTTP request or response in record {}'.format(recordID)
print('Warning: {}, skipping'.format(message), file = sys.stderr)
yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message)
yield WARCBlockChunk(warcContent)
else:
yield WARCBlockChunk(warcContent)
@@ -267,6 +293,10 @@ class HexDigest(Digest):
return (digest if digest else self._digest).hex()


class VerificationError(Exception):
    """Raised when one or more errors were encountered while verifying a file."""


class VerifyMode(ProcessMode):
def __init__(self):
self._blockDigester = None
@@ -275,6 +305,7 @@ class VerifyMode(ProcessMode):
self._brokenPayloadDigester = None
self._recordedPayloadDigest = None
self._printedBrokenPayloadWarning = False
self._verificationFailed = False

def parse_digest(self, digest):
if not digest.startswith(b'sha1:'):
@@ -289,6 +320,7 @@ class VerifyMode(ProcessMode):
def process_event(self, event):
if type(event) is NewFile:
self._printedBrokenPayloadWarning = False
self._verificationFailed = False
elif type(event) is BeginOfRecord:
if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders):
self._blockDigester = hashlib.sha1()
@@ -315,10 +347,13 @@ class VerifyMode(ProcessMode):
elif type(event) is RawHTTPBodyChunk:
if self._brokenPayloadDigester:
self._brokenPayloadDigester.update(event.data)
elif type(event) is WARCParsingIssueEvent:
self._verificationFailed = True
elif type(event) is EndOfRecord:
if self._blockDigester and self._recordedBlockDigest:
if not self._recordedBlockDigest.equals(self._blockDigester.digest()):
print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr)
self._verificationFailed = True
if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit
if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()):
if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()):
@@ -327,6 +362,9 @@ class VerifyMode(ProcessMode):
self._printedBrokenPayloadWarning = True
else:
print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr)
self._verificationFailed = True
elif type(event) is EndOfFile and self._verificationFailed:
raise VerificationError('one or more errors encountered while verifying {}'.format(event.filename))


class DumpResponsesMode(ProcessMode):
@@ -546,6 +584,7 @@ def main():
f = sys.stdin.buffer
for event in iter_warc(f):
processor.process_event(event)
processor.process_event(EndOfFile(f))
except BrokenPipeError:
return



Loading…
Cancel
Save