From 828dae25975bd7c82dae8382d4550be765479a0a Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 11 Jul 2023 10:05:11 +0000 Subject: [PATCH] Raise an error when verification fails --- warc-tiny | 47 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/warc-tiny b/warc-tiny index 22b7cd7..6ae34c3 100755 --- a/warc-tiny +++ b/warc-tiny @@ -51,7 +51,7 @@ class Event: pass -class NewFile(Event): +class FileEvent(Event): def __init__(self, filename): self._filename = filename @@ -60,6 +60,10 @@ class NewFile(Event): return self._filename +class NewFile(FileEvent): + pass + + class BeginOfRecord(Event): def __init__(self, warcHeaders, rawData): self._warcHeaders = warcHeaders @@ -123,6 +127,21 @@ class EndOfRecord(Event): pass +class WARCParsingIssue(enum.Enum): + TRUNCATED_FILE = enum.auto() + MALFORMED_HTTP_RECORD = enum.auto() + + +class WARCParsingIssueEvent(Event): + def __init__(self, issue, message = None): + self.issue = issue + self.message = message + + +class EndOfFile(FileEvent): + pass + + @contextlib.contextmanager def open_warc(f): if hasattr(f, 'read'): @@ -169,6 +188,7 @@ def iter_warc(f): pass if len(buf) < warcContentLength + 4: print('Error: truncated WARC', file = sys.stderr) + yield WARCParsingIssueEvent(WARCParsingIssue.TRUNCATED_FILE) break warcContent = buf[:warcContentLength] buf = buf[warcContentLength + 4:] @@ -210,7 +230,9 @@ def iter_warc(f): try: chunkLineEnd = httpBody.index(b'\r\n', pos) except ValueError: - print('Error: could not find chunk line end in record {}, skipping'.format(recordID), file = sys.stderr) + message = 'could not find chunk line end in record {}'.format(recordID) + print('Error: {}, skipping'.format(message), file = sys.stderr) + yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message) break chunkLine = httpBody[pos:chunkLineEnd] if b';' in chunkLine: @@ -218,7 +240,9 @@ def iter_warc(f): else: chunkLength = chunkLine.strip() if chunkLength.lstrip(b'0123456789abcdefABCDEF') != b'': - print('Error: malformed chunk length {!r} in record {}, skipping'.format(chunkLength, recordID), file = sys.stderr) + message = 'malformed chunk length {!r} in record {}'.format(chunkLength, recordID) + print('Error: {}, skipping'.format(message), file = sys.stderr) + yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message) break chunkLength = int(chunkLength, base = 16) if chunkLength == 0: @@ -229,7 +253,9 @@ def iter_warc(f): else: yield HTTPBodyChunk(httpDecompressor.decompress(httpBody)) else: - print('Warning: malformed HTTP request or response in record {}, skipping'.format(recordID), file = sys.stderr) + message = 'malformed HTTP request or response in record {}'.format(recordID) + print('Warning: {}, skipping'.format(message), file = sys.stderr) + yield WARCParsingIssueEvent(WARCParsingIssue.MALFORMED_HTTP_RECORD, message) yield WARCBlockChunk(warcContent) else: yield WARCBlockChunk(warcContent) @@ -267,6 +293,10 @@ class HexDigest(Digest): return (digest if digest else self._digest).hex() +class VerificationError(Exception): + pass + + class VerifyMode(ProcessMode): def __init__(self): self._blockDigester = None @@ -275,6 +305,7 @@ class VerifyMode(ProcessMode): self._brokenPayloadDigester = None self._recordedPayloadDigest = None self._printedBrokenPayloadWarning = False + self._verificationFailed = False def parse_digest(self, digest): if not digest.startswith(b'sha1:'): @@ -289,6 +320,7 @@ class VerifyMode(ProcessMode): def process_event(self, event): if type(event) is NewFile: self._printedBrokenPayloadWarning = False + self._verificationFailed = False elif type(event) is BeginOfRecord: if any(x[0] == b'WARC-Block-Digest' for x in event.warcHeaders): self._blockDigester = hashlib.sha1() @@ -315,10 +347,13 @@ class VerifyMode(ProcessMode): elif type(event) is RawHTTPBodyChunk: if self._brokenPayloadDigester: self._brokenPayloadDigester.update(event.data) + elif type(event) is WARCParsingIssueEvent: + self._verificationFailed = True elif type(event) is EndOfRecord: if self._blockDigester and self._recordedBlockDigest: if not self._recordedBlockDigest.equals(self._blockDigester.digest()): print('Block digest mismatch for record {}: recorded {} v calculated {}'.format(self._recordID, self._recordedBlockDigest.format(), self._recordedBlockDigest.format(self._blockDigester.digest())), file = sys.stderr) + self._verificationFailed = True if self._payloadDigester and self._recordType in (b'request', b'response'): #TODO: Support revisit if not self._recordedPayloadDigest.equals(self._payloadDigester.digest()): if self._recordedPayloadDigest.equals(self._brokenPayloadDigester.digest()): @@ -327,6 +362,9 @@ class VerifyMode(ProcessMode): self._printedBrokenPayloadWarning = True else: print('Payload digest mismatch for record {}: recorded {} vs. calculated {} (calculated broken {})'.format(self._recordID, self._recordedPayloadDigest.format(), self._recordedPayloadDigest.format(self._payloadDigester.digest()), self._recordedPayloadDigest.format(self._brokenPayloadDigester.digest())), file = sys.stderr) + self._verificationFailed = True + elif type(event) is EndOfFile and self._verificationFailed: + raise VerificationError('one or more errors encountered while verifying {}'.format(event.filename)) class DumpResponsesMode(ProcessMode): @@ -546,6 +584,7 @@ def main(): f = sys.stdin.buffer for event in iter_warc(f): processor.process_event(event) + processor.process_event(EndOfFile(f)) except BrokenPipeError: return