From 820384fe1ed74e6b9ec59d07496612aad30d8e89 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Wed, 11 Dec 2019 00:53:45 +0000
Subject: [PATCH] Stop deduping small responses

For small responses, the additional headers for the revisit outweigh the
payload truncation savings. The chosen limit of 100 bytes is completely
arbitrary and not backed by any real-world data.
---
 qwarc/warc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qwarc/warc.py b/qwarc/warc.py
index b247b41..8e71ee8 100644
--- a/qwarc/warc.py
+++ b/qwarc/warc.py
@@ -132,7 +132,7 @@ class WARC:
 )
 payloadDigest = responseRecord.rec_headers.get_header('WARC-Payload-Digest')
 assert payloadDigest is not None
-if self._dedupe and responseRecord.payload_length > 0: # Don't "deduplicate" empty responses
+if self._dedupe and responseRecord.payload_length > 100: # Don't deduplicate small responses; the additional headers are typically larger than the payload dedupe savings...
     if payloadDigest in self._dedupeMap:
         refersToRecordId, refersToUri, refersToDate = self._dedupeMap[payloadDigest]
         responseHttpHeaders = responseRecord.http_headers