diff --git a/warc-peek.py b/warc-peek.py index 38af7ee..3a14202 100755 --- a/warc-peek.py +++ b/warc-peek.py @@ -1,8 +1,13 @@ -import os -import sys +#!/usr/bin/env python3 + +import argparse +import logging import zlib +logger = logging.getLogger('warc-peek') + + def finditer(b, sub): pos = 0 while True: @@ -13,19 +18,38 @@ def finditer(b, sub): pos += 1 -with open(sys.argv[1], 'rb') as fp: - fp.seek(int(sys.argv[2]), os.SEEK_SET) - buffer = fp.read(int(sys.argv[3])) - -#print('Buffer length', len(buffer)) -for pos in finditer(buffer, b'\x1f\x8b'): - #print('Trying', pos) - if pos > len(buffer) - 512: # 512 bytes might be a bit too much, but at least it ensures that the decompression will work. - break - try: - dec = zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(buffer[pos:pos+512]) - except: - continue - #print(repr(dec)) - if dec.startswith(b'WARC/1.0\r\n'): - print(int(sys.argv[2]) + pos) +def find_offsets(warcfile, offset, length): + with open(warcfile, 'rb') as fp: + fp.seek(offset) + buffer = fp.read(length) + + logger.debug('Buffer length: {:d}'.format(len(buffer))) + for pos in finditer(buffer, b'\x1f\x8b'): + logger.debug('Trying relative offset {:d}'.format(pos)) + if pos > len(buffer) - 512: # 512 bytes might be a bit too much, but at least it ensures that the decompression will work. + break + try: + dec = zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(buffer[pos:pos+512]) + except: + continue + logger.debug('First 100 bytes of decompressed data: {!r}'.format(dec[:100])) + if dec.startswith(b'WARC/1.0\r\n'): + yield offset + pos + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--debug', action = 'store_true', help = 'Enable debug output') + parser.add_argument('warcfile', help = 'A .warc.gz file') + parser.add_argument('offset', type = int, help = 'Zero-based byte offset of the window') + parser.add_argument('length', type = int, help = 'Length in bytes of the window') + args = parser.parse_args() + + if args.debug: + logging.basicConfig( + format = '{asctime} {levelname} {name} {message}', + style = '{', + level = logging.DEBUG, + ) + for offset in find_offsets(args.warcfile, args.offset, args.length): + print(offset)