The little things give you away... A collection of various small helper stuff
Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

56 строк
1.5 KiB

  1. #!/usr/bin/env python3
  2. import argparse
  3. import logging
  4. import zlib
  5. logger = logging.getLogger('warc-peek')
  6. def finditer(b, sub):
  7. pos = 0
  8. while True:
  9. pos = b.find(sub, pos)
  10. if pos < 0:
  11. break
  12. yield pos
  13. pos += 1
  14. def find_offsets(warcfile, offset, length):
  15. with open(warcfile, 'rb') as fp:
  16. fp.seek(offset)
  17. buffer = fp.read(length)
  18. logger.debug('Buffer length: {:d}'.format(len(buffer)))
  19. for pos in finditer(buffer, b'\x1f\x8b'):
  20. logger.debug('Trying relative offset {:d}'.format(pos))
  21. if pos > len(buffer) - 512: # 512 bytes might be a bit too much, but at least it ensures that the decompression will work.
  22. break
  23. try:
  24. dec = zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(buffer[pos:pos+512])
  25. except:
  26. continue
  27. logger.debug('First 100 bytes of decompressed data: {!r}'.format(dec[:100]))
  28. if dec.startswith(b'WARC/1.0\r\n'):
  29. yield offset + pos
  30. if __name__ == '__main__':
  31. parser = argparse.ArgumentParser()
  32. parser.add_argument('--debug', action = 'store_true', help = 'Enable debug output')
  33. parser.add_argument('warcfile', help = 'A .warc.gz file')
  34. parser.add_argument('offset', type = int, help = 'Zero-based byte offset of the window')
  35. parser.add_argument('length', type = int, help = 'Length in bytes of the window')
  36. args = parser.parse_args()
  37. if args.debug:
  38. logging.basicConfig(
  39. format = '{asctime} {levelname} {name} {message}',
  40. style = '{',
  41. level = logging.DEBUG,
  42. )
  43. for offset in find_offsets(args.warcfile, args.offset, args.length):
  44. print(offset)