The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

56 lignes
1.5 KiB

  1. #!/usr/bin/env python3
  2. import argparse
  3. import logging
  4. import zlib
  5. logger = logging.getLogger('warc-peek')
  6. def finditer(b, sub):
  7. pos = 0
  8. while True:
  9. pos = b.find(sub, pos)
  10. if pos < 0:
  11. break
  12. yield pos
  13. pos += 1
  14. def find_offsets(warcfile, offset, length):
  15. with open(warcfile, 'rb') as fp:
  16. fp.seek(offset)
  17. buffer = fp.read(length)
  18. logger.debug('Buffer length: {:d}'.format(len(buffer)))
  19. for pos in finditer(buffer, b'\x1f\x8b'):
  20. logger.debug('Trying relative offset {:d}'.format(pos))
  21. if pos > len(buffer) - 512: # 512 bytes might be a bit too much, but at least it ensures that the decompression will work.
  22. break
  23. try:
  24. dec = zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(buffer[pos:pos+512])
  25. except:
  26. continue
  27. logger.debug('First 100 bytes of decompressed data: {!r}'.format(dec[:100]))
  28. if dec.startswith(b'WARC/1.0\r\n'):
  29. yield offset + pos
  30. if __name__ == '__main__':
  31. parser = argparse.ArgumentParser()
  32. parser.add_argument('--debug', action = 'store_true', help = 'Enable debug output')
  33. parser.add_argument('warcfile', help = 'A .warc.gz file')
  34. parser.add_argument('offset', type = int, help = 'Zero-based byte offset of the window')
  35. parser.add_argument('length', type = int, help = 'Length in bytes of the window')
  36. args = parser.parse_args()
  37. if args.debug:
  38. logging.basicConfig(
  39. format = '{asctime} {levelname} {name} {message}',
  40. style = '{',
  41. level = logging.DEBUG,
  42. )
  43. for offset in find_offsets(args.warcfile, args.offset, args.length):
  44. print(offset)