The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

38 lignes
1.3 KiB

  1. #!/usr/bin/env python3
  2. import io
  3. import os
  4. import struct
  5. import subprocess
  6. import sys
  7. import tempfile
  8. if len(sys.argv) != 2 or sys.argv[1] == '--help' or sys.argv[1] == '-h':
  9. print('Usage: unzstd-warc FILE', file = sys.stderr)
  10. print('Decompresses FILE and writes its contents to stdout', file = sys.stderr)
  11. sys.exit(1)
  12. with open(sys.argv[1], 'rb') as fp:
  13. magic = fp.read(4)
  14. assert magic == b'\x5D\x2A\x4D\x18', 'not a valid warc.zst with a custom dictionary'
  15. dictSize = fp.read(4)
  16. assert len(dictSize) == 4, 'missing dict size'
  17. dictSize = struct.unpack('<I', dictSize)[0]
  18. assert dictSize >= 4, 'dict too small'
  19. assert dictSize < 100 * 1024**2, 'dict too large'
  20. d = fp.read(dictSize)
  21. assert d.startswith(b'\x28\xB5\x2F\xFD') or d.startswith(b'\x37\xA4\x30\xEC'), 'not a valid dict'
  22. if d.startswith(b'\x28\xB5\x2F\xFD'): # Compressed dict
  23. # Decompress with unzstd
  24. p = subprocess.Popen(['unzstd'], stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
  25. out, err = p.communicate(d)
  26. assert p.returncode == 0, f'unzstd exited non-zero: return code {p.returncode}, stderr: {p.stderr!r}'
  27. d = out
  28. #elif d.startswith(b'\x37\xA4\x30\xEC'): # Uncompressed dict, nothing to do
  29. with tempfile.NamedTemporaryFile() as dfp:
  30. dfp.write(d)
  31. pzstd = subprocess.Popen(['zstdcat', '-D', dfp.name, sys.argv[1]])
  32. pzstd.communicate()