The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

48 lines
1.5 KiB

  1. #!/usr/bin/env python3
  2. import io
  3. import os
  4. import struct
  5. import subprocess
  6. import sys
  7. import tempfile
  8. def get_dict(fp):
  9. magic = fp.read(4)
  10. assert magic == b'\x5D\x2A\x4D\x18', 'not a valid warc.zst with a custom dictionary'
  11. dictSize = fp.read(4)
  12. assert len(dictSize) == 4, 'missing dict size'
  13. dictSize = struct.unpack('<I', dictSize)[0]
  14. assert dictSize >= 4, 'dict too small'
  15. assert dictSize < 100 * 1024**2, 'dict too large'
  16. d = fp.read(dictSize)
  17. assert d.startswith(b'\x28\xB5\x2F\xFD') or d.startswith(b'\x37\xA4\x30\xEC'), 'not a valid dict'
  18. if d.startswith(b'\x28\xB5\x2F\xFD'): # Compressed dict
  19. # Decompress with unzstd
  20. p = subprocess.Popen(['unzstd'], stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
  21. out, err = p.communicate(d)
  22. assert p.returncode == 0, f'unzstd exited non-zero: return code {p.returncode}, stderr: {p.stderr!r}'
  23. d = out
  24. #elif d.startswith(b'\x37\xA4\x30\xEC'): # Uncompressed dict, nothing to do
  25. return d
  26. if (len(sys.argv) != 2 and sys.stdin.isatty()) or sys.argv[1:2] == ['--help'] or sys.argv[1:2] == ['-h']:
  27. print('Usage: unzstd-warc [FILE]', file = sys.stderr)
  28. print('Decompresses FILE or stdin and writes its contents to stdout', file = sys.stderr)
  29. sys.exit(1)
  30. if len(sys.argv) == 2:
  31. with open(sys.argv[1], 'rb') as fp:
  32. d = get_dict(fp)
  33. else:
  34. d = get_dict(sys.stdin.buffer.raw)
  35. with tempfile.NamedTemporaryFile() as dfp:
  36. dfp.write(d)
  37. args = ['zstdcat', '-D', dfp.name]
  38. if len(sys.argv) == 2:
  39. args.append(sys.argv[1])
  40. pzstd = subprocess.Popen(args)
  41. pzstd.communicate()