The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

57 lines
1.8 KiB

  1. #!/usr/bin/env python3
  2. import io
  3. import os
  4. import struct
  5. import subprocess
  6. import sys
  7. import tempfile
  8. def get_dict(fp):
  9. magic = fp.read(4)
  10. assert magic == b'\x5D\x2A\x4D\x18', 'not a valid warc.zst with a custom dictionary'
  11. dictSize = fp.read(4)
  12. assert len(dictSize) == 4, 'missing dict size'
  13. dictSize = struct.unpack('<I', dictSize)[0]
  14. assert dictSize >= 4, 'dict too small'
  15. assert dictSize < 100 * 1024**2, 'dict too large'
  16. ds = []
  17. dlen = 0
  18. while dlen < dictSize:
  19. c = fp.read(dictSize - dlen)
  20. if c is None or c == b'': # EOF
  21. break
  22. ds.append(c)
  23. dlen += len(c)
  24. d = b''.join(ds)
  25. assert len(d) == dictSize, f'could not read dict fully: expected {dictSize}, got {len(d)}'
  26. assert d.startswith(b'\x28\xB5\x2F\xFD') or d.startswith(b'\x37\xA4\x30\xEC'), 'not a valid dict'
  27. if d.startswith(b'\x28\xB5\x2F\xFD'): # Compressed dict
  28. # Decompress with unzstd
  29. p = subprocess.Popen(['unzstd'], stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
  30. out, err = p.communicate(d)
  31. assert p.returncode == 0, f'unzstd exited non-zero: return code {p.returncode}, stderr: {err!r}'
  32. d = out
  33. #elif d.startswith(b'\x37\xA4\x30\xEC'): # Uncompressed dict, nothing to do
  34. return d
  35. if (len(sys.argv) != 2 and sys.stdin.isatty()) or sys.argv[1:2] == ['--help'] or sys.argv[1:2] == ['-h']:
  36. print('Usage: unzstd-warc [FILE]', file = sys.stderr)
  37. print('Decompresses FILE or stdin and writes its contents to stdout', file = sys.stderr)
  38. sys.exit(1)
  39. if len(sys.argv) == 2:
  40. with open(sys.argv[1], 'rb') as fp:
  41. d = get_dict(fp)
  42. else:
  43. d = get_dict(sys.stdin.buffer.raw)
  44. with tempfile.NamedTemporaryFile() as dfp:
  45. dfp.write(d)
  46. args = ['zstdcat', '-D', dfp.name]
  47. if len(sys.argv) == 2:
  48. args.append(sys.argv[1])
  49. pzstd = subprocess.Popen(args)
  50. pzstd.communicate()