Browse Source

Add zstdwarccat

master
JustAnotherArchivist 2 years ago
parent
commit
d5f646f995
1 changed files with 37 additions and 0 deletions
  1. +37
    -0
      zstdwarccat

+ 37
- 0
zstdwarccat View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
import io
import os
import struct
import subprocess
import sys
import tempfile


# Magic of the leading frame that carries the custom dictionary in a .warc.zst
# file; it is followed by a 4-byte little-endian length and the dictionary.
_WARC_ZST_DICT_FRAME_MAGIC = b'\x5D\x2A\x4D\x18'
# The embedded dictionary is itself a zstd frame (i.e. stored compressed).
_ZSTD_FRAME_MAGIC = b'\x28\xB5\x2F\xFD'
# The embedded dictionary is a raw zstd dictionary (stored uncompressed).
_ZSTD_DICT_MAGIC = b'\x37\xA4\x30\xEC'
# Sanity limit on the declared dictionary size (exclusive upper bound).
_MAX_DICT_SIZE = 100 * 1024**2


def _read_embedded_dictionary(path):
    """Return the dictionary bytes embedded at the start of the file *path*.

    The returned bytes are exactly as stored in the file, i.e. they may still
    be zstd-compressed.

    Raises:
        ValueError: if the file does not start with the expected magic, the
            size field is missing or out of range, the dictionary is
            truncated, or the dictionary has an unknown magic.
        OSError: if the file cannot be opened or read.
    """
    with open(path, 'rb') as fp:
        if fp.read(4) != _WARC_ZST_DICT_FRAME_MAGIC:
            raise ValueError('not a valid warc.zst with a custom dictionary')
        rawSize = fp.read(4)
        if len(rawSize) != 4:
            raise ValueError('missing dict size')
        (dictSize,) = struct.unpack('<I', rawSize)
        if dictSize < 4:
            raise ValueError('dict too small')
        if dictSize >= _MAX_DICT_SIZE:
            raise ValueError('dict too large')
        data = fp.read(dictSize)
    # A short read means the file ends inside the dictionary.
    if len(data) != dictSize:
        raise ValueError('truncated dict')
    if not data.startswith((_ZSTD_FRAME_MAGIC, _ZSTD_DICT_MAGIC)):
        raise ValueError('not a valid dict')
    return data


def _decompress_dictionary(data):
    """Return *data* as a raw dictionary, decompressing it with unzstd if needed.

    Raises:
        RuntimeError: if unzstd exits with a non-zero status.
        OSError: if unzstd cannot be executed.
    """
    if not data.startswith(_ZSTD_FRAME_MAGIC):
        # Uncompressed dict, nothing to do
        return data
    result = subprocess.run(['unzstd'], input = data, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    if result.returncode != 0:
        # Report the captured stderr bytes, not the pipe object.
        raise RuntimeError(f'unzstd exited non-zero: return code {result.returncode}, stderr: {result.stderr!r}')
    return result.stdout


def main(argv):
    """Decompress the .warc.zst file named in *argv* to stdout.

    *argv* is the full argument vector including the program name. Returns
    the process exit status: 1 on usage or validation errors, otherwise the
    exit status of zstdcat.
    """
    if len(argv) != 2 or argv[1] in ('--help', '-h'):
        print('Usage: zstdwarccat FILE', file = sys.stderr)
        print('Decompresses FILE and writes its contents to stdout', file = sys.stderr)
        return 1

    path = argv[1]
    try:
        dictionary = _decompress_dictionary(_read_embedded_dictionary(path))
    except (ValueError, RuntimeError) as e:
        print(f'zstdwarccat: {e}', file = sys.stderr)
        return 1

    with tempfile.NamedTemporaryFile() as dfp:
        dfp.write(dictionary)
        # zstdcat opens the file by name, so the buffered bytes must be on
        # disk before it starts.
        dfp.flush()
        # '--' keeps a FILE beginning with '-' from being parsed as an option.
        return subprocess.run(['zstdcat', '-D', dfp.name, '--', path]).returncode


if __name__ == '__main__':
    sys.exit(main(sys.argv))

Loading…
Cancel
Save