|
|
@@ -0,0 +1,37 @@ |
|
|
|
#!/usr/bin/env python3 |
|
|
|
import io |
|
|
|
import os |
|
|
|
import struct |
|
|
|
import subprocess |
|
|
|
import sys |
|
|
|
import tempfile |
|
|
|
|
|
|
|
|
|
|
|
# Proceed only when exactly one argument was given and it is not a help flag;
# in every other case print the usage text to stderr and exit with status 1.
if not (len(sys.argv) == 2 and sys.argv[1] not in ('--help', '-h')):
    print(
        'Usage: unzstd-warc FILE',
        'Decompresses FILE and writes its contents to stdout',
        sep='\n',
        file=sys.stderr,
    )
    sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
# Extract the zstd dictionary stored at the start of the input file.
# Layout checked below: a 4-byte magic (5D 2A 4D 18), a little-endian uint32
# dictionary size, then that many bytes of dictionary data.
# Validation failures end the script with the message on stderr and exit
# status 1. Explicit checks are used instead of `assert` so that they still
# run when Python is started with -O.
with open(sys.argv[1], 'rb') as fp:
    magic = fp.read(4)
    if magic != b'\x5D\x2A\x4D\x18':
        sys.exit('not a valid warc.zst with a custom dictionary')
    dictSizeBytes = fp.read(4)
    if len(dictSizeBytes) != 4:
        sys.exit('missing dict size')
    dictSize = struct.unpack('<I', dictSizeBytes)[0]
    if dictSize < 4:
        sys.exit('dict too small')
    if dictSize >= 100 * 1024**2:
        sys.exit('dict too large')
    d = fp.read(dictSize)

# A short read means the file ends before the declared dictionary size.
if len(d) != dictSize:
    sys.exit(f'truncated dict: expected {dictSize} bytes, got {len(d)}')
if not d.startswith((b'\x28\xB5\x2F\xFD', b'\x37\xA4\x30\xEC')):
    sys.exit('not a valid dict')

if d.startswith(b'\x28\xB5\x2F\xFD'):  # Compressed dict
    # Decompress with unzstd: dictionary bytes go in on stdin, the
    # decompressed dictionary comes back on stdout.
    p = subprocess.run(
        ['unzstd'],
        input=d,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if p.returncode != 0:
        # p.stderr holds the captured stderr bytes of the finished process.
        sys.exit(f'unzstd exited non-zero: return code {p.returncode}, stderr: {p.stderr!r}')
    d = p.stdout
# Otherwise the dict starts with 37 A4 30 EC: uncompressed, nothing to do.
|
|
|
|
|
|
|
# Write the dictionary to a temporary file so zstdcat can load it with -D,
# then let zstdcat decompress the input file; its stdout is inherited, so the
# decompressed data goes straight to this script's stdout.
with tempfile.NamedTemporaryFile() as dfp:
    dfp.write(d)
    # zstdcat opens the file by name from another process, so any bytes still
    # held in Python's write buffer must reach the file before it starts.
    dfp.flush()
    pzstd = subprocess.run(['zstdcat', '-D', dfp.name, sys.argv[1]])

# Report a zstdcat failure through this script's own exit status; done after
# the `with` block so the temporary file has already been removed.
if pzstd.returncode != 0:
    sys.exit(pzstd.returncode)