Browse Source

Add support for reading warc.zst from stdin

master
JustAnotherArchivist 2 years ago
parent
commit
f1fcfabafa
1 changed file with 18 additions and 8 deletions
  1. +18
    -8
      zstdwarccat

+ 18
- 8
zstdwarccat View File

@@ -7,13 +7,7 @@ import sys
import tempfile


if len(sys.argv) != 2 or sys.argv[1] == '--help' or sys.argv[1] == '-h':
print('Usage: unzstd-warc FILE', file = sys.stderr)
print('Decompresses FILE and writes its contents to stdout', file = sys.stderr)
sys.exit(1)


with open(sys.argv[1], 'rb') as fp:
def get_dict(fp):
magic = fp.read(4)
assert magic == b'\x5D\x2A\x4D\x18', 'not a valid warc.zst with a custom dictionary'
dictSize = fp.read(4)
@@ -30,8 +24,24 @@ with open(sys.argv[1], 'rb') as fp:
assert p.returncode == 0, f'unzstd exited non-zero: return code {p.returncode}, stderr: {p.stderr!r}'
d = out
#elif d.startswith(b'\x37\xA4\x30\xEC'): # Uncompressed dict, nothing to do
return d


if (len(sys.argv) != 2 and sys.stdin.isatty()) or sys.argv[1:2] == ['--help'] or sys.argv[1:2] == ['-h']:
print('Usage: unzstd-warc [FILE]', file = sys.stderr)
print('Decompresses FILE or stdin and writes its contents to stdout', file = sys.stderr)
sys.exit(1)


if len(sys.argv) == 2:
with open(sys.argv[1], 'rb') as fp:
d = get_dict(fp)
else:
d = get_dict(sys.stdin.buffer.raw)
with tempfile.NamedTemporaryFile() as dfp:
dfp.write(d)
pzstd = subprocess.Popen(['zstdcat', '-D', dfp.name, sys.argv[1]])
args = ['zstdcat', '-D', dfp.name]
if len(sys.argv) == 2:
args.append(sys.argv[1])
pzstd = subprocess.Popen(args)
pzstd.communicate()

Loading…
Cancel
Save