From f1fcfabafadfb1d4732c17d0e509ecec365c69b8 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 27 Jul 2021 00:56:46 +0000 Subject: [PATCH] Add support for reading warc.zst from stdin --- zstdwarccat | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/zstdwarccat b/zstdwarccat index 7ed521e..20a358f 100755 --- a/zstdwarccat +++ b/zstdwarccat @@ -7,13 +7,7 @@ import sys import tempfile -if len(sys.argv) != 2 or sys.argv[1] == '--help' or sys.argv[1] == '-h': - print('Usage: unzstd-warc FILE', file = sys.stderr) - print('Decompresses FILE and writes its contents to stdout', file = sys.stderr) - sys.exit(1) - - -with open(sys.argv[1], 'rb') as fp: +def get_dict(fp): magic = fp.read(4) assert magic == b'\x5D\x2A\x4D\x18', 'not a valid warc.zst with a custom dictionary' dictSize = fp.read(4) @@ -30,8 +24,24 @@ with open(sys.argv[1], 'rb') as fp: assert p.returncode == 0, f'unzstd exited non-zero: return code {p.returncode}, stderr: {p.stderr!r}' d = out #elif d.startswith(b'\x37\xA4\x30\xEC'): # Uncompressed dict, nothing to do + return d + + +if (len(sys.argv) != 2 and sys.stdin.isatty()) or sys.argv[1:2] == ['--help'] or sys.argv[1:2] == ['-h']: + print('Usage: unzstd-warc [FILE]', file = sys.stderr) + print('Decompresses FILE or stdin and writes its contents to stdout', file = sys.stderr) + sys.exit(1) + +if len(sys.argv) == 2: + with open(sys.argv[1], 'rb') as fp: + d = get_dict(fp) +else: + d = get_dict(sys.stdin.buffer.raw) with tempfile.NamedTemporaryFile() as dfp: dfp.write(d) - pzstd = subprocess.Popen(['zstdcat', '-D', dfp.name, sys.argv[1]]) + args = ['zstdcat', '-D', dfp.name] + if len(sys.argv) == 2: + args.append(sys.argv[1]) + pzstd = subprocess.Popen(args) pzstd.communicate()