The little things give you away... A collection of various small helper stuff
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 

61 行
2.0 KiB

  1. #!/usr/bin/env python3
  2. import io
  3. import os
  4. import struct
  5. import subprocess
  6. import sys
  7. import tempfile
  8. def get_dict(fp):
  9. magic = fp.read(4)
  10. assert magic == b'\x5D\x2A\x4D\x18', 'not a valid warc.zst with a custom dictionary'
  11. dictSize = fp.read(4)
  12. assert len(dictSize) == 4, 'missing dict size'
  13. dictSize = struct.unpack('<I', dictSize)[0]
  14. assert dictSize >= 4, 'dict too small'
  15. assert dictSize < 100 * 1024**2, 'dict too large'
  16. ds = []
  17. dlen = 0
  18. while dlen < dictSize:
  19. c = fp.read(dictSize - dlen)
  20. if c is None or c == b'': # EOF
  21. break
  22. ds.append(c)
  23. dlen += len(c)
  24. d = b''.join(ds)
  25. assert len(d) == dictSize, f'could not read dict fully: expected {dictSize}, got {len(d)}'
  26. assert d.startswith(b'\x28\xB5\x2F\xFD') or d.startswith(b'\x37\xA4\x30\xEC'), 'not a valid dict'
  27. if d.startswith(b'\x28\xB5\x2F\xFD'): # Compressed dict
  28. # Decompress with unzstd
  29. p = subprocess.Popen(['unzstd'], stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
  30. out, err = p.communicate(d)
  31. assert p.returncode == 0, f'unzstd exited non-zero: return code {p.returncode}, stderr: {err!r}'
  32. d = out
  33. #elif d.startswith(b'\x37\xA4\x30\xEC'): # Uncompressed dict, nothing to do
  34. return d
  35. if (len(sys.argv) != 2 and sys.stdin.isatty()) or sys.argv[1:2] == ['--help'] or sys.argv[1:2] == ['-h']:
  36. print('Usage: unzstd-warc [FILE]', file = sys.stderr)
  37. print('Decompresses FILE or stdin and writes its contents to stdout', file = sys.stderr)
  38. sys.exit(1)
  39. if len(sys.argv) == 2:
  40. with open(sys.argv[1], 'rb') as fp:
  41. d = get_dict(fp)
  42. else:
  43. d = get_dict(sys.stdin.buffer.raw)
  44. # The file must be written to the file system before zstdcat is executed. The most reliable way for that is to close the file. This requires manually deleting it at the end.
  45. with tempfile.NamedTemporaryFile(delete = False) as dfp:
  46. dfp.write(d)
  47. try:
  48. args = ['zstdcat', '-D', dfp.name]
  49. if len(sys.argv) == 2:
  50. args.append(sys.argv[1])
  51. pzstd = subprocess.Popen(args)
  52. pzstd.communicate()
  53. finally:
  54. os.remove(dfp.name)