The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

53 lines
1.6 KiB

  #!/usr/bin/env bash
  # Pack the auxiliary files of a finished grab-site grab into one
  # zstd-compressed tarball and remove the packed originals.
  #
  # Usage: run without arguments from inside the grab directory.
  #
  # Every non-hidden entry of the current directory except *.warc.gz and *.zst
  # is archived into <name>-grab-site-files.tar.zst, where <name> comes from the
  # single <name>-meta.warc.gz file. The packed entries are deleted only after
  # tar has exited successfully.
  #
  # Exit status: 1 if any sanity check, the WAL checkpoint or tar fails;
  # 0 if there is nothing to pack; otherwise the status of the final rm.
  shopt -s extglob globstar nullglob

  # Print "Error: <message>" to stderr and abort the script with status 1.
  die() {
    printf 'Error: %s\n' "$1" >&2
    exit 1
  }

  if [[ ! -e all_start_urls || ! -e wpull.log ]]; then
    die 'cwd does not appear to be a grab-site grab dir'
  fi

  # Check whether this has finished: either marker in wpull.log is enough.
  # -F matches the markers as fixed strings so that '.' is not a wildcard.
  if ! grep -qF \
    -e 'wpull.application.tasks.stats - INFO - FINISHED.' \
    -e 'wpull.application.app - ERROR - HookStop: Script requested immediate stop.' \
    wpull.log; then
    die 'not finished according to wpull.log'
  fi

  # Although lsof without an argument would be bad, nullglob doesn't need to be
  # disabled here because at least two files must exist as checked above.
  if [[ "$(lsof ./*)" ]]; then
    die 'at least one file in this directory is opened by another process (is grab-site not finished yet?)'
  fi

  fs=(temp/*)
  if ((${#fs[@]} > 0)); then
    die 'temp/ is not empty'
  fi

  # There can't be any .zst files from grab-site, but verify to make sure
  # (although the below wouldn't remove them). This also guarantees that the
  # tarball created below does not exist yet.
  fs=(**/*.zst)
  if ((${#fs[@]} > 0)); then
    die '.zst files found'
  fi

  # Derive tar name from WARC
  fs=(*-meta.warc.gz)
  if ((${#fs[@]} != 1)); then
    die 'could not find exactly one *-meta.warc.gz file'
  fi
  tarfile="${fs[0]%-meta.warc.gz}-grab-site-files.tar.zst"

  # Merge WAL. Stop here if sqlite3 reports a failure instead of going on to
  # pack and delete the database files.
  if ! sqlite3 wpull.db 'PRAGMA wal_checkpoint'; then
    die 'sqlite3 WAL checkpoint of wpull.db failed'
  fi

  # tar all but the WARC and rm
  fs=(!(@(*.warc.gz|*.zst)))
  if ((${#fs[@]} == 0)); then
    printf 'Nothing to pack\n' >&2
    exit 0
  fi
  if ! tar -c -f "${tarfile}" --zstd --numeric-owner -- "${fs[@]}"; then
    # Drop the incomplete archive; otherwise the .zst guard above would make
    # every re-run fail until it is removed by hand.
    rm -f -- "${tarfile}"
    die 'tar exited non-zero'
  fi
  rm -r -- "${fs[@]}"