|
- #!/usr/bin/env bash
- shopt -s extglob globstar nullglob
-
- if [[ ! -e all_start_urls || ! -e wpull.log ]]; then
- printf 'Error: cwd does not appear to be a grab-site grab dir\n' >&2
- exit 1
- fi
-
- # Check whether this has finished
- if ! grep -q 'wpull.application.tasks.stats - INFO - FINISHED.' wpull.log && ! grep -q 'wpull.application.app - ERROR - HookStop: Script requested immediate stop.' wpull.log; then
- printf 'Error: not finished according to wpull.log\n' >&2
- exit 1
- fi
- # Refuse to continue while anything matched by ./* is reported as open by lsof
- # Without lsof the command substitution below would be empty and this safety check would silently pass, so require the tool explicitly
- if ! command -v lsof >/dev/null; then
- printf 'Error: lsof is required but was not found\n' >&2
- exit 1
- fi
- # Although lsof without an argument would be bad, nullglob doesn't need to be disabled here because at least two files must exist as checked above.
- if [[ -n "$(lsof ./*)" ]]; then
- printf 'Error: at least one file in this directory is opened by another process (is grab-site not finished yet?)\n' >&2
- exit 1
- fi
- # temp/ must not contain anything (with nullglob the array is empty when nothing matches)
- temp_entries=(temp/*)
- if [[ "${#temp_entries[@]}" -ne 0 ]]; then
- printf 'Error: temp/ is not empty\n' >&2
- exit 1
- fi
-
- # grab-site should not have produced any .zst files; verify recursively anyway (the packing step below leaves top-level .zst files alone either way)
- zst_found=(**/*.zst)
- ((${#zst_found[@]} == 0)) || {
- printf 'Error: .zst files found\n' >&2
- exit 1
- }
-
- # Derive tar name from WARC: exactly one *-meta.warc.gz must exist, and its name minus that suffix becomes the tarball prefix
- meta_warcs=(*-meta.warc.gz)
- if ((${#meta_warcs[@]} != 1)); then
- printf 'Error: could not find exactly one *-meta.warc.gz file\n' >&2
- exit 1
- fi
- warc_prefix="${meta_warcs[0]%-meta.warc.gz}"
- tarfile="${warc_prefix}-grab-site-files.tar.zst"
-
- # Merge WAL
- sqlite3 wpull.db 'PRAGMA wal_checkpoint'
-
- # tar all but the WARC and rm
- # The extglob pattern selects every top-level entry except *.warc.gz and *.zst; the same list is archived and then deleted.
- fs=(!(@(*.warc.gz|*.zst)))
- if (("${#fs[@]}" == 0)); then
- printf 'Nothing to pack\n' >&2
- exit 0
- fi
- if ! tar -c -f "${tarfile}" --zstd --numeric-owner -- "${fs[@]}"; then
- printf 'Error: tar exited non-zero\n' >&2
- # Do not leave a partial archive behind: it would trip the .zst check on the next run. The originals are untouched at this point.
- rm -f -- "${tarfile}"
- exit 1
- fi
- # Only reached after tar succeeded, so everything deleted here is in the tarball
- rm -r -- "${fs[@]}"
|