|
- #!/usr/bin/env bash
- # The experimental-status warning goes to stderr so it never mixes with the help or example-config text on stdout.
- printf '\e[1;33mWARNING: This script is experimental and is not intended for productive use yet. Use at your own risk.\e[0m\n\n' >&2
-
- # Requires Bash 4.4+ (readarray -d is used further down); Bash 5.0+ is needed for sub-second timestamps (EPOCHREALTIME).
- # Help is printed to stdout. Explicitly asking for it is a successful run, so it exits 0, the same as --example-config.
- if [[ "$1" == '-h' || "$1" == '--help' ]]; then
- printf 'Usage: %q\n' "$0"
- printf '\n'
- printf 'Uploads files from each subdirectory of the PWD into an IA item using the dirname as the identifier; files whose name starts with the identifier are skipped.\n'
- printf 'Configuration happens through a subdir/.dir-to-ia.config file, a Bash script that gets sourced to get the config values.\n'
- printf "Use '%q --example-config' to get an example configuration with explanations.\n" "$0"
- printf 'The upload log for each item gets written to the subdir/.dir-to-ia.log file.\n'
- printf 'It is assumed that no other changes are made to the items while dir-to-ia is running.\n'
- printf 'Multiple instances can safely run in the same directory to upload multiple items in parallel. Uploads within an item are always sequential.\n'
- printf '\n'
- printf 'This program is dedicated to fireonlive.\n'
- exit 0
- fi
-
- # Default config values
- # These are assigned before any per-directory config is sourced, so a subdir/.dir-to-ia.config only needs to set what it wants to change.
- # The same values are interpolated into the --example-config output below.
- # All scalar options are the literal strings 'yes' or 'no'.
- sha256=no
- rm=no
- rmwait=yes
- sha1check=no
- clobber=no
- derive=no
- autosizehint=no
- # Extra arguments appended to the ia-upload-stream command line; must stay an array.
- iauploadstreamopts=()
-
- # --example-config: print a commented example configuration to stdout and exit successfully.
- # The heredoc delimiter is unquoted, so every ${...} and $(...) inside it is expanded when the example is printed:
- # the option lines show the current defaults, and the date in the metadata line is evaluated at generation time.
- # Everything between cat and EOF is program output, not script commentary.
- if [[ "$1" == '--example-config' ]]; then
- cat <<-EOF
- # Calculate SHA-256 hash of each file after uploading
- sha256=${sha256}
-
- # Remove local file after successful upload; when disabled, only files without a filename match in the IA item get uploaded.
- rm=${rm}
-
- # Delay removal until IA processed the upload
- rmwait=${rmwait}
-
- # Run an SHA-1 check against the local file after the upload using the hash calculated by IA; if rm and sha1check are enabled, rmwait must also be enabled.
- sha1check=${sha1check}
-
- # Clobber existing files in IA item (no = existing copy is moved to history/files/ by IA); only meaningful with rm=yes.
- clobber=${clobber}
-
- # Queue derive after upload (applied after every file!)
- derive=${derive}
-
- # Automatic size hint for item creation using the current directory size. When more data will be added, do not use this and set a value in iauploadstreamopts instead if a size hint is desired.
- autosizehint=${autosizehint}
-
- # Custom options for ia-upload-stream, as an array of args; this can be used to choose the part size and concurrency, for example, using (--part-size 1G --concurrency 4).
- iauploadstreamopts=()
-
- # Item metadata (array with 'key:value' elements); the only mandatory variable with no default
- metadata=('collection:opensource' "date:$(printf '%(%Y-%m-%d)T')")
- EOF
- exit
- fi
-
- function tsprintf {
- # Print one line to stdout, prefixed with a 'YYYY-MM-DD HH:MM:SS[.mmm]' timestamp.
- # With exactly one argument, that argument is printed verbatim (it is NOT treated as a format string).
- # With more arguments, $1 is a printf format and the rest are its arguments.
- # EPOCHREALTIME exists in Bash 5.0+ only; below that the fallback -1 makes %(...)T use the current time and the fraction is empty.
- local now="${EPOCHREALTIME:--1}" secs frac stamp
- # Split at the decimal separator, which may be '.' or ',', instead of at a fixed offset,
- # so this works for any number of digits in the seconds part.
- secs="${now%%[.,]*}"
- # Separator plus three digits (milliseconds); empty when there is no fractional part.
- frac="${now:${#secs}:4}"
- printf -v stamp '%(%Y-%m-%d %H:%M:%S)T%s' "${secs}" "${frac}"
- if [[ $# -eq 1 ]]; then
- printf "%s %s\n" "${stamp}" "$1"
- else
- # First argument is the format, rest are arguments to printf; pass through `while read` loop to have a better chance of line buffering.
- {
- printf "%s " "${stamp}"
- printf "$1" "${@:2}"
- printf '\n'
- } | while IFS= read -r line; do printf '%s\n' "${line}"; done
- fi
- }
-
- function maybets {
- # Stream filter: copy stdin to stdout line by line.
- # Lines that already begin with a 'YYYY-MM-DD HH:MM:SS' timestamp pass through unchanged; all others get one via tsprintf.
- local entry
- while IFS= read -r entry; do
- if [[ "${entry}" == [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\ [0-9][0-9]:[0-9][0-9]:[0-9][0-9]* ]]; then
- # Already stamped by whoever produced the line
- printf '%s\n' "${entry}"
- else
- # Single-argument form of tsprintf, so the line is never interpreted as a format string
- tsprintf "${entry}"
- fi
- done
- }
-
- # dbgprintf is chosen once at startup, depending on whether DEBUG is non-empty.
- if [[ -z "${DEBUG}" ]]; then
- # Debugging disabled: accept any arguments and do nothing.
- function dbgprintf {
- :
- }
- else
- # Debugging enabled: timestamped '[DEBUG] ...' line on stderr. $1 is a printf format when further arguments follow;
- # a lone argument is printed verbatim (tsprintf's single-argument behaviour).
- function dbgprintf {
- tsprintf "[DEBUG] $1" "${@:2}" >&2
- }
- fi
-
- # Physical directory this script lives in; the helper tools are expected next to it.
- # '&&' instead of ';' so a failed cd leaves scriptpath empty (and trips the check below) rather than silently resolving to the PWD.
- # '--' protects against a script path that starts with a dash.
- scriptpath="$(cd -- "$(dirname -- "$0")" && pwd -P)"
- # Refuse to start unless both helpers are present and executable.
- for cmd in ia-upload-stream ia-tasks; do
- if [[ ! -x "${scriptpath}/${cmd}" ]]; then
- printf 'Error: %q not found, make sure it is in the same directory as %q\n' "${cmd}" "$0" >&2
- exit 1
- fi
- done
-
- # Main loop: process every candidate subdirectory of the PWD once, sleep, and start over. Runs until killed.
- # Inside the per-directory subshell, the exit status decides what happens next:
- #   exit 0 = nothing fatal (e.g. IA not ready yet), the directory is simply revisited on the next pass;
- #   exit non-zero = the directory gets a .dir-to-ia.dirty marker and is skipped until that file is deleted by hand.
- while :; do
- # Prefilter the subdirs for ones that have actual files in them. This avoids cycling through all dirs when rm=yes.
- # Include dirs with .dirty file so the warning about those is always emitted.
- # The sed reduces each path to its first component; uniq collapses adjacent duplicates.
- readarray -d $'\0' -t candidateDirs < <(find -type f \( \! -name '.dir-to-ia.*' -o -name .dir-to-ia.dirty \) -print0 | sed -z 's,^\./,,; s,/.*$,,' | uniq -z)
- for dir in "${candidateDirs[@]}"; do
- dbgprintf 'Considering directory %q' "${dir}"
- identifier="${dir}"
- # Only names starting with an ASCII letter or digit are considered
- if [[ ! "${dir}" =~ ^[a-zA-Z0-9] ]]; then
- continue
- fi
- # No config file means the directory is not managed by dir-to-ia (this also drops plain files in the PWD)
- if [[ ! -e "${dir}/.dir-to-ia.config" ]]; then
- continue
- fi
- if [[ -e "${dir}/.dir-to-ia.dirty" ]]; then
- tsprintf "Warning: %q is dirty, skipping" "${dir}" >&2
- continue
- fi
-
- # Everything from here on is executed in a subshell so that the config sourcing can't affect other items.
- # It would be possible to do that without a subshell, but this is easier.
- {
- # FD 3 is the per-directory lock file (see the redirections at the closing brace). If another instance holds it, move on.
- flock --exclusive --nonblock 3 || continue
- (
- dbgprintf 'Processing %q' "${dir}"
-
- # Source and check configuration
- . "${dir}/.dir-to-ia.config" || { tsprintf 'Sourcing %q failed' "${dir}/.dir-to-ia.config" >&2; exit 1; }
- configbroken=
- # Every yes/no option is validated, including autosizehint, so a typo there is reported instead of silently meaning 'no'.
- for v in sha256 rm rmwait sha1check clobber derive autosizehint; do
- if [[ "${!v}" != 'yes' && "${!v}" != 'no' ]]; then
- tsprintf "Error in %q: %s must be 'yes' or 'no'" "${dir}/.dir-to-ia.config" "${v}" >&2
- configbroken=yes
- fi
- done
- # `declare -p` prints 'declare -<attrs> name=...'; the second word must contain 'a' for an indexed array.
- # If the variable is unset, declare -p prints nothing, attrs stays empty and the check fails as intended.
- if read -r _ attrs _ < <(declare -p iauploadstreamopts 2>/dev/null); [[ "${attrs}" != *a* ]]; then
- tsprintf 'Error in %q: iauploadstreamopts must be an array' "${dir}/.dir-to-ia.config" >&2
- configbroken=yes
- fi
- if read -r _ attrs _ < <(declare -p metadata 2>/dev/null); [[ "${attrs}" != *a* ]]; then
- tsprintf 'Error in %q: metadata missing or not an array' "${dir}/.dir-to-ia.config" >&2
- configbroken=yes
- else
- for f in "${metadata[@]}"; do
- if [[ "${f}" != *:* ]]; then
- tsprintf 'Error in %q: metadata field missing colon: %s' "${dir}/.dir-to-ia.config" "${f}" >&2
- configbroken=yes
- fi
- done
- fi
- # All individual problems have been reported above; bail out once.
- if [[ "${configbroken}" ]]; then
- exit 1
- fi
- if [[ "${rm}" == 'no' && "${clobber}" == 'yes' ]]; then
- tsprintf 'Error in %q: rm=no and clobber=yes is not permitted' "${dir}/.dir-to-ia.config" >&2
- exit 1
- fi
- if [[ "${rm}" == 'yes' && "${sha1check}" == 'yes' && "${rmwait}" == 'no' ]]; then
- tsprintf 'Error in %q: rm=yes + sha1check=yes is incompatible with rmwait=no' "${dir}/.dir-to-ia.config" >&2
- exit 1
- fi
- dbgprintf 'Configuration:'
- dbgprintf " sha256=${sha256}"
- dbgprintf " rm=${rm}"
- dbgprintf " rmwait=${rmwait}"
- dbgprintf " sha1check=${sha1check}"
- dbgprintf " clobber=${clobber}"
- dbgprintf " derive=${derive}"
- dbgprintf " autosizehint=${autosizehint}"
- dbgprintf " iauploadstreamopts=(${iauploadstreamopts[0]+"$(printf %q "${iauploadstreamopts[0]}")"}$(if [[ ${#iauploadstreamopts[@]} -gt 1 ]]; then printf ' %q' "${iauploadstreamopts[@]:1}"; fi))"
- dbgprintf " metadata=($(printf %q "${metadata[0]}")$(if [[ ${#metadata[@]} -gt 1 ]]; then printf ' %q' "${metadata[@]:1}"; fi))"
-
- dbgprintf 'Version: commit %q' "$(git -C "${scriptpath}" log --max-count=1 --format=%H)"
-
- # Check for previous batch that might require waiting for IA and/or deletion
- # Exits 0 for likely IA-side errors so it gets retried on the next pass.
- if [[ -e "${dir}/.dir-to-ia.batch-files" ]]; then
- if [[ ( "${rm}" == 'yes' && "${rmwait}" == 'yes' ) || "${sha1check}" == 'yes' ]]; then
- tsprintf 'Checking whether IA is done with upload processing...' >&2
- summary="$("${scriptpath}/ia-tasks" --summary --jsonl "${identifier}")"
- st="$?"
- if ((st)); then
- tsprintf "Warning: task processing wait: ia-tasks exited with status ${st}" >&2
- exit 0
- fi
- # Prints '<sum of all counters> <error counter>' from the summary JSON object
- tasksAndErrors="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o.values()), o["error"])' <<<"${summary}")"
- st="$?"
- if ((st)); then
- tsprintf "Warning: task processing wait: python3 for ia-tasks output %q exited with status %d" "${summary}" "${st}" >&2
- exit 0
- fi
- if [[ "${tasksAndErrors}" != '0 0' ]]; then
- tsprintf "Task processing wait: IA tasks not finished (%d tasks remaining, %d errors)" "${tasksAndErrors% *}" "${tasksAndErrors#* }" >&2
- exit 0
- fi
- tsprintf 'IA upload processing finished' >&2
- fi
-
- # NUL-separated names (relative to the item directory) of the files uploaded in the previous batch
- readarray -d $'\0' -t uploaded <"${dir}/.dir-to-ia.batch-files"
- dbgprintf "Local files:$(printf " %q" "${uploaded[@]}")"
-
- if [[ "${sha1check}" == 'yes' ]]; then
- tsprintf 'Checking SHA-1 hashes...' >&2
- if [[ ! -e "${dir}/.dir-to-ia.batch-sha1sums" ]]; then
- tsprintf 'Error: sha1check enabled but no sha1sums file for the previous batch of uploads (did you enable sha1check while a batch was pending?)' >&2
- exit 1
- fi
- readarray -t localhashes <"${dir}/.dir-to-ia.batch-sha1sums"
- dbgprintf "Local hashes:$(printf " %q" "${localhashes[@]}")"
- if ((${#localhashes[@]} != ${#uploaded[@]})); then
- tsprintf 'Error: %q and %q do not have the same number of entries (%d vs %d)' "${dir}/.dir-to-ia.batch-files" "${dir}/.dir-to-ia.batch-sha1sums" "${#uploaded[@]}" "${#localhashes[@]}" >&2
- exit 1
- fi
- dbgprintf 'Retrieving SHA-1 hashes from IA...'
- # Only two error scenarios: retrieval fails (curl exit != 0), or the file list doesn't contain all files in the array (Python exit 33)
- # To not have to handle weird characters in filenames, don't try to produce a file with the expected format for `sha1sum -c`.
- # lastpipe makes the final readarray run in this shell so iahashes survives the pipeline.
- shopt -s lastpipe
- curl --silent --location --max-time 10 --fail --user-agent 'little-things +https://gitea.arpa.li/JustAnotherArchivist/little-things' "https://archive.org/metadata/${identifier}" \
- | python3 -c 'import json, sys; o = json.load(sys.stdin); files = set(sys.argv[1:]); {(print(f["sha1"], end = "\0"), print(f["name"], end = "\0"), files.remove(f["name"])) for f in o.get("files", []) if f["name"] in files and "sha1" in f}; sys.exit(33) if files else ...' "${uploaded[@]}" \
- | readarray -d $'\0' -t iahashes
- # Must be captured immediately; any further command overwrites PIPESTATUS.
- sts=("${PIPESTATUS[@]}")
- dbgprintf "IA hashes:$(printf " %q" "${iahashes[@]}")"
- if ((sts[0])); then
- tsprintf "IA hashes curl exited with status ${sts[0]}" >&2
- exit 0
- fi
- if ((sts[1] == 33)); then
- tsprintf 'IA hashes metadata does not contain all files uploaded in this batch' >&2
- exit 1
- elif ((sts[1])); then
- tsprintf "IA hashes python3 exited with status ${sts[1]}" >&2
- exit 0
- fi
- if ((${#iahashes[@]} % 2 != 0)); then
- tsprintf 'Got an odd number of hashes from IA' >&2
- exit 1
- fi
- if ((${#iahashes[@]} != 2 * ${#localhashes[@]})); then
- tsprintf "Number of hashes from IA (%d) doesn't match number of local hashes (%d)" "$((${#iahashes[@]} / 2))" "${#localhashes[@]}" >&2
- exit 1
- fi
-
- # Local filenames in uploaded, corresponding local hashes + space space dash in localhashes, and iahashes is (hash filename [hash filename...]).
- dbgprintf 'Comparing...'
- hashfail=
- for ((i = 0; i < ${#uploaded[@]}; ++i)); do
- fn="${uploaded[i]}"
- dbgprintf 'Checking hash for %q' "${dir}/${fn}"
- # Keep only the hex digest: drop everything from the first whitespace on, so no trailing blank can
- # survive and break the string comparison with the hash reported by IA.
- localhash="${localhashes[i]%%[[:space:]]*}"
- dbgprintf ' local: %s' "${localhash}"
- iahash=
- # Linear search for the name in the (hash, name) pairs
- for ((j = 0; j < ${#iahashes[@]}; j += 2)); do
- if [[ "${iahashes[j + 1]}" == "${fn}" ]]; then
- iahash="${iahashes[j]}"
- break
- fi
- done
- if [[ -z "${iahash}" ]]; then
- # This should be impossible due to the checks above.
- tsprintf 'Error: could not find IA hash for %q' "${fn}" >&2
- exit 1
- fi
- dbgprintf ' IA: %s' "${iahash}"
- if [[ "${iahash}" != "${localhash}" ]]; then
- tsprintf 'SHA-1 mismatch for %q: local %s vs IA %s' "${dir}/${fn}" "${localhash}" "${iahash}" >&2
- hashfail=yes
- else
- tsprintf 'OK: %q' "${dir}/${fn}" >&2
- fi
- done
- # Report every mismatch first, then fail the directory once.
- if [[ "${hashfail}" ]]; then
- exit 1
- fi
- fi
-
- if [[ "${rm}" == 'yes' ]]; then
- tsprintf 'Removing:' >&2
- for fn in "${uploaded[@]}"; do
- tsprintf ' %q' "${dir}/${fn}" >&2
- rm "${dir}/${fn}" || { tsprintf 'rm for %q exited with status %d' "${dir}/${fn}" "$?" >&2; exit 1; }
- done
- fi
-
- # The batch is fully handled; drop its bookkeeping files so the next batch starts clean.
- rm "${dir}/.dir-to-ia.batch-files" || { tsprintf "rm for %q batch files or sha1sums exited with status %d" "${dir}" "$?" >&2; exit 1; }
- if [[ -e "${dir}/.dir-to-ia.batch-sha1sums" ]]; then
- rm "${dir}/.dir-to-ia.batch-sha1sums" || { tsprintf "rm for %q batch files or sha1sums exited with status %d" "${dir}" "$?" >&2; exit 1; }
- fi
- fi
-
- # Loop over local files and upload them
- # The 'quota' controls how many files get uploaded at once.
- # It gets decremented for each uploaded file, and when it reaches zero, the item's task queue is checked to ensure it isn't building up.
- gotFileList=
- nQuota=50
- while IFS= read -r -d $'\0' fn; do
- dbgprintf 'Considering file %q' "${fn}"
- if [[ "${fn}" == .dir-to-ia.* ]]; then
- dbgprintf '%q is a dir-to-ia file, skipping' "${fn}"
- continue
- fi
- if [[ ! -f "${dir}/${fn}" ]]; then
- # Should never happen since the `find` command already uses `-type f`
- dbgprintf '%q is not a regular file, skipping' "${fn}"
- continue
- fi
- if [[ "${fn}" == "${identifier}"* ]]; then
- dbgprintf '%q starts with the identifier, skipping' "${fn}"
- continue
- fi
- if [[ "${rm}" == 'no' ]]; then
- # With rm=no, files stay on disk, so the remote file list is what tells uploaded files from new ones.
- # The list is fetched lazily, once per pass, when the first eligible file shows up.
- if [[ ! "${gotFileList}" ]]; then
- # This can only proceed if there are no pending tasks; if there are, break out of the loop and revisit later.
- dbgprintf 'Checking whether IA has pending tasks on the item as the item file list/metadata (needed for rm=no) is unreliable otherwise...'
- # The identifier must be its own argument (separated from --jsonl), as in the other ia-tasks calls.
- summary="$("${scriptpath}/ia-tasks" --summary --jsonl "${identifier}")"
- st="$?"
- if ((st)); then
- tsprintf "Warning: metadata reliability check: ia-tasks exited with status ${st}" >&2
- break
- fi
- tasksAndErrors="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o.values()), o["error"])' <<<"${summary}")"
- st="$?"
- if ((st)); then
- tsprintf "Warning: metadata reliability check: python3 for ia-tasks output %q exited with status %d" "${summary}" "${st}" >&2
- break
- fi
- if [[ "${tasksAndErrors}" != '0 0' ]]; then
- dbgprintf "Metadata reliability check: IA tasks not finished (%d tasks remaining, %d errors)" "${tasksAndErrors% *}" "${tasksAndErrors#* }" >&2
- break
- fi
- dbgprintf 'No pending tasks, metadata should be reliable' >&2
-
- dbgprintf 'Retrieving existing files on IA...'
- # Scenarios: retrieval fails (curl exit != 0), item doesn't exist (= empty JSON, Python exit 31), no file list (e.g. darked, Python exit 32)
- shopt -s lastpipe
- curl --silent --location --max-time 10 --fail --user-agent 'little-things +https://gitea.arpa.li/JustAnotherArchivist/little-things' "https://archive.org/metadata/${identifier}" \
- | python3 -c 'import json, sys; o = json.load(sys.stdin); sys.exit(31) if o == {} else ...; sys.exit(32) if "files" not in o else ...; {print(f["name"], end = "\0") for f in o["files"] if not f["name"].startswith(sys.argv[1])}' "${identifier}" \
- | readarray -d $'\0' -t iafiles
- sts=("${PIPESTATUS[@]}")
- dbgprintf "Existing files on IA:$(printf " %q" "${iafiles[@]}")"
- if ((sts[0])); then
- tsprintf "IA metadata curl exited with status ${sts[0]}" >&2
- fi
- if ((sts[1] == 31)); then
- # Item doesn't exist, but that's fine. Reset the status so it doesn't trigger the exit below.
- sts[1]=0
- elif ((sts[1] == 32)); then
- tsprintf "IA returned no file list in metadata" >&2
- elif ((sts[1])); then
- tsprintf "IA metadata python3 exited with status ${sts[1]}" >&2
- fi
- # Any remaining failure: stop uploading for this pass (without marking the directory dirty) and retry later.
- if ((sts[0] || sts[1])); then
- break
- fi
- gotFileList=yes
- fi
-
- found=
- for remoteFn in "${iafiles[@]}"; do
- if [[ "${fn}" == "${remoteFn}" ]]; then
- found=yes
- break
- fi
- done
- if [[ "${found}" ]]; then
- dbgprintf '%q found in IA item, skipping' "${fn}"
- continue
- fi
- fi
-
- tsprintf 'Uploading %q to %q...' "${fn}" "${identifier}" >&2
- # Build the command as an array so arguments with whitespace survive intact.
- cmd=("${scriptpath}/ia-upload-stream")
- if [[ "${clobber}" == 'yes' ]]; then cmd+=('--clobber'); fi
- if [[ "${derive}" == 'no' ]]; then cmd+=('--no-derive'); fi
- if [[ "${autosizehint}" == 'yes' ]]; then
- # First field of `du -s -k` is the directory size in KiB
- read -r size _ < <(du -s -k "${dir}")
- cmd+=('--size-hint' "${size}K")
- fi
- cmd+=(--input-file "${dir}/${fn}")
- cmd+=("${iauploadstreamopts[@]}")
- cmd+=("${identifier}")
- cmd+=("${fn}")
- cmd+=("${metadata[@]}")
-
- dbgprintf "Upload command:$(printf " %q" "${cmd[@]}")"
- "${cmd[@]}" || { tsprintf "ia-upload-stream exited with status $?" >&2; exit 1; }
-
- if [[ "${sha256}" == 'yes' ]]; then
- tsprintf 'Calculating SHA-256...' >&2
- # Run inside the item directory so the recorded filename is relative to it; tee both logs and appends to the sums file.
- (cd "${dir}" && sha256sum "${fn}" > >(tee -a '.dir-to-ia.sha256sum')) || { tsprintf 'sha256sum for %q exited with status %d' "${fn}" "$?" >&2; exit 1; }
- fi
- if [[ "${sha1check}" == 'yes' ]]; then
- tsprintf 'Calculating SHA-1...' >&2
- # Hash from stdin so no filename ends up in the sums file; entries are matched to batch-files by position.
- sha1sum <"${dir}/${fn}" > >(tee -a "${dir}/.dir-to-ia.batch-sha1sums") || { tsprintf 'sha1sum for %q exited with status %d' "${fn}" "$?" >&2; exit 1; }
- fi
-
- # Remember the file for the post-processing step (hash check and/or removal) at the start of a later pass.
- if [[ "${rm}" == 'yes' || "${sha1check}" == 'yes' ]]; then
- printf '%s\0' "${fn}" >>"${dir}/.dir-to-ia.batch-files"
- fi
-
- ((--nQuota))
- if ((nQuota == 0)); then
- dbgprintf "Checking IA's task queue to determine how many more files we can upload in this iteration..."
- summary="$("${scriptpath}/ia-tasks" --summary --jsonl "${identifier}")"
- st="$?"
- if ((st)); then
- tsprintf "Warning: task queue check: ia-tasks exited with status ${st}" >&2
- exit 0
- fi
- tasksAndErrors="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o.values()), o["error"])' <<<"${summary}")"
- st="$?"
- if ((st)); then
- tsprintf "Warning: task queue check: python3 for ia-tasks output %q exited with status %d" "${summary}" "${st}" >&2
- exit 0
- fi
- tasks="${tasksAndErrors% *}"
- errors="${tasksAndErrors#* }"
- # New quota: whatever is left of 50 after the tasks still queued; none at all while there are errored tasks.
- ((nQuota = 50 - tasks))
- if ((nQuota < 0 || errors > 0)); then
- nQuota=0
- fi
- dbgprintf "IA item has %d tasks remaining and %d errors, new quota: %d" "${tasks}" "${errors}" "${nQuota}"
- if ((nQuota == 0)); then
- break
- fi
- fi
- done < <(cd "${dir}" && find . -type f -print0 | sed -z 's,^\./,,')
-
- # Waiting for IA and/or deletions are handled in the next iteration
- )
- st="$?"
- if ((st)); then
- tsprintf 'Subshell exited with status %d, marking as dirty' "${st}" >&2
- printf '%s\n' \
- 'An error occurred on the last upload, so this file marks this directory as dirty to not cause worse problems.' \
- 'Delete this file once the underlying issue is resolved to make dir-to-ia process this directory again.' \
- >"${dir}/.dir-to-ia.dirty"
- fi
- dbgprintf 'Done with %q' "${dir}"
- # FD 3 = lock file. stdout+stderr of the whole group are split on carriage returns into lines, timestamped where needed,
- # appended to the per-directory log and echoed to stderr.
- } 3>"${dir}/.dir-to-ia.lock" &> >(stdbuf -oL tr '\r' '\n' | maybets | tee -a "${dir}/.dir-to-ia.log" >&2)
- done
- sleep 60
- done
|