|
|
@@ -0,0 +1,200 @@ |
|
|
|
#!/bin/bash |
|
|
|
# Requires Bash 4.4+ (readarray -d needs 4.4; negative array subscripts and argument-less printf %(...)T need 4.3)
|
|
|
# Print the usage text when help is requested explicitly via -h or --help.
# The text goes to stdout and the script exits with status 0: asking for help
# is a successful invocation, not an error.
if [[ "$1" == '-h' || "$1" == '--help' ]]; then
  printf 'Usage: %q\n' "$0"
  printf 'Uploads files from each subdirectory of the PWD into an IA item using the dirname as the identifier\n'
  printf 'Configuration happens through a subdir/.dir-to-ia.config file, a Bash script that gets sourced to get the config values\n'
  printf "Use '%q --example-config' to get an example configuration with explanations\n" "$0"
  printf 'The upload log for each item gets written to subdir/.dir-to-ia.log\n'
  exit 0
fi
|
|
|
|
|
|
|
# Default config values.
# The five switches take 'yes' or 'no'; iaconfigfile is empty by default.
clobber='no'
derive='no'
rm='yes'
rmwait='yes'
sha256='no'
iaconfigfile=''
|
|
|
|
|
|
|
# --example-config: write a commented example configuration to stdout and exit.
# The values shown are whatever the config variables currently hold, and the
# date in the metadata example is expanded at the moment the example is printed.
case "$1" in
  --example-config)
    printf '%s\n' \
      '# Calculate SHA-256 hash of each file after uploading' \
      "sha256=${sha256}" \
      '' \
      '# Remove local file after successful upload' \
      "rm=${rm}" \
      '' \
      '# Delay removal until IA processed the upload' \
      "rmwait=${rmwait}" \
      '' \
      '# Clobber existing files in IA item (no = existing copy is moved to history/files/ by IA)' \
      "clobber=${clobber}" \
      '' \
      '# Queue derive after upload (applied after every file!)' \
      "derive=${derive}" \
      '' \
      "# 'ia' config file (default: empty, meaning to use ia's own default paths and precedence rules)" \
      "iaconfigfile=${iaconfigfile}" \
      '' \
      "# Item metadata (array with 'key:value' elements); the only mandatory variable with no default" \
      "metadata=('collection:opensource' \"date:$(printf '%(%Y-%m-%d)T')\")"
    exit
    ;;
esac
|
|
|
|
|
|
|
# Print a message to stdout, prefixed with the current local time in the form
# 'YYYY-MM-DD HH:MM:SS ' and terminated by a newline.
#
# Usage:
#   tsprintf MESSAGE           MESSAGE is printed verbatim (it is never used as a format)
#   tsprintf FORMAT ARG...     FORMAT and the ARGs are handed to printf
#   tsprintf                   prints just the timestamp
function tsprintf {
  # local: do not leak the helper variable into (or clobber one in) the caller's scope.
  local datefmt='%(%Y-%m-%d %H:%M:%S)T'
  if [[ $# -eq 1 ]]; then
    # -1 is the %(...)T argument meaning "now".
    printf "${datefmt} %s\n" -1 "$@"
  else
    # First argument is the format, rest are arguments to printf; pass through `while read` loop to have a better chance of line buffering.
    {
      printf "${datefmt} " -1
      # '--' keeps a format that starts with '-' from being parsed as a printf option.
      printf -- "$1" "${@:2}"
      printf '\n'
    } | while IFS= read -r line; do printf '%s\n' "${line}"; done
  fi
}
|
|
|
|
|
|
|
# dbgprint MESSAGE...
# If DEBUG is non-empty when this definition is evaluated, each argument is
# written to stderr as its own timestamped "[DEBUG] ..." line via tsprintf.
# Otherwise dbgprint is a no-op.
if [[ "${DEBUG}" ]]; then
  function dbgprint {
    # local: the loop variable must not clobber a variable of the same name in the caller.
    local msg
    for msg; do
      tsprintf "[DEBUG] ${msg}" >&2
    done
  }
else
  function dbgprint { :; }
fi
|
|
|
|
|
|
|
# Refuse to start unless every external helper command can be resolved.
for cmd in ia-upload-stream ia-wait-item-tasks ia; do
  command -v "${cmd}" >/dev/null 2>&1 && continue
  printf '%s\n' "Error: ${cmd} not found, make sure it is in PATH" >&2
  exit 1
done
|
|
|
|
|
|
|
# Main loop: forever, walk over every subdirectory of the PWD that has a
# .dir-to-ia.config file, upload its files to the IA item named after the
# directory, then sleep 60 seconds and start over.
# Each directory's output (stdout and stderr) is appended to
# <dir>/.dir-to-ia.log and also copied to this script's stderr.
while :; do
  for dir in */; do
    dir="${dir%/}"
    identifier="${dir}"
    # Only directories whose name starts with an ASCII letter or digit are considered.
    if [[ ! "${dir}" =~ ^[a-zA-Z0-9] ]]; then
      continue
    fi
    # Directories without a config file are ignored.
    if [[ ! -e "${dir}/.dir-to-ia.config" ]]; then
      continue
    fi

    dbgprint "Processing ${dir}"

    # Everything from here on is executed in a subshell so that the config sourcing can't affect other items.
    # It would be possible to do that without a subshell, but this is easier.
    (
      # Source and check configuration
      . "${dir}/.dir-to-ia.config" || { tsprintf "Sourcing ${dir}/.dir-to-ia.config failed" >&2; exit 1; }
      configbroken=
      for v in sha256 rm rmwait clobber derive; do
        if [[ "${!v}" != 'yes' && "${!v}" != 'no' ]]; then
          tsprintf "Error in %q: %s must be 'yes' or 'no'" "${dir}/.dir-to-ia.config" "${v}" >&2
          configbroken=yes
        fi
      done
      # `declare -p` prints 'declare <attrs> metadata=...'; an indexed array has an 'a' in <attrs>.
      # If metadata is unset, declare prints nothing, read leaves attrs empty, and the test fails as intended.
      if read -r _ attrs _ < <(declare -p metadata 2>/dev/null); [[ "${attrs}" != *a* ]]; then
        tsprintf 'Error in %q: metadata missing or not an array' "${dir}/.dir-to-ia.config" >&2
        configbroken=yes
      else
        for f in "${metadata[@]}"; do
          if [[ "${f}" != *:* ]]; then
            tsprintf 'Error in %q: metadata field missing colon: %s' "${dir}/.dir-to-ia.config" "${f}" >&2
            configbroken=yes
          fi
        done
      fi
      # All individual problems have been reported above; give up on this directory for this round.
      if [[ "${configbroken}" ]]; then
        exit 1
      fi
      if [[ "${rm}" == 'no' && "${clobber}" == 'yes' ]]; then
        tsprintf 'Error in %q: rm=no and clobber=yes is not permitted' "${dir}/.dir-to-ia.config" >&2
        exit 1
      fi
      dbgprint 'Configuration:' \
        " sha256=${sha256}" \
        " rm=${rm}" \
        " rmwait=${rmwait}" \
        " clobber=${clobber}" \
        " derive=${derive}" \
        " iaconfigfile${iaconfigfile:+: }${iaconfigfile:- not set}" \
        " metadata=($(printf %q "${metadata[0]}")$(if [[ ${#metadata[@]} -gt 1 ]]; then printf ' %q' "${metadata[@]:1}"; fi))"

      # If removing local files is disabled, check first which ones are already on IA so they can be skipped.
      #TODO Do this only if there are files to upload
      if [[ "${rm}" == 'no' ]]; then
        dbgprint 'Retrieving existing files on IA...'
        # The Python snippet prints every file name that does not start with the identifier,
        # NUL-terminated, followed by a sentinel entry. A missing sentinel means that the
        # download or the JSON parsing failed.
        #TODO Figure out a better way to verify correct retrieval than appending a placeholder.
        readarray -d $'\0' -t iafiles < <(curl --silent --location --max-time 10 --fail "https://archive.org/metadata/${identifier}" | python3 -c 'import json, sys; o = json.load(sys.stdin); {print(f["name"], end = "\0") for f in o.get("files", []) if not f["name"].startswith(sys.argv[1])}; print("__dir-to-ia_end__", end = "\0")' "${identifier}")
        if [[ "${#iafiles[@]}" -eq 0 || "${iafiles[-1]}" != '__dir-to-ia_end__' ]]; then
          tsprintf 'Error: could not retrieve metadata' >&2
          exit 1
        fi
        # Quoted so the subscript cannot be glob-expanded against files in the PWD (e.g. one named 'iafiles1').
        unset 'iafiles[-1]'
        dbgprint "Existing files on IA:$(printf " %q" "${iafiles[@]}")"
      fi

      # Loop over local files and upload them
      while IFS= read -r -d $'\0' fn; do
        dbgprint "Considering file ${fn}"
        if [[ "${fn}" == .dir-to-ia.* ]]; then
          dbgprint "${fn} is a dir-to-ia file, skipping"
          continue
        fi
        if [[ ! -f "${dir}/${fn}" ]]; then
          # Should never happen since the `find` command already uses `-type f`
          dbgprint "${fn} is not a regular file, skipping"
          continue
        fi
        if [[ "${fn}" == "${identifier}"* ]]; then
          dbgprint "${fn} starts with the identifier, skipping"
          continue
        fi
        if [[ "${rm}" == 'no' ]]; then
          # Local files are kept, so skip everything that is already listed in the IA item.
          found=
          for remoteFn in "${iafiles[@]}"; do
            if [[ "${fn}" == "${remoteFn}" ]]; then
              found=yes
              break
            fi
          done
          if [[ "${found}" ]]; then
            dbgprint "${fn} found in IA item, skipping"
            continue
          fi
        fi

        tsprintf 'Uploading %q to %q...' "${fn}" "${identifier}" >&2
        # Assemble the upload command as an array: options, identifier, remote file name, metadata fields.
        cmd=('ia-upload-stream')
        if [[ "${clobber}" == 'yes' ]]; then cmd+=('--clobber'); fi
        if [[ "${derive}" == 'no' ]]; then cmd+=('--no-derive'); fi
        if [[ "${iaconfigfile}" ]]; then cmd+=('--ia-config-file' "${iaconfigfile}"); fi
        cmd+=("${identifier}")
        cmd+=("${fn}")
        cmd+=("${metadata[@]}")

        dbgprint "Upload command:$(printf " %q" "${cmd[@]}")"
        # The file contents are fed to the uploader on stdin.
        "${cmd[@]}" <"${dir}/${fn}" || { tsprintf "ia-upload-stream exited with status $?" >&2; exit 1; }

        if [[ "${sha256}" == 'yes' ]]; then
          tsprintf 'Calculating SHA-256...' >&2
          # '--' so that a file name starting with '-' is not parsed as an option.
          (cd "${dir}" && sha256sum -- "${fn}") || { tsprintf "sha256sum exited with status $?" >&2; exit 1; }
        fi

        if [[ "${rm}" == 'yes' ]]; then
          if [[ "${rmwait}" == 'yes' ]]; then
            tsprintf 'Waiting for IA to process the upload...' >&2
            # stdin of this loop is the NUL-separated file list; give the helper /dev/null so it cannot consume it.
            ia-wait-item-tasks "${identifier}" </dev/null || { tsprintf "ia-wait-item-tasks exited with status $?" >&2; exit 1; }
          fi
          tsprintf 'IA upload processing finished, removing %q' "${fn}" >&2
          # No option to run an extra SHA-1 check or similar since ia-upload-stream already sends an MD5 for each chunk, so corruption should be impossible.
          rm "${dir}/${fn}"
        fi
      done < <(cd "${dir}" && find . -type f -print0 | sed -z 's,^\./,,')
    ) &> >(tee -a "${dir}/.dir-to-ia.log" >&2)
    dbgprint "Done with ${dir}"
  done
  sleep 60
done