|
#!/bin/bash
# IRC bot wrapper around codearchiver: receives commands via http2irc,
# runs archiving jobs (optionally in parallel), and uploads results to
# the Internet Archive. Timestamps and log filenames are in UTC.
export TZ=UTC

# Required configuration; refuse to start if any variable is missing.
envvars=(HTTP2IRC_GET_URL HTTP2IRC_POST_URL IA_S3_ACCESS IA_S3_SECRET)
for envvar in "${envvars[@]}"; do
	if [[ ! -v "${envvar}" ]]; then
		{ printf 'Error: one or more of the required environment variables (%s' "${envvars[0]}"; printf ', %s' "${envvars[@]:1}"; printf ') missing\n'; } >&2
		exit 1
	fi
done

# Optional env variables
declare -i timeout="${CODEARCHIVER_BOT_TIMEOUT:-0}"  # seconds per job; 0 = no limit (GNU timeout semantics)
declare -i nproclimit="${CODEARCHIVER_BOT_NPROC:-1}" # max concurrent codearchiver jobs
declare -i nproc=0                                   # currently running background jobs

# Verify external tools up front. Fix: 'sed', 'timeout' and 'xxd' are used
# later in the script but were missing from this check, so the bot could
# start fine and then fail in the middle of a job.
for dep in awk codearchiver curl ia-upload-stream python3 sed sha256sum tee timeout xxd zstd; do
	if ! command -v "${dep}" &>/dev/null; then
		printf 'Error: %s not found\n' "${dep}" >&2
		exit 1
	fi
done
-
function log {
	# Emit "<epoch.microseconds> <message>" on stderr.
	local message="$1"
	printf '%s %s\n' "${EPOCHREALTIME}" "${message}" >&2
}
-
# Forward every line read on stdin to `log`, prefixed with $1.
# Normalises messy tool output: a missing trailing LF is added, and CR/CRLF
# line endings (e.g. curl progress output) are converted to LF so each
# fragment becomes its own log line.
function log_loop {
	prefix="$1"
	# If the output does not end with a LF, add one. Then replace CRLF with LF and replace remaining CR with LF.
	# fd 3 carries the actual data through to the next pipe stage; the tee/tail/xxd
	# branch only inspects the very last byte (hex '0a' = LF) to decide whether to
	# append a newline. Relies on xxd and unbuffered GNU sed (-u) being available.
	{ lastchar="$(tee /dev/fd/3 | tail -c 1 | xxd -p)"; if [[ "${lastchar}" != '0a' ]]; then printf '\n'; fi } 3>&1 |
	sed -u 's,\r$,,; s,\r,\n,g' |
	while IFS= read -r line; do log "${prefix}${line}"; done
}
-
function send {
	# Deliver a message to the IRC channel through the http2irc POST endpoint.
	# Both curl's own diagnostics and the endpoint's response body go to the log.
	local msg="$1"
	log "Sending message: ${msg}"
	curl --silent --verbose --max-time 10 --data "${msg}" "${HTTP2IRC_POST_URL}" \
		2> >(log_loop 'curl http2irc POST: ') |
		log_loop 'http2irc POST response: '
}
-
function respond {
	# Address a specific user on IRC: sends "<nick>: <message>".
	local target="$1"
	local text="$2"
	send "${target}: ${text}"
}
-
function taint_block {
	# Block for as long as the '.tainted' marker file exists in the working
	# directory; $1 describes the action being held back (for the log message).
	# Returns immediately (and silently) when the storage is not tainted.
	# Fix: 'message' was assigned without 'local', clobbering the global
	# 'message' name that is also used as a read target elsewhere in the script.
	local message="Storage is tainted, not ${1}"
	if [[ -e '.tainted' ]]; then
		log "${message}"
		while [[ -e '.tainted' ]]; do
			sleep 1
		done
	fi
}
-
# Main processing pipeline: IRC stream -> JSONL-to-text -> command parsing ->
# work loop (codearchiver jobs) -> upload loop (IA). Each stage runs in its
# own subshell; data flows as simple space-separated text lines.
{ # Group the pipeline without requiring a backslash every time
	# Stage 1: endless reconnect loop streaming raw IRC lines from http2irc.
	while :; do
		# Read from http2irc
		log 'Starting http2irc GET stream...'
		curl --silent --verbose --no-buffer "${HTTP2IRC_GET_URL}" 2> >(log_loop 'curl http2irc GET: ')
		printf '\n' # Ensure that there's a trailing LF for `read`
	done |

	# Log all raw input
	tee >(log_loop 'Received http2irc line: ') |

	# Transform the JSONL data into a more suitable format for the following: lines of 'modes SP nick SP message'
	python3 -u -c 'import json, sys'$'\n''def json_parse_or_none(s):'$'\n'' try: return json.loads(s)'$'\n'' except json.JSONDecodeError as e:'$'\n'' print(f"Could not parse {s[:100]}…: {type(e).__name__}: {e!s}")'$'\n''{print(o["user"]["modes"] or "_", o["user"]["nick"], o["message"]) for o in map(json_parse_or_none, sys.stdin) if o and o.get("command") == "PRIVMSG"}' |

	# For valid bot commands with adequate permissions, assign a job ID and respond. Suppress everything else. Print lines of 'jobid SP nick SP URL' for the processing below.
	while read -r modes nick message; do
		if [[ "${message}" == '!help' ]]; then
			respond "${nick}" '`!a URL`: archives a single repository'
			respond "${nick}" '`!a < URL`: archives a list of repositories (no success/failure report, no warnings/errors report, check logs manually!)'
			continue
		fi
		if [[ "${message}" != '!a '* ]]; then
			continue
		fi
		if [[ "${modes}" != *[@+]* ]]; then
			respond "${nick}" 'Only voiced or opped users may use this command.'
			continue
		fi
		if [[ "${message}" =~ ^'!a '([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
			# Individual job
			jobs=("${message:3}")
			src="${message:3}"
		elif [[ "${message}" =~ ^'!a < 'https://transfer\.archivete\.am/[a-zA-Z0-9]+/.+$ ]]; then
			# List job
			jobs=()
			url="${message:5}"
			bad=
			log "Retrieving list job list: ${url}"
			while read -r line; do
				# Fix: list entries are URLs, optionally prefixed with '!a '. The previous
				# pattern *required* the '!a ' prefix but then enqueued the raw line, so
				# codearchiver would have been handed '!a URL'. Accept both forms and
				# strip the prefix before queueing.
				if [[ "${line}" =~ ^('!a ')?([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
					jobs+=("${line#"!a "}")
				elif [[ "${line}" == '' ]]; then
					# Ignore empty lines
					continue
				else
					respond "${nick}" "Malformed line in ${url}: ${line}"
					bad=1
					break
				fi
			done < <({ curl --silent --verbose --fail --max-time 10 "${message:5}" 2> >(log_loop 'curl list job: '); printf '\n'; } | tee >(log_loop 'List input line: '))
			if [[ "${bad}" ]]; then
				continue
			fi
			src="${url}"
		else
			respond "${nick}" "I don't understand your command. Please forgive me."
			continue
		fi
		read -r jobid </proc/sys/kernel/random/uuid
		respond "${nick}" "Queueing job ${jobid} for ${src}"
		appendcounter=; if [[ ${#jobs[@]} -gt 1 ]]; then appendcounter=yes; fi
		for ((i=0; i<${#jobs[@]}; ++i)); do
			job="${jobs[${i}]}"
			singlejobid="${jobid}"; if [[ "${appendcounter}" ]]; then singlejobid+="_${i}"; fi
			printf '%s %s %s\n' "${singlejobid}" "${nick}" "${job}"
		done
		if [[ "${appendcounter}" ]]; then printf '%s %s end\n' "${jobid}" "${nick}"; fi # Special value for sending a message when all list URLs have been processed
	done |

	# The actual work loop
	while IFS= read -r line; do
		singlejobid="${line%% *}"
		line="${line#* }"
		nick="${line%% *}"
		url="${line#* }"

		# Handle marker for end of list job: tell the user it's done and move on.
		if [[ "${url}" == 'end' ]]; then
			# No status code reflection here because the start of the list job might not even have been in this batch.
			respond "${nick}" "Job ${singlejobid} finished."
			continue
		fi

		# Block until there's a free slot
		while [[ "${nproc}" -ge "${nproclimit}" ]]; do
			# Wait for one subshell to exit
			wait -n
			nproc+=-1
		done

		taint_block 'continuing with work loop'

		# Find nonexistent filename for log file with lock
		# mkdir is pretty much always atomic, creating files might not be depending on the underlying file system (e.g. networked ones like NFS).
		while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (work) err: '); do
			sleep 1
		done
		trap 'rmdir ".loglock"' EXIT # Unlock if something below fails
		logbasename="$(date +%Y%m%dT%H%M%SZ)_${singlejobid}"
		if [[ -e "${logbasename}_codearchiver.log" || -e "${logbasename}_codearchiver.log.zst" ]]; then
			for ((j=0; ; ++j)); do
				# Fix: only stop at a counter where *neither* the plain nor the compressed
				# log exists. The previous '! -e …log || -e …log.zst' test could select a
				# basename whose .zst log already exists, colliding with an old job.
				if [[ ! -e "${logbasename}_coll${j}_codearchiver.log" && ! -e "${logbasename}_coll${j}_codearchiver.log.zst" ]]; then
					break
				fi
			done
			logbasename="${logbasename}_coll${j}"
		fi
		logname="${logbasename}_codearchiver.log"
		artefactsname="${logbasename}_codearchiver_artefacts.txt"
		# Create the log file already in case spawning the tee process for it below is too slow
		touch "${logname}"
		trap - EXIT # Reset trap
		rmdir '.loglock' # Unlock

		# Run codearchiver in a background shell, duplicating WARNINGs and higher in the bot output
		# Produces lines of filenames to upload on stdout
		log "Running ${url} (${singlejobid}), logging into ${logname}"
		(
			taint_block "launching job ${singlejobid}"

			# fd 3 receives the artefact file list from codearchiver.
			timeout --signal=INT "${timeout}" \
				codearchiver --verbose --write-artefacts-fd-3 "${url}" \
				> >(log_loop "codearchiver ${singlejobid} out: ") \
				2> >(tee "${logname}" | grep -Fv -e ' INFO ' | log_loop "codearchiver ${singlejobid} err: ") \
				3> >(tee "${artefactsname}" | log_loop "Artefact from codearchiver ${singlejobid}: ")
			status="$?"
			log "codearchiver ${url} finished with status code ${status}"
			#TODO Integrate this into the pipe from codearchiver above to avoid rereading the entire log file
			declare -i badcount=$(awk '! ($3 ~ /^INFO$/) { cnt += 1; } END { printf "%d\n", cnt; }' "${logname}")

			# Compress log file with zstd -19
			log "Compressing log file ${logname}"
			zstd -19 --rm "${logname}" 2> >(log_loop 'zstd err: ')
			if [[ -e "${logname}.zst" && ! -e "${logname}" ]]; then
				# Compression successful
				logname="${logname}.zst"
			fi

			# Verify that there are no artefacts if codearchiver exited non-zero
			# Since codearchiver handles errors internally normally, this should not usually happen, but it could occur e.g. if running out of disk space and leaving partial files in the storage.
			# With parallelism, this could in theory lead to artefacts of a successful run depending on artefacts from a failed run, which we wouldn't want.
			# So, if there are artefacts of a failed process, touch the .tainted file to stop the uploader and new processes starting and send a warning to IRC.
			# Emit the log filename for upload always (even on tainted storage), artefacts list and artefacts only on zero exit.
			readarray -t artefacts <"${artefactsname}"
			if [[ "${status}" -ne 0 && "${#artefacts[@]}" -ne 0 ]]; then
				touch '.tainted'
				send "Job ${singlejobid} exited non-zero but left artefacts behind!"
				msg="$(printf 'Artefact files by non-zero exit process: '; printf ' %q' "${artefacts[@]}")"
				log "${msg}"
			elif [[ "${status}" -eq 0 ]]; then
				for file in "${artefacts[@]}"; do
					printf '%s\n' "${file}"
				done
				printf '%s\n' "${artefactsname}"
			fi
			printf '%s\n' "${logname}"

			# For individual jobs, tell the user about warnings and success/failure
			# (list jobs carry a '_<i>' suffix and only get the final 'finished' message).
			if [[ "${singlejobid}" != *_* ]]; then
				if [[ "${status}" -eq 0 ]]; then
					respond "${nick}" "Job ${singlejobid} succeeded."
				else
					respond "${nick}" "Job ${singlejobid} failed."
				fi

				if [[ ${badcount} -gt 0 ]]; then
					respond "${nick}" "Job ${singlejobid} produced ${badcount} warnings or errors."
				fi
			fi
		) &
		nproc+=1
	done |

	# Upload
	while :; do
		# Process in batches for efficiency of parallel IA upload processing
		declare -a filenames=()
		# NOTE(review): read -t 1 also fails immediately at EOF, so this busy-loops
		# once the upstream pipe closes; harmless in practice since the pipeline
		# above never terminates during normal operation.
		while read -r -t 1 filename; do
			# Fix: collect the filename that was just read; this previously appended
			# the output of a nonexistent 'unknown' command, breaking every batch.
			filenames+=("${filename}")
		done
		if [[ ${#filenames[@]} -eq 0 ]]; then
			continue
		fi

		log 'Starting upload batch'

		# Record SHA-256 hashes for new files
		sha256sum "${filenames[@]}" > >(log_loop 'sha256sum: ')

		taint_block 'uploading anything'

		# Upload
		date="$(date '+%Y-%m-%d')"
		identifier="codearchiver_${date//-/}"
		if [[ -z "${CODEARCHIVER_BOT_TEST}" ]]; then
			collection='archiveteam_codearchiver'
		else
			identifier="test_${identifier}"
			collection='test_collection'
		fi
		uploadsfine=y
		for f in "${filenames[@]}"; do
			taint_block "starting upload for $(printf '%q' "${f}")"

			log "Uploading $(printf '%q' "${f}") to ${identifier}"
			ia-upload-stream --no-derive "${identifier}" "${f}" \
				"collection:${collection}" \
				'mediatype:software' \
				"date:${date}" \
				<"${f}" 2> >(log_loop 'ia-upload-stream: ')
			status="$?"
			if [[ "${status}" -ne 0 ]]; then
				log "Upload failed: exit status ${status}"
				if [[ "${uploadsfine}" ]]; then
					send "Upload failed: exit status ${status}"
				fi
				uploadsfine=
			fi
		done

		if [[ -z "${uploadsfine}" ]]; then
			log 'At least one upload in the batch failed, not removing anything'
			continue
		fi

		# Wait until all tasks for the item are done
		while :; do
			tasks="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o["value"]["summary"].values()))' < <({ curl --silent --verbose --fail --max-time 10 --header "Authorization: LOW ${IA_S3_ACCESS}:${IA_S3_SECRET}" "https://archive.org/services/tasks.php?identifier=${identifier}&summary=1&history=0" 2> >(log_loop 'curl IA tasks err: '); } | tee >(log_loop 'curl IA tasks out: ')))"
			if [[ "${tasks}" == '0' ]]; then
				break
			fi
			sleep 60
		done

		taint_block 'removing any files after upload'

		# Replace non-metadata files with a symlink to .uploaded dummy file
		# No locking with codearchiver processes is necessary because those will only read metadata (which is left alone) or write files.
		# However, a lock with the log filename finding is required.
		while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (upload) err: '); do
			sleep 1.1 # Slightly longer than above to minimise repeated collisions
		done
		trap 'rmdir ".loglock"' EXIT
		touch '.uploaded'
		for f in "${filenames[@]}"; do
			if [[ "${f}" != *_codearchiver_metadata.txt ]]; then
				log "Replacing $(printf '%q' "${f}") with symlink to .uploaded"
				{ rm --verbose -- "${f}" && ln --symbolic --verbose '.uploaded' "${f}"; } |& log_loop 'rm/ln: '
			fi
		done
		trap - EXIT
		rmdir '.loglock'
	done
}
|