#!/bin/bash
# codearchiver IRC bot: reads PRIVMSGs from an http2irc GET stream, queues
# `!a URL` / `!a < LIST-URL` archival jobs, runs codearchiver for each job with
# bounded parallelism, and uploads the resulting artefacts and logs to the
# Internet Archive, replacing uploaded files with symlinks to a dummy file.
#
# Required env: HTTP2IRC_GET_URL, HTTP2IRC_POST_URL, IA_S3_ACCESS, IA_S3_SECRET
# Optional env: CODEARCHIVER_BOT_TIMEOUT, CODEARCHIVER_BOT_NPROC, CODEARCHIVER_BOT_TEST

export TZ=UTC

envvars=(HTTP2IRC_GET_URL HTTP2IRC_POST_URL IA_S3_ACCESS IA_S3_SECRET)
for envvar in "${envvars[@]}"; do
	if [[ ! -v "${envvar}" ]]; then
		{
			printf 'Error: one or more of the required environment variables (%s' "${envvars[0]}"
			printf ', %s' "${envvars[@]:1}"
			printf ') missing\n'
		} >&2
		exit 1
	fi
done

# Optional env variables
declare -i timeout="${CODEARCHIVER_BOT_TIMEOUT:-0}"   # 0 = no timeout (GNU timeout semantics)
declare -i nproclimit="${CODEARCHIVER_BOT_NPROC:-1}"  # max concurrent codearchiver processes
declare -i nproc=0                                    # currently running background jobs

# xxd is needed by log_loop below, so check for it as well.
for dep in awk codearchiver curl ia-upload-stream python3 sha256sum tee xxd zstd; do
	if ! command -v "${dep}" &>/dev/null; then
		printf 'Error: %s not found\n' "${dep}" >&2
		exit 1
	fi
done

# Log a single message to stderr, prefixed with a high-resolution timestamp.
function log {
	printf '%s %s\n' "${EPOCHREALTIME}" "$1" >&2
}

# Log every line of stdin via `log`, each prefixed with $1.
function log_loop {
	prefix="$1"
	# If there is output and it does not end with a LF, add one. Then replace CRLF with LF and replace remaining CR with LF.
	{
		lastchar="$(tee /dev/fd/42 | tail -c 1 | xxd -p)"
		if [[ "${lastchar}" && "${lastchar}" != '0a' ]]; then
			printf '\n'
		fi
	} 42>&1 | sed -u 's,\r$,,; s,\r,\n,g' | while IFS= read -r line; do log "${prefix}${line}"; done
}

# Send a message to IRC through the http2irc POST endpoint.
function send {
	local message="$1"
	log "Sending message: ${message}"
	curl --silent --verbose --max-time 10 --data "${message}" "${HTTP2IRC_POST_URL}" 2> >(log_loop 'curl http2irc POST: ') | log_loop 'http2irc POST response: '
}

# Send a message addressed to a specific nick.
function respond {
	local nick="$1"
	local message="$2"
	send "${nick}: ${message}"
}

# Block while the storage is marked tainted (a `.tainted` file exists),
# logging once why we are waiting. $1 describes the blocked action.
function taint_block {
	message="Storage is tainted, not ${1}"
	if [[ -e '.tainted' ]]; then
		log "${message}"
		while [[ -e '.tainted' ]]; do
			sleep 1
		done
	fi
}

{ # Group the pipeline without requiring a backslash every time

# Stage 1: read from http2irc forever, restarting the stream on disconnect
while :; do
	log 'Starting http2irc GET stream...'
	curl --silent --verbose --no-buffer "${HTTP2IRC_GET_URL}" 2> >(log_loop 'curl http2irc GET: ')
	printf '\n' # Ensure that there's a trailing LF for `read`
done |

# Log all raw input
tee >(log_loop 'Received http2irc line: ') |

# Transform the JSONL data into a more suitable format for the following: lines of 'modes SP nick SP message'
python3 -u -c 'import json, sys'$'\n''def json_parse_or_none(s):'$'\n'' try: return json.loads(s)'$'\n'' except json.JSONDecodeError as e:'$'\n'' print(f"Could not parse {s[:100]}…: {type(e).__name__}: {e!s}")'$'\n''{print(o["user"]["modes"] or "_", o["user"]["nick"], o["message"]) for o in map(json_parse_or_none, sys.stdin) if o and o.get("command") == "PRIVMSG"}' |

# Stage 2: for valid bot commands with adequate permissions, assign a job ID and respond.
# Suppress everything else. Print lines of 'jobid SP nick SP URL' for the processing below.
while read -r modes nick message; do
	if [[ "${message}" == '!help' ]]; then
		respond "${nick}" '`!a URL`: archives a single repository'
		respond "${nick}" '`!a < URL`: archives a list of repositories (no success/failure report, no warnings/errors report, check logs manually!)'
		continue
	fi
	if [[ "${message}" != '!a '* ]]; then
		continue
	fi
	if [[ "${modes}" != *[@+]* ]]; then
		respond "${nick}" 'Only voiced or opped users may use this command.'
		continue
	fi
	if [[ "${message}" =~ ^'!a '([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
		# Individual job
		jobs=("${message:3}")
		src="${message:3}"
	elif [[ "${message}" =~ ^'!a < 'https://transfer\.archivete\.am/[a-zA-Z0-9]+/.+$ ]]; then
		# List job: fetch the list and validate every line before queueing anything
		jobs=()
		url="${message:5}"
		bad=
		log "Retrieving list job list: ${url}"
		while read -r line; do
			if [[ "${line}" =~ ^([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
				jobs+=("${line}")
			elif [[ "${line}" == '' ]]; then
				# Ignore empty lines
				continue
			else
				respond "${nick}" "Malformed line in ${url}: ${line}"
				bad=1
				break
			fi
		done < <({ curl --silent --verbose --fail --max-time 10 "${message:5}" 2> >(log_loop 'curl list job: '); printf '\n'; } | tee >(log_loop 'List input line: '))
		if [[ "${bad}" ]]; then
			continue
		fi
		src="${url}"
	else
		respond "${nick}" "I don't understand your command. Please forgive me."
		continue
	fi
	# NOTE(review): the job ID assignment and dispatch below were garbled in the
	# reviewed copy; reconstructed from context. The processing stage below reads
	# lines of 'singlejobid SP nick SP URL', and list-job IDs must contain an
	# underscore (the `!= *_*` check suppresses per-job IRC reports for them).
	# TODO confirm against the original source.
	read -r jobid < <(head -c 8 /dev/urandom | xxd -p)
	if [[ "${message}" == '!a < '* ]]; then
		respond "${nick}" "Queued ${#jobs[@]} jobs from ${src} under job ID prefix ${jobid}"
		for ((i=0; i<${#jobs[@]}; ++i)); do
			printf '%s_%d %s %s\n' "${jobid}" "${i}" "${nick}" "${jobs[i]}"
		done
	else
		respond "${nick}" "Queued ${src} as job ${jobid}"
		printf '%s %s %s\n' "${jobid}" "${nick}" "${jobs[0]}"
	fi
done |

# Stage 3: job processing — run codearchiver per job with bounded parallelism.
# Emits filenames to upload (one per line) for the upload stage below.
while read -r singlejobid nick url; do
	# NOTE(review): this throttle was lost in the reviewed copy and is
	# reconstructed from the surrounding nproc/nproclimit declarations —
	# TODO confirm against the original source.
	while [[ ${nproc} -ge ${nproclimit} ]]; do
		wait -n          # Reap one finished background job
		nproc=$((nproc - 1))
	done
	# Find a nonexistent log filename under a lock; this section must not run
	# concurrently with itself or with the post-upload file replacement below.
	while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (work) err: '); do
		sleep 1
	done
	trap 'rmdir ".loglock"' EXIT # Unlock if something below fails
	logbasename="$(printf '%(%Y%m%dT%H%M%SZ)T')_${singlejobid}"
	if [[ -e "${logbasename}_codearchiver.log" || -e "${logbasename}_codearchiver.log.zst" ]]; then
		# Name collision: find the first free _coll${j} suffix. The name is only
		# free when NEITHER the plain log NOR its compressed form exists.
		for ((j=0; ; ++j)); do
			if [[ ! -e "${logbasename}_coll${j}_codearchiver.log" && ! -e "${logbasename}_coll${j}_codearchiver.log.zst" ]]; then
				break
			fi
		done
		logbasename="${logbasename}_coll${j}"
	fi
	logname="${logbasename}_codearchiver.log"
	artefactsname="${logbasename}_codearchiver_artefacts.txt"
	# Create the log file already in case spawning the tee process for it below is too slow
	touch "${logname}"
	trap - EXIT # Reset trap
	rmdir '.loglock' # Unlock

	# Run codearchiver in a background shell, duplicating WARNINGs and higher in the bot output
	# Produces lines of filenames to upload on stdout
	log "Running ${url} (${singlejobid}), logging into ${logname}"
	(
		taint_block "launching job ${singlejobid}"
		timeout --signal=INT "${timeout}" \
			codearchiver --verbose --write-artefacts-fd-3 "${url}" \
			3> >(tee "${artefactsname}" | log_loop "Artefact from codearchiver ${singlejobid}: ") \
			> >(log_loop "codearchiver ${singlejobid} out: ") \
			2> >(tee "${logname}" | grep -Fv -e ' INFO ' | log_loop "codearchiver ${singlejobid} err: ")
		status="$?"
		log "codearchiver ${url} finished with status code ${status}"
		#TODO Integrate this into the pipe from codearchiver above to avoid rereading the entire log file
		declare -i badcount="$(awk '! ($3 ~ /^INFO$/) { cnt += 1; } END { printf "%d\n", cnt; }' "${logname}")"
		# Compress log file with zstd -19
		log "Compressing log file ${logname}"
		zstd -19 --rm "${logname}" 2> >(log_loop 'zstd err: ')
		if [[ -e "${logname}.zst" && ! -e "${logname}" ]]; then
			# Compression successful
			logname="${logname}.zst"
		fi
		# Verify that there are no artefacts if codearchiver exited non-zero
		# Since codearchiver handles errors internally normally, this should not usually happen, but it could occur e.g. if running out of disk space and leaving partial files in the storage.
		# With parallelism, this could in theory lead to artefacts of a successful run depending on artefacts from a failed run, which we wouldn't want.
		# So, if there are artefacts of a failed process, touch the .tainted file to stop the uploader and new processes starting and send a warning to IRC.
		# Emit the log filename for upload always (even on tainted storage), artefacts list and artefacts only on zero exit.
		readarray -t artefacts <"${artefactsname}"
		if [[ "${status}" -ne 0 && "${#artefacts[@]}" -ne 0 ]]; then
			touch '.tainted'
			send "Job ${singlejobid} exited non-zero but left artefacts behind!"
			msg="$(printf 'Artefact files by non-zero exit process: '; printf ' %q' "${artefacts[@]}")"
			log "${msg}"
		elif [[ "${status}" -eq 0 ]]; then
			for file in "${artefacts[@]}"; do
				printf '%s\n' "${file}"
			done
			printf '%s\n' "${artefactsname}"
		fi
		printf '%s\n' "${logname}"
		# For individual jobs, tell the user about warnings and success/failure
		if [[ "${singlejobid}" != *_* ]]; then
			if [[ "${status}" -eq 0 ]]; then
				respond "${nick}" "Job ${singlejobid} succeeded."
			else
				respond "${nick}" "Job ${singlejobid} failed."
			fi
			if [[ ${badcount} -gt 0 ]]; then
				respond "${nick}" "Job ${singlejobid} produced ${badcount} warnings or errors."
			fi
		fi
	) &
	nproc+=1
done |

# Stage 4: upload
while :; do
	# Process in batches for efficiency of parallel IA upload processing
	declare -a filenames=()
	while read -r -t 1 filename; do
		filenames+=("${filename}")
	done
	if [[ ${#filenames[@]} -eq 0 ]]; then
		continue
	fi
	log 'Starting upload batch'
	# Record SHA-256 hashes for new files
	sha256sum "${filenames[@]}" > >(log_loop 'sha256sum: ')
	taint_block 'uploading anything'
	# Upload
	date="$(printf '%(%Y-%m-%d)T')"
	identifier="codearchiver_${date//-/}"
	if [[ -z "${CODEARCHIVER_BOT_TEST}" ]]; then
		collection='archiveteam_codearchiver'
	else
		identifier="test_${identifier}"
		collection='test_collection'
	fi
	uploadsfine=y
	for f in "${filenames[@]}"; do
		taint_block "starting upload for $(printf '%q' "${f}")"
		log "Uploading $(printf '%q' "${f}") to ${identifier}"
		ia-upload-stream --no-derive "${identifier}" "${f}" \
			"collection:${collection}" \
			'mediatype:software' \
			"date:${date}" \
			'noarchivetorrent:true' \
			<"${f}" 2> >(log_loop 'ia-upload-stream: ')
		status="$?"
		if [[ "${status}" -ne 0 ]]; then
			log "Upload failed: exit status ${status}"
			if [[ "${uploadsfine}" ]]; then
				# Only notify IRC about the first failure in a batch
				send "Upload failed: exit status ${status}"
			fi
			uploadsfine=
		fi
	done
	if [[ -z "${uploadsfine}" ]]; then
		log 'At least one upload in the batch failed, not removing anything'
		continue
	fi
	# Wait until all tasks for the item are done (ignoring a lingering VirusCheck book_op task)
	while :; do
		tasks="$(python3 -c 'import json, sys; o = json.load(sys.stdin); totalTasks = sum(o["value"]["summary"].values()); virusChecks = sum(1 for task in o["value"]["catalog"] if task.get("cmd") == "book_op.php" and task.get("args", {}).get("op10") == "VirusCheck"); print(f"Expected exactly 0 or 1 VirusCheck tasks, got {virusChecks}", file = sys.stderr) if virusChecks not in (0, 1) else None; print("Ignoring VirusCheck book_op task", file = sys.stderr) if virusChecks > 0 else None; print(totalTasks - min(virusChecks, 1))' < <({ curl --silent --verbose --fail --max-time 10 --header "Authorization: LOW ${IA_S3_ACCESS}:${IA_S3_SECRET}" "https://archive.org/services/tasks.php?identifier=${identifier}&catalog=1&history=0" 2> >(log_loop 'curl IA tasks err: '); } | tee >(log_loop 'curl IA tasks out: ')))"
		if [[ "${tasks}" == '0' ]]; then
			break
		fi
		sleep 60
	done
	taint_block 'removing any files after upload'
	# Replace non-metadata files with a symlink to .uploaded dummy file
	# No locking with codearchiver processes is necessary because those will only read metadata (which is left alone) or write files.
	# However, a lock with the log filename finding is required.
	while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (upload) err: '); do
		sleep 1.1 # Slightly longer than above to minimise repeated collisions
	done
	trap 'rmdir ".loglock"' EXIT
	touch '.uploaded'
	for f in "${filenames[@]}"; do
		if [[ "${f}" != *_codearchiver_metadata.txt ]]; then
			log "Replacing $(printf '%q' "${f}") with symlink to .uploaded"
			{ rm --verbose -- "${f}" && ln --symbolic --verbose '.uploaded' "${f}"; } |& log_loop 'rm/ln: '
		fi
	done
	trap - EXIT
	rmdir '.loglock'
done
}