From 6fd569a57733a5525e2c191d3983d5bf1814cd29 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 29 Mar 2023 06:53:07 +0000 Subject: [PATCH] Taint storage if any codearchiver process exited non-zero as it might cause parallel processes to rely on broken data --- codearchiver-bot | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/codearchiver-bot b/codearchiver-bot index e92013a..baf36bb 100755 --- a/codearchiver-bot +++ b/codearchiver-bot @@ -43,6 +43,16 @@ function respond { send "${nick}: ${message}" } +function taint_block { + message="Storage is tainted, not ${1}" + if [[ -e '.tainted' ]]; then + log "${message}" + while [[ -e '.tainted' ]]; do + sleep 1 + done + fi +} + { # Group the pipeline without requiring a backslash every time while :; do # Read from http2irc @@ -126,6 +136,8 @@ function respond { continue fi + taint_block 'continuing with work loop' + # Find nonexistent filename for log file with lock # mkdir is pretty much always atomic, creating files might not be depending on the underlying file system (e.g. networked ones like NFS). while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (work) err: '); do @@ -152,6 +164,8 @@ function respond { # Produces lines of filenames to upload on stdout log "Running ${url} (${singlejobid}), logging into ${logname}" ( + taint_block "launching job ${singlejobid}" + timeout --signal=INT "${timeout}" \ codearchiver --verbose --write-artefacts-fd-3 "${url}" \ > >(log_loop "codearchiver ${singlejobid} out: ") \ @@ -170,14 +184,18 @@ function respond { logname="${logname}.zst" fi - # Move everything but the log file to ./failed/ if codearchiver exited non-zero + # Verify that there are no artefacts if codearchiver exited non-zero + # Since codearchiver handles errors internally normally, this should not usually happen, but it could occur e.g. if running out of disk space and leaving partial files in the storage. + # With parallelism, this could in theory lead to artefacts of a successful run depending on artefacts from a failed run, which we wouldn't want. + # So, if there are artefacts of a failed process, touch the .tainted file to stop the uploader and new processes starting and send a warning to IRC. + # Emit the log filename for upload always (even on tainted storage), artefacts list and artefacts only on zero exit. readarray -t artefacts <"${artefactsname}" - if [[ "${status}" -ne 0 ]]; then - msg="$(printf 'Moving artefact files'; printf ' %q' "${artefacts[@]}" "${artefactsname}"; printf ' from non-zero exit for job %s to ./failed/\n' "${singlejobid}";)" + if [[ "${status}" -ne 0 && "${#artefacts[@]}" -ne 0 ]]; then + touch '.tainted' + send "Job ${singlejobid} exited non-zero but left artefacts behind!" + msg="$(printf 'Artefact files by non-zero exit process: '; printf ' %q' "${artefacts[@]}")" log "${msg}" - mkdir --parents ./failed/ - mv --verbose -- "${artefacts[@]}" "${artefactsname}" ./failed/ 2> >(log_loop 'mv err: ') | log_loop 'mv out: ' - else + elif [[ "${status}" -eq 0 ]]; then for file in "${artefacts[@]}"; do printf '%s\n' "${file}" done @@ -217,6 +235,8 @@ function respond { # Record SHA-256 hashes for new files sha256sum "${filenames[@]}" > >(log_loop 'sha256sum: ') + taint_block 'uploading anything' + # Upload date="$(date '+%Y-%m-%d')" identifier="codearchiver_${date//-/}" @@ -228,6 +248,8 @@ function respond { fi uploadsfine=y for f in "${filenames[@]}"; do + taint_block "starting upload for $(printf '%q' "${f}")" + log "Uploading $(printf '%q' "${f}") to ${identifier}" ia-upload-stream --no-derive "${identifier}" "${f}" \ "collection:${collection}" \ @@ -258,6 +280,8 @@ function respond { sleep 60 done + taint_block 'removing any files after upload' + # Replace non-metadata files with a symlink to .uploaded dummy file # No locking with codearchiver processes is necessary because those will only read metadata (which is left alone) or write files. # However, a lock with the log filename finding is required.