diff --git a/codearchiver-bot b/codearchiver-bot
index 05db19a..dd2d61d 100755
--- a/codearchiver-bot
+++ b/codearchiver-bot
@@ -113,52 +113,50 @@ function respond {
 	done |
 
 	# The actual work loop
-	while :; do
-		# Process in batches for efficiency of parallel IA processing after uploads
-		declare -a batch=()
-		while read -r -t 1 line; do
-			batch+=("${line}")
-		done
+	while IFS= read -r line; do
+		singlejobid="${line%% *}"
+		line="${line#* }"
+		nick="${line%% *}"
+		url="${line#* }"
 
-		if [[ ${#batch[@]} -eq 0 ]]; then
+		# Handle marker for end of list job: tell the user it's done and move on.
+		if [[ "${url}" == 'end' ]]; then
+			# No status code reflection here because the start of the list job might not even have been in this batch.
+			respond "${nick}" "Job ${singlejobid} finished."
 			continue
 		fi
 
-		newfiles=()
-
-		for ((i=0; i<${#batch[@]}; ++i)); do
-			line="${batch[${i}]}"
-			singlejobid="${line%% *}"
-			line="${line#* }"
-			nick="${line%% *}"
-			url="${line#* }"
-
-			# Handle marker for end of list job: tell the user it's done and move on.
-			if [[ "${url}" == 'end' ]]; then
-				# No status code reflection here because the start of the list job might not even have been in this batch.
-				respond "${nick}" "Job ${singlejobid} finished."
-				continue
-			fi
-
-			# Find nonexistent filename for log file (*not* concurrency-safe!)
-			logbasename="$(date +%Y%m%dT%H%M%SZ)_${singlejobid}"
-			if [[ -e "${logbasename}_codearchiver.log" || -e "${logbasename}_codearchiver.log.zst" ]]; then
-				for ((j=0; ; ++j)); do
-					if [[ ! -e "${logbasename}_coll${j}_codearchiver.log" || -e "${logbasename}_coll${j}_codearchiver.log.zst" ]]; then
-						break
-					fi
-				done
-				logbasename="${logbasename}_coll${j}"
-			fi
-			logname="${logbasename}_codearchiver.log"
-			artefactsname="${logbasename}_codearchiver_artefacts.txt"
+		# Find nonexistent filename for log file with lock
+		# mkdir is pretty much always atomic; creating files might not be, depending on the underlying file system (e.g. networked ones like NFS).
+		while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (work) err: '); do
+			sleep 1
+		done
+		trap 'rmdir ".loglock"' EXIT # Unlock if something below fails
+		logbasename="$(date +%Y%m%dT%H%M%SZ)_${singlejobid}"
+		if [[ -e "${logbasename}_codearchiver.log" || -e "${logbasename}_codearchiver.log.zst" ]]; then
+			for ((j=0; ; ++j)); do
+				if [[ ! -e "${logbasename}_coll${j}_codearchiver.log" || -e "${logbasename}_coll${j}_codearchiver.log.zst" ]]; then
+					break
+				fi
+			done
+			logbasename="${logbasename}_coll${j}"
+		fi
+		logname="${logbasename}_codearchiver.log"
+		artefactsname="${logbasename}_codearchiver_artefacts.txt"
+		# Create the log file already in case spawning the tee process for it below is too slow
+		touch "${logname}"
+		trap - EXIT # Reset trap
+		rmdir '.loglock' # Unlock
 
-			# Run codearchiver, duplicating WARNINGs and higher in the bot output
-			log "Running ${url} (${singlejobid}), logging into ${logname}"
+		# Run codearchiver in a background shell, duplicating WARNINGs and higher in the bot output
+		# Produces lines of filenames to upload on stdout
+		log "Running ${url} (${singlejobid}), logging into ${logname}"
+		(
 			timeout --signal=INT "${timeout}" \
 				codearchiver --verbose --write-artefacts-fd-3 "${url}" \
-				2> >(tee "${logname}" | grep -Fv -e ' INFO ' | log_loop "From codearchiver ${singlejobid}: ") \
-				3> >(tee "${artefactsname}" | log_loop "New artefacts from codearchiver ${singlejobid}: ")
+				> >(log_loop "codearchiver ${singlejobid} out: ") \
+				2> >(tee "${logname}" | grep -Fv -e ' INFO ' | log_loop "codearchiver ${singlejobid} err: ") \
+				3> >(tee "${artefactsname}" | log_loop "Artefact from codearchiver ${singlejobid}: ")
 			status="$?"
 			log "codearchiver ${url} finished with status code ${status}"
 			#TODO Integrate this into the pipe from codearchiver above to avoid rereading the entire log file
@@ -181,11 +179,11 @@ function respond {
 				mv --verbose -- "${artefacts[@]}" "${artefactsname}" ./failed/ 2> >(log_loop 'mv err: ') | log_loop 'mv out: '
 			else
 				for file in "${artefacts[@]}"; do
-					newfiles+=("${file}")
+					printf '%s\n' "${file}"
 				done
-				newfiles+=("${artefactsname}")
+				printf '%s\n' "${artefactsname}"
 			fi
-			newfiles+=("${logname}")
+			printf '%s\n' "${logname}"
 
 			# For individual jobs, tell the user about warnings and success/failure
 			if [[ "${singlejobid}" != *_* ]]; then
@@ -199,11 +197,23 @@ function respond {
 					respond "${nick}" "Job ${singlejobid} produced ${badcount} warnings or errors."
 				fi
 			fi
+		) &
+		wait
+	done |
+
+	# Upload
+	while :; do
+		# Process in batches for efficiency of parallel IA upload processing
+		declare -a filenames=()
+		while read -r -t 1 filename; do
+			filenames+=("${filename}")
 		done
+		if [[ ${#filenames[@]} -eq 0 ]]; then
+			continue
+		fi
 
 		# Record SHA-256 hashes for new files
-		log "SHA-256 hashes:"
-		sha256sum "${newfiles[@]}" > >(log_loop 'sha256sum: ')
+		sha256sum "${filenames[@]}" > >(log_loop 'sha256sum: ')
 
 		# Upload
 		date="$(date '+%Y-%m-%d')"
@@ -215,7 +225,7 @@ function respond {
 			collection='test_collection'
 		fi
 		uploadsfine=y
-		for f in "${newfiles[@]}"; do
+		for f in "${filenames[@]}"; do
 			ia-upload-stream --no-derive "${identifier}" "${f}" \
 				"collection:${collection}" \
 				'mediatype:software' \
@@ -231,24 +241,34 @@ function respond {
 			fi
 		done
 
-		if [[ "${uploadsfine}" ]]; then
-			# Wait until all tasks for the item are done
-			while :; do
-				tasks="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o["value"]["summary"].values()))' < <({ curl --silent --verbose --fail --max-time 10 --header "Authorization: LOW ${IA_S3_ACCESS}:${IA_S3_SECRET}" "https://archive.org/services/tasks.php?identifier=${identifier}&summary=1&history=0" 2> >(log_loop 'curl IA tasks err: '); } | tee >(log_loop 'curl IA tasks out: ')))"
-				if [[ "${tasks}" == '0' ]]; then
-					break
-				fi
-				sleep 60
-			done
-
-			# Replace non-metadata files with a symlink to .uploaded dummy file
-			touch '.uploaded'
-			for f in "${newfiles[@]}"; do
-				if [[ "${f}" != *_codearchiver_metadata.txt ]]; then
-					log "Replacing ${f} with symlink to .uploaded"
-					{ rm --verbose -- "${f}" && ln --symbolic --verbose '.uploaded' "${f}"; } |& log_loop 'rm/ln: '
-				fi
-			done
+		if [[ -z "${uploadsfine}" ]]; then
+			continue
 		fi
+
+		# Wait until all tasks for the item are done
+		while :; do
+			tasks="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o["value"]["summary"].values()))' < <({ curl --silent --verbose --fail --max-time 10 --header "Authorization: LOW ${IA_S3_ACCESS}:${IA_S3_SECRET}" "https://archive.org/services/tasks.php?identifier=${identifier}&summary=1&history=0" 2> >(log_loop 'curl IA tasks err: '); } | tee >(log_loop 'curl IA tasks out: ')))"
+			if [[ "${tasks}" == '0' ]]; then
+				break
+			fi
+			sleep 60
+		done
+
+		# Replace non-metadata files with a symlink to .uploaded dummy file
+		# No locking with codearchiver processes is necessary because those will only read metadata (which is left alone) or write files.
+		# However, a lock with the log filename finding is required.
+		while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (upload) err: '); do
+			sleep 1.1 # Slightly longer than above to minimise repeated collisions
+		done
+		trap 'rmdir ".loglock"' EXIT
+		touch '.uploaded'
+		for f in "${filenames[@]}"; do
+			if [[ "${f}" != *_codearchiver_metadata.txt ]]; then
+				log "Replacing ${f} with symlink to .uploaded"
+				{ rm --verbose -- "${f}" && ln --symbolic --verbose '.uploaded' "${f}"; } |& log_loop 'rm/ln: '
+			fi
+		done
+		trap - EXIT
+		rmdir '.loglock'
 	done
 }
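
For reference, the locking pattern the patch introduces can be exercised on its own. Below is a minimal sketch of the mkdir-based mutex, assuming only coreutils; the echo stands in for the critical section (the log filename search in the patch):

#!/bin/bash
# Sketch: mkdir as a mutex. mkdir either creates the directory or fails
# because it already exists, and it does so atomically even on file
# systems where exclusive *file* creation is unreliable (e.g. NFS).

lockdir='.loglock'  # same lock directory name as the patch uses

while ! mkdir "${lockdir}" 2> /dev/null; do
	sleep 1  # another process holds the lock; retry
done
trap 'rmdir "${lockdir}"' EXIT  # release the lock if anything below fails

# Critical section: at most one process runs this at a time.
echo "lock held by PID $$"

trap - EXIT         # reset the trap before the normal release
rmdir "${lockdir}"  # release

Pairing the trap with an explicit rmdir mirrors the patch: the trap covers error paths, and resetting it before the final rmdir avoids attempting to remove the lock twice.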
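
Structurally, the patch turns the single batch loop into a pipeline: a reader parses "singlejobid nick url" lines, a per-job stage prints the names of the files it produced on stdout, and a downstream loop consumes those names for upload. A toy reduction of that shape, with a hypothetical archive_one standing in for the codearchiver invocation:

#!/bin/bash
# Toy reduction of the new pipeline shape: per-job work emits one
# filename per line on stdout; a downstream loop consumes them.

archive_one() {  # hypothetical stand-in for running codearchiver
	local url="$1"
	local logname="demo_$(date +%Y%m%dT%H%M%SZ).log"
	touch "${logname}"
	printf '%s\n' "${logname}"  # filenames to upload go to stdout
}

printf '%s\n' '1 alice https://example.org/repo.git' |
while IFS= read -r line; do
	singlejobid="${line%% *}"; line="${line#* }"
	nick="${line%% *}"; url="${line#* }"
	echo "job ${singlejobid} for ${nick}: ${url}" >&2
	# Run the job in a background subshell and wait for it, as the patch
	# does; unlike a foreground command, 'wait' lets the shell react to
	# signals while the job is running.
	( archive_one "${url}" ) &
	wait
done |
while IFS= read -r filename; do
	echo "would upload: ${filename}"
done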
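
The uploader's batching hinges on read -r -t 1 returning non-zero when no line arrives within a second, so a batch is simply whatever accumulated before the producer went quiet. A self-contained sketch, with a hypothetical process_batch in place of the hashing and upload steps:

#!/bin/bash
# Sketch: collect stdin lines into batches, cutting a batch after one
# second of silence. read -t returns non-zero on timeout (and on EOF).

process_batch() {  # hypothetical stand-in for sha256sum + ia-upload-stream
	printf 'batch of %d file(s): %s\n' "$#" "$*"
}

printf '%s\n' a.log b.log c.txt | while :; do
	declare -a filenames=()
	while read -r -t 1 filename; do
		filenames+=("${filename}")
	done
	if [[ ${#filenames[@]} -eq 0 ]]; then
		break  # stdin exhausted here; the bot 'continue's on its long-lived pipe instead
	fi
	process_batch "${filenames[@]}"
done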
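
The post-upload wait polls the Internet Archive tasks API until the item's task summary reaches zero. The endpoint, query parameters, credential variables, and JSON shape below are taken from the patch itself; only the identifier value is a made-up example:

#!/bin/bash
# Sketch: block until archive.org reports no outstanding tasks for an
# item. summary=1&history=0 asks for current task counts only, and the
# Python one-liner sums them, as the patch does.

identifier='example-item-2023'  # hypothetical item identifier

while :; do
	tasks="$(curl --silent --fail --max-time 10 \
		--header "Authorization: LOW ${IA_S3_ACCESS}:${IA_S3_SECRET}" \
		"https://archive.org/services/tasks.php?identifier=${identifier}&summary=1&history=0" \
		| python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o["value"]["summary"].values()))')"
	if [[ "${tasks}" == '0' ]]; then
		break  # no queued or running tasks remain
	fi
	sleep 60  # poll gently; task queues can take a while to drain
done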