From 50769d4d231d2213a437c989ecdfcacc2fb5c4fe Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 29 Mar 2023 07:02:18 +0000 Subject: [PATCH] Parallel codearchiver processes --- README.md | 1 + codearchiver-bot | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e15fba8..25ae2f1 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Configuration happens via environment variables: * `IA_S3_ACCESS` and `IA_S3_SECRET`: authentication for IA * `CODEARCHIVER_BOT_TEST` (optional): enables test mode when set to any non-empty value, with uploads going into items prefixed with `test_` and placed into `test_collection`. * `CODEARCHIVER_BOT_TIMEOUT` (optional): number of seconds how long a `codearchiver` command may run. Default: unlimited (0) +* `CODEARCHIVER_BOT_NPROC` (optional): number of parallel `codearchiver` processes. Default: 1 The data produced by `codearchiver-bot` must be kept in its working directory for correct deduplication. However, there's nothing unique there; all data is uploaded to IA continuously, and operation can be restored from there by downloading all `*_codearchiver_metadata.txt` files and creating placeholders (e.g. symlinks to `.uploaded` as the script does by default) for everything else. diff --git a/codearchiver-bot b/codearchiver-bot index baf36bb..221d6cd 100755 --- a/codearchiver-bot +++ b/codearchiver-bot @@ -11,6 +11,8 @@ done # Optional env variables declare -i timeout="${CODEARCHIVER_BOT_TIMEOUT:-0}" +declare -i nproclimit="${CODEARCHIVER_BOT_NPROC:-1}" +declare -i nproc=0 for dep in awk codearchiver curl ia-upload-stream python3 sha256sum tee zstd; do if ! command -v "${dep}" &>/dev/null; then @@ -136,6 +138,13 @@ function taint_block { continue fi + # Block until there's a free slot + while [[ "${nproc}" -ge "${nproclimit}" ]]; do + # Wait for one subshell to exit + wait -n + nproc+=-1 + done + taint_block 'continuing with work loop' # Find nonexistent filename for log file with lock @@ -216,7 +225,7 @@ function taint_block { fi fi ) & - wait + nproc+=1 done | # Upload