From b6f0cf2217126e8a3fbe54ea4c5b84e0e85cbe05 Mon Sep 17 00:00:00 2001 From: Roelf Wichertjes Date: Thu, 24 Mar 2022 15:40:34 +0100 Subject: [PATCH] Convert mover into a chunker. --- Dockerfile | 12 ++++++------ entrypoint.sh | 13 +++++++++---- mover.sh | 25 ++++++++++++++++++++++--- uploader.sh | 1 - 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8fa08ef..05bd0f1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,9 @@ ENV TINI_VERSION v0.19.0 ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini RUN chmod +x /tini -RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \ - build-essential dpkg-dev devscripts cmake git python3 python3-dev python3-pip \ - && rm -rf /var/lib/apt/lists/* +#RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \ +# build-essential dpkg-dev devscripts cmake git python3 python3-dev python3-pip \ +# && rm -rf /var/lib/apt/lists/* # Create data mount RUN mkdir -p /data @@ -18,9 +18,9 @@ WORKDIR / COPY mover.sh /mover.sh COPY entrypoint.sh /entrypoint.sh COPY uploader.sh /uploader.sh -ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/pipeline/requirements.txt /requirements.txt -ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/uploader/uploader.py /uploader.py +#ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/pipeline/requirements.txt /requirements.txt +#ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/uploader/uploader.py /uploader.py -RUN pip3 install -r /requirements.txt +#RUN pip3 install -r /requirements.txt ENTRYPOINT [ "/tini", "--", "/entrypoint.sh" ] diff --git a/entrypoint.sh b/entrypoint.sh index 40dac49..d98febd 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -7,11 +7,16 @@ trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT export SHARED_WARCS_DIR="${SHARED_WARCS_DIR:-/data/}" export INCOMING="${INCOMING:-${SHARED_WARCS_DIR}/incoming/}" export UPLOAD_QUEUE="${UPLOAD_QUEUE:-${SHARED_WARCS_DIR}/upload-queue/}" +export MOVER_WORKING_DIR="${MOVER_WORKING_DIR:-${SHARED_WARCS_DIR}/mover-work/}" -mkdir -pv "${INCOMING}" -chown nobody:nogroup "${INCOMING}" -mkdir -pv "${UPLOAD_QUEUE}" -chown nobody:nogroup "${UPLOAD_QUEUE}" +function makedir() { + mkdir -pv "$1" + chown nobody:nogroup "$1" +} + +makedir "${INCOMING}" +makedir "${UPLOAD_QUEUE}" +makedir "${MOVER_WORKING_DIR}" case "$1" in "mover") diff --git a/mover.sh b/mover.sh index 9f4f2aa..e2ed1b8 100755 --- a/mover.sh +++ b/mover.sh @@ -3,10 +3,19 @@ set -euo pipefail export INCOMING="${INCOMING:-/data/incoming/}" +export MOVER_WORKING_DIR="${MOVER_WORKING_DIR:-/data/mover-work/}" export UPLOAD_QUEUE="${UPLOAD_QUEUE:-/data/upload-queue/}" +export MEGABYTES_PER_CHUNK="${MEGABYTES_PER_CHUNK:-$((1024*500))}" +export BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK)) + +mkdir -p "${MOVER_WORKING_DIR}" +mkdir -p "${UPLOAD_QUEUE}" +mkdir -p "${INCOMING}" while [[ 1 ]] ; do - # find every .warc.gz in the rsync directory + mkdir -p "${MOVER_WORKING_DIR}/current" + export cur_size=$( du -B1 -s "${MOVER_WORKING_DIR}/current" | grep -oE "^[0-9]+" ) + find "${INCOMING}" -type f -not -name ".*"\ | while read filename do @@ -15,9 +24,19 @@ while [[ 1 ]] ; do then continue fi + echo "Moving ${filename}" - mkdir -vp "${UPLOAD_QUEUE}" - mv -v "${filename}" "${UPLOAD_QUEUE}/" + export cur_size=$((cur_size + $( du -B1 -s "${filename}" | grep -oE "^[0-9]+" ))) + mkdir -p "${MOVER_WORKING_DIR}/current" + mv -v "${filename}" "${MOVER_WORKING_DIR}/current/" + + if [[ "${cur_size}" -gt "${BYTES_PER_CHUNK}" ]]; then + timestamp=$( date +'%Y%m%d%H%M%S' ) + uuid=$(cat /proc/sys/kernel/random/uuid | cut -d- -f1) + echo "Current archive is full, moving to ${timestamp}_${uuid}." + mv "${MOVER_WORKING_DIR}/current" "${UPLOAD_QUEUE}/${timestamp}_${uuid}" + export cur_size=0 + fi done echo "Sleeping 30 seconds..." diff --git a/uploader.sh b/uploader.sh index 8370d50..7ec7d2f 100755 --- a/uploader.sh +++ b/uploader.sh @@ -1,3 +1,2 @@ #!/bin/bash set -exuo pipefail -exec python3 -u /uploader.py "${UPLOAD_QUEUE}"