From 0bb647c658f0331b9bd8035ff50b146b6b94e3cc Mon Sep 17 00:00:00 2001
From: Roelf Wichertjes
Date: Mon, 28 Feb 2022 12:12:35 +0100
Subject: [PATCH] Fix issues around the output directory and ensure pipeline shuts down container when STOP is issued.

---
Review notes (ignored by git am / git apply, not part of the commit message):
 * stager.sh: the staging -> output move is no longer gated on the input
   directory being non-empty, so entries left in staging by a failed move are
   retried instead of blocking wait-empty.sh forever. The find|wc count, which
   disagreed with the "*" glob actually used for moving, is gone.
 * wait-empty.sh: validates its argument up front instead of reporting a
   missing or unset directory as "empty".
 * The index lines of the two new files were dropped because their blob
   hashes changed; git apply does not need them for new files.
 * Leading whitespace of context lines in the docker-compose.yml and
   entrypoint.sh hunks was reconstructed; if it does not match the target
   tree, apply with: git apply --ignore-whitespace

 Dockerfile         |  7 +++++--
 docker-compose.yml |  1 +
 entrypoint.sh      | 28 +++++++++++++++++++---------
 stager.sh          | 31 +++++++++++++++++++++++++++++++
 wait-empty.sh      | 26 ++++++++++++++++++++++++++
 5 files changed, 82 insertions(+), 11 deletions(-)
 create mode 100755 stager.sh
 create mode 100755 wait-empty.sh

diff --git a/Dockerfile b/Dockerfile
index a7500e4..2530bf5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,7 +8,6 @@ RUN mkdir -p tcp_closer/src/build
 WORKDIR /tcp_closer/src/build
 RUN cmake ..
 RUN make package
-RUN find . -type f
 
 FROM python:3.6-stretch
 RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \
@@ -34,8 +33,10 @@ RUN pip install websockets requests
 RUN curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
 
 # Setup output directory
-RUN mkdir /data
+RUN mkdir -p /data
+RUN mkdir -p /local-staging
 RUN chown ab:ab /data
+RUN chown ab:ab /local-staging
 RUN chmod a+rx /usr/local/bin/youtube-dl
 
 # Install tini
@@ -71,6 +72,8 @@ RUN pip install -r pipeline/requirements.txt
 
 # Copy in entrypoint
 COPY entrypoint.sh /entrypoint.sh
+COPY stager.sh /stager.sh
+COPY wait-empty.sh /wait-empty.sh
 
 # Button up image
 USER ab
diff --git a/docker-compose.yml b/docker-compose.yml
index d658f97..b0dd230 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -49,6 +49,7 @@ services:
   pipeline:
     build: .
     command: "pipeline"
+    privileged: true
     volumes:
       - warcs:/data
     environment:
diff --git a/entrypoint.sh b/entrypoint.sh
index 166d16b..44f60b2 100755
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -1,6 +1,9 @@
 #!/bin/bash
 set -exuo pipefail
 
+# https://stackoverflow.com/a/2173421
+trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT
+
 if [ -z ${COUCHDB_URL+x} ]; then
     echo "Skipping couchdb init"
 else
@@ -13,21 +16,22 @@ else
     fi
 
     pushd "db/design_docs"
-        curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/_users
-        curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot
-        curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot_logs
+        curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/_users"
+        curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot"
+        curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot_logs"
         grep -v _rev archive_urls.json > /tmp/archive_urls.json
         grep -v _rev ignore_patterns.json > /tmp/ignore_patterns.json
         grep -v _rev jobs.json > /tmp/jobs.json
         grep -v _rev user_agents.json > /tmp/user_agents.json
-        curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/archive_urls -d @/tmp/archive_urls.json
-        curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/ignore_patterns -d @/tmp/ignore_patterns.json
-        curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/jobs -d @/tmp/jobs.json
-        curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/user_agents -d @/tmp/user_agents.json
+        curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/archive_urls" -d @/tmp/archive_urls.json
+        curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/ignore_patterns" -d @/tmp/ignore_patterns.json
+        curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/jobs" -d @/tmp/jobs.json
+        curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/user_agents" -d @/tmp/user_agents.json
     popd
 fi
 
-export FINISHED_WARCS_DIR=${FINISHED_WARCS_DIR:-/data/}
+export SHARED_WARCS_DIR="${SHARED_WARCS_DIR:-/data/}"
+mkdir -pv "$SHARED_WARCS_DIR/upload-queue/"
 
 case "$1" in
     "bot")
@@ -71,13 +75,19 @@ case "$1" in
         export PIPELINE_NAME="${PIPELINE_NAME:-${PIPELINE_PREFIX}-$(hostname -s)}"
         export PIPELINE_NAME="${PIPELINE_NAME:0:30}"
         export NO_SCREEN=1
+        export STAGING_WARCS_DIR="$(mktemp -p "$SHARED_WARCS_DIR" -d staging-XXXXXXXXXX)"
+        export FINISHED_WARCS_DIR="/local-staging/"
         sudo /usr/sbin/tcp-closer -4 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
         sudo /usr/sbin/tcp-closer -6 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
+        /stager.sh "$FINISHED_WARCS_DIR" "$STAGING_WARCS_DIR" "$SHARED_WARCS_DIR/upload-queue/" &
         run-pipeline3 pipeline.py --disable-web-server \
             --concurrent $PIPELINE_CONCURRENT $PIPELINE_NAME
+        /wait-empty.sh "$FINISHED_WARCS_DIR"
+        /wait-empty.sh "$STAGING_WARCS_DIR"
+        rmdir "$STAGING_WARCS_DIR"
         ;;
     "uploader")
-        python ./uploader/uploader.py $FINISHED_WARCS_DIR
+        python ./uploader/uploader.py "$SHARED_WARCS_DIR/upload-queue/"
         ;;
     "analyzer")
         export UPDATES_CHANNEL=updates
diff --git a/stager.sh b/stager.sh
new file mode 100755
--- /dev/null
+++ b/stager.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Usage: stager.sh INPUT STAGING OUTPUT
+#
+# Every 5 seconds, moves the contents of INPUT into STAGING and then the
+# contents of STAGING into OUTPUT. STAGING is meant to be on the same
+# filesystem as OUTPUT so that the second move is a rename and readers of
+# OUTPUT never see a partially copied file.
+set -euo pipefail
+shopt -s nullglob
+
+INPUT="${1:?usage: stager.sh INPUT STAGING OUTPUT}"
+STAGING="${2:?usage: stager.sh INPUT STAGING OUTPUT}"
+OUTPUT="${3:?usage: stager.sh INPUT STAGING OUTPUT}"
+
+# Move all (non-hidden) entries of directory $1 into directory $2.
+# A failed move is reported but not fatal; it is retried on the next cycle.
+move_all() {
+  local entries=("$1"/*)
+  (( ${#entries[@]} > 0 )) || return 0
+  echo "Moving ${#entries[@]} entries from $1 to $2..."
+  mv -v -- "${entries[@]}" "$2/" \
+    || echo "Move from $1 to $2 failed; will retry next cycle." >&2
+}
+
+while true; do
+  move_all "$INPUT" "$STAGING"
+  # Not gated on INPUT: entries a failed earlier cycle left in STAGING are
+  # picked up again here.
+  move_all "$STAGING" "$OUTPUT"
+  sleep 5
+done
diff --git a/wait-empty.sh b/wait-empty.sh
new file mode 100755
--- /dev/null
+++ b/wait-empty.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Usage: wait-empty.sh DIR
+#
+# Blocks until DIR contains no regular files (checked recursively), polling
+# every 5 seconds.
+#
+# pipefail is deliberately not enabled: files may be moved out of DIR while
+# find is scanning it, which can make find exit non-zero, and that must not
+# abort the wait.
+set -eu
+
+DIR="${1:?usage: wait-empty.sh DIR}"
+if [[ ! -d "$DIR" ]]; then
+  echo "wait-empty.sh: $DIR is not a directory" >&2
+  exit 1
+fi
+
+while true; do
+  COUNT=$(find "$DIR" -type f | wc -l)
+  if (( COUNT == 0 )); then
+    echo "Directory $DIR is empty!"
+    break
+  fi
+  echo "Directory $DIR is not empty ($COUNT files remaining). Waiting 5 seconds..."
+  sleep 5
+done