Browse Source

Fix issues around the output directory and ensure pipeline shuts down container when STOP is issued.

master
Roelf Wichertjes 2 years ago
parent
commit
0bb647c658
Signed by: rewby GPG Key ID: 4C2B6D2972EE5423
5 changed files with 59 additions and 11 deletions
  1. +5
    -2
      Dockerfile
  2. +1
    -0
      docker-compose.yml
  3. +19
    -9
      entrypoint.sh
  4. +20
    -0
      stager.sh
  5. +14
    -0
      wait-empty.sh

+ 5
- 2
Dockerfile View File

@@ -8,7 +8,6 @@ RUN mkdir -p tcp_closer/src/build
WORKDIR /tcp_closer/src/build
RUN cmake ..
RUN make package
RUN find . -type f

FROM python:3.6-stretch
RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \
@@ -34,8 +33,10 @@ RUN pip install websockets requests
RUN curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl

# Setup output directory
RUN mkdir /data
RUN mkdir -p /data
RUN mkdir -p /local-staging
RUN chown ab:ab /data
RUN chown ab:ab /local-staging
RUN chmod a+rx /usr/local/bin/youtube-dl

# Install tini
@@ -71,6 +72,8 @@ RUN pip install -r pipeline/requirements.txt

# Copy in entrypoint
COPY entrypoint.sh /entrypoint.sh
COPY stager.sh /stager.sh
COPY wait-empty.sh /wait-empty.sh

# Button up image
USER ab


+ 1
- 0
docker-compose.yml View File

@@ -49,6 +49,7 @@ services:
pipeline:
build: .
command: "pipeline"
privileged: true
volumes:
- warcs:/data
environment:


+ 19
- 9
entrypoint.sh View File

@@ -1,6 +1,9 @@
#!/bin/bash
set -exuo pipefail

# https://stackoverflow.com/a/2173421
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT

if [ -z ${COUCHDB_URL+x} ]; then
echo "Skipping couchdb init"
else
@@ -13,21 +16,22 @@ else
fi

pushd "db/design_docs"
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/_users
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot_logs
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/_users"
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot"
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot_logs"
grep -v _rev archive_urls.json > /tmp/archive_urls.json
grep -v _rev ignore_patterns.json > /tmp/ignore_patterns.json
grep -v _rev jobs.json > /tmp/jobs.json
grep -v _rev user_agents.json > /tmp/user_agents.json
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/archive_urls -d @/tmp/archive_urls.json
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/ignore_patterns -d @/tmp/ignore_patterns.json
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/jobs -d @/tmp/jobs.json
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/user_agents -d @/tmp/user_agents.json
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/archive_urls" -d @/tmp/archive_urls.json
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/ignore_patterns" -d @/tmp/ignore_patterns.json
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/jobs" -d @/tmp/jobs.json
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/user_agents" -d @/tmp/user_agents.json
popd
fi

export FINISHED_WARCS_DIR=${FINISHED_WARCS_DIR:-/data/}
export SHARED_WARCS_DIR="${SHARED_WARCS_DIR:-/data/}"
mkdir -pv "$SHARED_WARCS_DIR/upload-queue/"

case "$1" in
"bot")
@@ -71,13 +75,19 @@ case "$1" in
export PIPELINE_NAME="${PIPELINE_NAME:-${PIPELINE_PREFIX}-$(hostname -s)}"
export PIPELINE_NAME="${PIPELINE_NAME:0:30}"
export NO_SCREEN=1
export STAGING_WARCS_DIR="$(mktemp -p "$SHARED_WARCS_DIR" -d staging-XXXXXXXXXX)"
export FINISHED_WARCS_DIR="/local-staging/"
sudo /usr/sbin/tcp-closer -4 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
sudo /usr/sbin/tcp-closer -6 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
/stager.sh "$FINISHED_WARCS_DIR" "$STAGING_WARCS_DIR" "$SHARED_WARCS_DIR/upload-queue/" &
run-pipeline3 pipeline.py --disable-web-server \
--concurrent $PIPELINE_CONCURRENT $PIPELINE_NAME
/wait-empty.sh "$FINISHED_WARCS_DIR"
/wait-empty.sh "$STAGING_WARCS_DIR"
rmdir "$STAGING_WARCS_DIR"
;;
"uploader")
python ./uploader/uploader.py $FINISHED_WARCS_DIR
python ./uploader/uploader.py "$SHARED_WARCS_DIR/upload-queue/"
;;
"analyzer")
export UPDATES_CHANNEL=updates


+ 20
- 0
stager.sh View File

@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Continuously drain files from INPUT into OUTPUT by way of STAGING.
#
# Usage: stager.sh INPUT STAGING OUTPUT
#   INPUT    directory that is polled for files to move
#   STAGING  intermediate directory the files are moved into first
#   OUTPUT   directory the files are finally moved into
#
# The loop never terminates on its own; the script runs until it is killed.
set -euo pipefail

# Refuse missing or empty arguments up front: an empty STAGING or OUTPUT would
# otherwise turn the mv targets below into "/".
INPUT="${1:?usage: stager.sh INPUT STAGING OUTPUT}"
STAGING="${2:?usage: stager.sh INPUT STAGING OUTPUT}"
OUTPUT="${3:?usage: stager.sh INPUT STAGING OUTPUT}"

# Print the number of regular files found anywhere below directory $1.
count_files() {
    find "$1" -type f | wc -l
}

while true; do
    echo "Attempting to move files..."
    COUNT=$(count_files "$INPUT")
    if [[ $COUNT -ne 0 ]]; then
        echo "Found $COUNT files!"
        echo "Moving from input to staging..."
        # Best effort: a failed move is retried on the next pass.
        mv -v "$INPUT"/* "$STAGING/" || true
    fi

    # STAGING is checked independently of INPUT so that files left behind by a
    # previously failed move are retried even when INPUT has since gone empty.
    # Otherwise they would sit in STAGING forever and anything waiting for
    # STAGING to drain would never finish.
    if [[ $(count_files "$STAGING") -ne 0 ]]; then
        # NOTE(review): the move is only atomic when STAGING and OUTPUT live on
        # the same filesystem -- confirm against the caller.
        echo "Atomically moving from staging to output..."
        mv -v "$STAGING"/* "$OUTPUT/" || true
    fi

    sleep 5
done


+ 14
- 0
wait-empty.sh View File

@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Block until the directory passed as $1 no longer contains any regular files.
#
# Usage: wait-empty.sh DIRECTORY
#
# The directory is searched recursively and re-checked every 5 seconds for as
# long as at least one regular file is found.

while :; do
    remaining=$(find "$1" -type f | wc -l)

    # Nothing left: report and stop polling.
    if [ $remaining -eq 0 ]; then
        echo "Directory $1 is empty!"
        break
    fi

    echo "Directory $1 is not empty ($remaining files remaining). Waiting 5 seconds..."
    sleep 5
done


Loading…
Cancel
Save