|
|
@@ -1,6 +1,9 @@ |
|
|
|
#!/bin/bash |
|
|
|
# Shell options for the whole script: |
#   -e           exit as soon as a command fails |
#   -x           trace every command (after expansion) to stderr |
#   -u           treat expansion of an unset variable as an error |
#   -o pipefail  a pipeline fails if any of its stages fails |
# NOTE(review): with -x the expanded $COUCHDB_URL and $COUCHDB_CURL_ARGS used |
# further down are echoed to the log - confirm they never carry credentials. |
set -exuo pipefail |
|
|
|
|
|
|
|
# Process-group cleanup idiom, taken from: |
# https://stackoverflow.com/a/2173421 |
|
|
|
# On SIGINT, SIGTERM or shell exit: first reset the SIGTERM handler to its |
# default, then send the default kill signal to -$$, i.e. the process group |
# whose id is this shell's PID. The string is double-quoted, so $$ is |
# expanded once, when the trap is installed. |
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT |
|
|
|
|
|
|
|
# Optional CouchDB initialisation. |
# ${COUCHDB_URL+x} expands to "x" when COUCHDB_URL is set (even if empty) and |
# to nothing when it is unset, so the test is true only for an unset variable; |
# in that case initialisation is skipped. |
if [ -z ${COUCHDB_URL+x} ]; then |
|
|
|
echo "Skipping couchdb init" |
|
|
|
else |
|
|
# NOTE(review): the next row is a diff hunk header, not shell. The lines of |
# the else branch between here and the following "fi" are not visible in |
# this view - confirm against the full script. |
@@ -13,21 +16,22 @@ else |
|
|
|
fi |
|
|
|
|
|
|
|
# Work from db/design_docs so the *.json names below resolve relative to it; |
# the matching popd further down restores the previous directory. |
pushd "db/design_docs" |
|
|
|
# Issue HTTP PUT requests for the _users, archivebot and archivebot_logs |
# paths under $COUCHDB_URL (presumably creating those databases - confirm). |
# $COUCHDB_CURL_ARGS is left unquoted, so it is word-split into separate |
# curl arguments. |
# NOTE(review): the three unquoted-URL commands and the three quoted-URL |
# commands that follow differ only in quoting - they look like the before |
# and after sides of a diff. Confirm only one set is meant to run. |
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/_users |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot_logs |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/_users" |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot" |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot_logs" |
|
|
|
# Write a copy of each design document to /tmp with every line containing |
# "_rev" removed (presumably so the upload carries no stale revision id - |
# confirm). The /tmp file names are fixed, not generated by mktemp. |
grep -v _rev archive_urls.json > /tmp/archive_urls.json |
|
|
|
grep -v _rev ignore_patterns.json > /tmp/ignore_patterns.json |
|
|
|
grep -v _rev jobs.json > /tmp/jobs.json |
|
|
|
grep -v _rev user_agents.json > /tmp/user_agents.json |
|
|
|
# PUT each filtered file to archivebot/_design/<name>, sending the file |
# contents as the request body (-d @file). |
# NOTE(review): as above, the four unquoted-URL commands and the four |
# quoted-URL commands look like the two sides of a diff - confirm which set |
# is live. |
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/archive_urls -d @/tmp/archive_urls.json |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/ignore_patterns -d @/tmp/ignore_patterns.json |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/jobs -d @/tmp/jobs.json |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/user_agents -d @/tmp/user_agents.json |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/archive_urls" -d @/tmp/archive_urls.json |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/ignore_patterns" -d @/tmp/ignore_patterns.json |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/jobs" -d @/tmp/jobs.json |
|
|
|
curl -s $COUCHDB_CURL_ARGS -X PUT "$COUCHDB_URL/archivebot/_design/user_agents" -d @/tmp/user_agents.json |
|
|
|
# Return to the directory that was current before the pushd. |
popd |
|
|
|
fi |
|
|
|
|
|
|
|
# Export the WARC directory variables, falling back to /data/ when the |
# variable is unset or empty (the ":-" form covers both cases). |
# NOTE(review): the FINISHED_WARCS_DIR and SHARED_WARCS_DIR lines look like |
# the before/after sides of a rename in a diff - confirm whether both |
# exports are intended. |
export FINISHED_WARCS_DIR=${FINISHED_WARCS_DIR:-/data/} |
|
|
|
export SHARED_WARCS_DIR="${SHARED_WARCS_DIR:-/data/}" |
|
|
|
# Ensure the upload queue directory exists: -p creates missing parents and |
# does not fail if it is already there, -v prints each directory created. |
mkdir -pv "$SHARED_WARCS_DIR/upload-queue/" |
|
|
|
|
|
|
|
case "$1" in |
|
|
|
# Arm taken when the first script argument is "bot": sets up the environment, |
# starts the background helpers and runs the pipeline in the foreground. |
"bot") |
|
|
# NOTE(review): the next row is a diff hunk header, not shell. Lines of this |
# arm between the pattern above and the exports below are not visible in |
# this view - confirm against the full script. |
@@ -71,13 +75,19 @@ case "$1" in |
|
|
|
# Default the pipeline name to "<PIPELINE_PREFIX>-<short hostname>" when |
# PIPELINE_NAME is unset or empty, then keep only its first 30 characters. |
export PIPELINE_NAME="${PIPELINE_NAME:-${PIPELINE_PREFIX}-$(hostname -s)}" |
|
|
|
export PIPELINE_NAME="${PIPELINE_NAME:0:30}" |
|
|
|
# Exported for child processes; its consumer is not visible here |
# (presumably read by the pipeline runner - TODO confirm). |
export NO_SCREEN=1 |
|
|
|
# Create a uniquely named staging-XXXXXXXXXX directory under |
# $SHARED_WARCS_DIR and export its path. |
export STAGING_WARCS_DIR="$(mktemp -p "$SHARED_WARCS_DIR" -d staging-XXXXXXXXXX)" |
|
|
|
# In this mode FINISHED_WARCS_DIR is overridden to /local-staging/, |
# replacing any value exported earlier in the script. |
export FINISHED_WARCS_DIR="/local-staging/" |
|
|
|
# Start two background tcp-closer processes for destination port 443, one |
# with -4 and one with -6 (presumably IPv4 and IPv6 - confirm). |
# NOTE(review): the units of --idle_time / --last_recv_limit / --interval |
# are not stated here; the values would be about 6 h and 12 h if they are |
# milliseconds - TODO confirm against the tcp-closer documentation. |
sudo /usr/sbin/tcp-closer -4 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 & |
|
|
|
sudo /usr/sbin/tcp-closer -6 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 & |
|
|
|
# Start /stager.sh in the background with the finished, staging and |
# upload-queue directories as arguments (presumably it moves files along |
# that chain - confirm in stager.sh). |
/stager.sh "$FINISHED_WARCS_DIR" "$STAGING_WARCS_DIR" "$SHARED_WARCS_DIR/upload-queue/" & |
|
|
|
# Run the pipeline in the foreground. The trailing backslash continues the |
# command onto the --concurrent line; $PIPELINE_CONCURRENT and |
# $PIPELINE_NAME are unquoted and therefore subject to word splitting. |
run-pipeline3 pipeline.py --disable-web-server \ |
|
|
|
--concurrent $PIPELINE_CONCURRENT $PIPELINE_NAME |
|
|
|
# After the pipeline exits, run /wait-empty.sh on the finished directory and |
# then on the staging directory (presumably each call blocks until the |
# directory is empty - confirm in wait-empty.sh). |
/wait-empty.sh "$FINISHED_WARCS_DIR" |
|
|
|
/wait-empty.sh "$STAGING_WARCS_DIR" |
|
|
|
# Remove the staging directory; rmdir only succeeds if it is empty. |
rmdir "$STAGING_WARCS_DIR" |
|
|
|
;; |
|
|
|
# Arm taken when the first script argument is "uploader": runs |
# ./uploader/uploader.py with a directory path as its only argument. |
"uploader") |
|
|
|
# NOTE(review): the two invocations below differ only in the directory |
# passed ($FINISHED_WARCS_DIR unquoted vs. the quoted upload-queue path) and |
# look like the before/after sides of a diff - confirm which one is live. |
python ./uploader/uploader.py $FINISHED_WARCS_DIR |
|
|
|
python ./uploader/uploader.py "$SHARED_WARCS_DIR/upload-queue/" |
|
|
|
;; |
|
|
|
"analyzer") |
|
|
|
export UPDATES_CHANNEL=updates |
|
|
|