commit e42f241a00798ac3aabc2651e9afd198436b895b
Author: arkiver
Date:   Mon Feb 27 01:37:50 2023 +0100

    initial

diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 0000000..fc6c019
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,18 @@
+---
+kind: pipeline
+name: default
+steps:
+- name: docker
+  image: plugins/docker
+  settings:
+    registry: atdr-writer.meo.ws
+    username:
+      from_secret: atdr_user
+    password:
+      from_secret: atdr_pass
+    repo: atdr-writer.meo.ws/archiveteam/archiveteam-tar-uploader
+    dockerfile: Dockerfile
+    purge: true
+    auto_tag: false
+    tags:
+    - latest
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b25c15b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*~
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d12e82a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,12 @@
+FROM debian:stretch-slim
+RUN echo 'deb http://ftp.de.debian.org/debian buster-backports main' >> /etc/apt/sources.list
+RUN DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io update \
+ && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io install python python2.7 rsync git ca-certificates curl python-pip \
+ && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io -t buster-backports install zstd libzstd-dev libzstd1 \
+ && pip install zstandard && pip install requests
+COPY * factory/
+RUN rm -rf /factory/megawarc && git clone https://github.com/archiveteam/megawarc.git /factory/megawarc
+WORKDIR /factory
+COPY docker-boot.sh /
+RUN chmod +x /docker-boot.sh
+ENTRYPOINT ["/docker-boot.sh"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1add16a
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+This is a roughly edited version of https://github.com/ArchiveTeam/archiveteam-megawarc-factory that uploads tar files instead of packing and uploading WARCs.
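
A minimal build-and-run sketch for the image defined above (the image tag, host path, and chunk size are illustrative placeholders; the trailing argument must be one of the modes docker-boot.sh accepts, see below):

    docker build -t archiveteam-tar-uploader .
    docker run -d \
        -e MEGABYTES_PER_CHUNK=25600 \
        -v /srv/tar-uploader:/data \
        archiveteam-tar-uploader chunk
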
diff --git a/chunk-multiple b/chunk-multiple
new file mode 100755
index 0000000..d7357b0
--- /dev/null
+++ b/chunk-multiple
@@ -0,0 +1,21 @@
+#!/bin/bash
+# This loops the chunker script while the RUN file exists.
+# See chunker for details.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+while [[ -f RUN ]]
+do
+  date
+  "${SCRIPT_DIR}/chunker"
+  result="${?}"
+  if [[ "${result}" -ne 0 ]]
+  then
+    date
+    echo "chunker exited with ${result}"
+    exit "${result}"
+  fi
+
+  echo "Sleeping..."
+  sleep 1
+done
+
diff --git a/chunker b/chunker
new file mode 100755
index 0000000..8206480
--- /dev/null
+++ b/chunker
@@ -0,0 +1,58 @@
+#!/bin/bash
+# Move uploaded .tar files to an archive directory.
+# When the archive is large enough, move it to the upload
+# queue and start a new archive.
+#
+# Be careful: this script assumes that any file in the upload directory
+# whose name ends with .tar is a fully uploaded file and can be moved
+# somewhere else. Remember this when running rsync.
+#
+
+INCOMING_UPLOADS_DIR="${1}"   # /home/archiveteam/uploads
+CHUNKER_WORKING_DIR="${2}"    # /home/archiveteam/processed
+PACKING_QUEUE_DIR="${CHUNKER_WORKING_DIR}/archive"
+MEGABYTES_PER_CHUNK=$((1024*25))
+
+# if not specified in command-line arguments
+if [ -z "${INCOMING_UPLOADS_DIR}" ]
+then
+  source ./config.sh || exit 1
+fi
+
+BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK))
+
+mkdir -p "${CHUNKER_WORKING_DIR}" || exit 1
+mkdir -p "${PACKING_QUEUE_DIR}" || exit 1
+
+mkdir -p "${CHUNKER_WORKING_DIR}/current" || exit 1
+cur_size=$( du -B1 -s "${CHUNKER_WORKING_DIR}/current" | grep -oE "^[0-9]+" )
+
+# find every .tar in the upload directory
+find "${INCOMING_UPLOADS_DIR}" -type f -regex ".+\.tar$" \
+| while read -r filename
+do
+  # skip partial uploads
+  if [[ "${filename}" =~ rsync-tmp ]]
+  then
+    continue
+  fi
+
+  cur_size=$((cur_size + $( du -B1 -s "${filename}" | grep -oE "^[0-9]+" )))
+
+  # move to the current/ directory
+  echo "Moving ${filename}"
+  mkdir -p "${CHUNKER_WORKING_DIR}/current"
+  mv "${filename}" "${CHUNKER_WORKING_DIR}/current/"
+
+  # if the current/ directory is large enough, move it to the
+  # upload queue as timestamp_uuid and start a new current/
+  if [[ "${cur_size}" -gt "${BYTES_PER_CHUNK}" ]]
+  then
+    timestamp=$( date +'%Y%m%d%H%M%S' )
+    uuid=$( cat /proc/sys/kernel/random/uuid | cut -d- -f1 )
+    echo "Current archive is full, moving to ${timestamp}_${uuid}."
+    mv "${CHUNKER_WORKING_DIR}/current" "${UPLOAD_QUEUE_DIR}/${timestamp}_${uuid}"
+    cur_size=0
+  fi
+done
+
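
The default chunk size above works out to 25 GiB: MEGABYTES_PER_CHUNK = 1024*25 = 25600 MiB, so BYTES_PER_CHUNK = 1024*1024*25600 = 26843545600 bytes. A standalone invocation sketch, using the example paths from the script's own comments; note that when the arguments are given, config.sh is not sourced, so UPLOAD_QUEUE_DIR must already be set in the environment (the path below is a placeholder):

    export UPLOAD_QUEUE_DIR=/home/archiveteam/upload-queue
    ./chunker /home/archiveteam/uploads /home/archiveteam/processed
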
diff --git a/config.example.sh b/config.example.sh
new file mode 100755
index 0000000..357a8ac
--- /dev/null
+++ b/config.example.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+# Copy this file to config.sh, customise it, and place it in the
+# working directory of the packing and upload scripts.
+
+####################
+# CHUNKER SETTINGS #
+####################
+# start a new chunk when the current chunk is at least this large
+MEGABYTES_PER_CHUNK=$((1024*25))
+
+###################
+# UPLOAD METADATA #
+###################
+# your Archive.org S3 keys
+IA_AUTH="ACCESS_KEY:SECRET"
+
+# the name of the collection to add the uploads to
+IA_COLLECTION="archiveteam_TODO"
+
+# the title of the items (" ${item_timestamp}" will be appended)
+IA_ITEM_TITLE="Archive Team TODO:"
+
+# the prefix of the item name ("${item_timestamp}" is appended)
+IA_ITEM_PREFIX="archiveteam_todo_"
+
+# the prefix of the megawarc filename ("${item_timestamp}" is appended)
+FILE_PREFIX="todo_"
+
+# the date field for the item
+IA_ITEM_DATE=$( date +"%Y-%m" )
+
+# offload items to another rsync storage instead of uploading to IA
+OFFLOAD_TARGET="rsync://somewhere-far-away:portnum/module-name/directory/"
+# It is also possible to create a list of targets; the offloader will pick one at random and retry the others on failure.
+# Simply comment out the line above and put all rsync target URLs, separated by newlines, in a file called "offload_targets".
+
+# the API for requesting the ZSTD dictionaries
+ZST_DICTIONARY_API="API_URL"
+
+###############
+# DIRECTORIES #
+###############
+# Put your directories on one or two filesystems (see README).
+FS1_BASE_DIR="/archiveteam/ssd/project"
+FS2_BASE_DIR="/archiveteam/disk/project"
+
+## THESE DIRECTORIES ON FILESYSTEM 1: for tars
+
+# the rsync upload directory
+# (the chunker will package the .tar files in this directory)
+INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads"
+
+# the chunker working directory
+# (this directory will hold the current in-progress chunk)
+CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work"
+
+# the chunker output directory / the packer queue
+# (this directory will hold the completed chunks)
+PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue"
+
+# the packer working directory - input side
+# (this directory will hold the current chunk)
+PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in"
+
+## THESE DIRECTORIES ON FILESYSTEM 2: for megawarcs
+
+# the packer working directory - megawarc side
+# (this directory will hold the current megawarc)
+PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out"
+
+# the packer output directory / the upload queue
+# (this directory will hold the completed megawarcs)
+UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue"
+
+# the uploader working directory
+# (this directory will hold the current item)
+UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work"
+
+# the final destination for uploaded items
+# leave this empty to remove items after uploading
+COMPLETED_DIR="${FS2_BASE_DIR}/uploaded"
+
+
+# remove these lines once you have customised the config
+echo "config.sh not customised."
+exit 1
+
+
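
A sketch of the optional offload_targets file described above, with one rsync target URL per line (hostnames, ports, and module names are placeholders):

    rsync://offload-a.example.net:873/tar-staging/incoming/
    rsync://offload-b.example.net:873/tar-staging/incoming/
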
diff --git a/docker-boot.sh b/docker-boot.sh
new file mode 100755
index 0000000..2afc811
--- /dev/null
+++ b/docker-boot.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+set -e
+
+test -d /data || {
+  echo "No /data mount found"
+  exit 1
+}
+
+mkdir -p /data/incoming /data/chunker-work /data/packing-queue /data/packer-work-in /data/packer-work-out /data/upload-queue /data/uploader-work
+
+# keep the date command unexpanded so config.sh evaluates it when sourced
+IA_ITEM_DATE_LIT='$( date +"%Y-%m" )'
+cat > /factory/config.sh << EOF
+#!/bin/bash
+MEGABYTES_PER_CHUNK="${MEGABYTES_PER_CHUNK}"
+IA_AUTH="${IA_AUTH}"
+IA_COLLECTION="${IA_COLLECTION}"
+IA_ITEM_TITLE="${IA_ITEM_TITLE}"
+IA_ITEM_PREFIX="${IA_ITEM_PREFIX}"
+FILE_PREFIX="${FILE_PREFIX}"
+IA_ITEM_DATE="${IA_ITEM_DATE_LIT}"
+OFFLOAD_TARGET="${OFFLOAD_TARGET}"
+ZST_DICTIONARY_API="${ZST_DICTIONARY_API}"
+LOAD_BALANCER="${LOAD_BALANCER}"
+INCOMING_UPLOADS_DIR="/data/incoming"
+CHUNKER_WORKING_DIR="/data/chunker-work"
+PACKING_QUEUE_DIR="/data/packing-queue"
+PACKER_WORKING_CHUNKS_DIR="/data/packer-work-in"
+PACKER_WORKING_MEGAWARC_DIR="/data/packer-work-out"
+UPLOAD_QUEUE_DIR="/data/upload-queue"
+UPLOADER_WORKING_DIR="/data/uploader-work"
+COMPLETED_DIR=""
+EOF
+
+touch /factory/RUN
+
+case "${1}" in
+  chunk|chunker|chunk-multiple)
+    if test -z "${MEGABYTES_PER_CHUNK}"; then
+      echo "Missing param: MEGABYTES_PER_CHUNK=${MEGABYTES_PER_CHUNK}"
+      exit 1
+    fi
+    exec /factory/chunk-multiple
+    ;;
+  pack|pack-one|packer|pack-multiple)
+    if test -z "${FILE_PREFIX}" || test -z "${ZST_DICTIONARY_API}"; then
+      echo "Missing param: FILE_PREFIX=${FILE_PREFIX} ZST_DICTIONARY_API=${ZST_DICTIONARY_API}"
+      exit 1
+    fi
+    exec /factory/pack-multiple
+    ;;
+  upload|upload-one|upload-multiple)
+    if test -z "${IA_AUTH}" || test -z "${IA_COLLECTION}" || test -z "${IA_ITEM_TITLE}" || test -z "${IA_ITEM_PREFIX}" || test -z "${FILE_PREFIX}"; then
+      echo "Missing param: IA_AUTH=${IA_AUTH} IA_COLLECTION=${IA_COLLECTION} IA_ITEM_TITLE=${IA_ITEM_TITLE} IA_ITEM_PREFIX=${IA_ITEM_PREFIX} FILE_PREFIX=${FILE_PREFIX}"
+      exit 1
+    fi
+    exec /factory/upload-multiple
+    ;;
+  offload|offload-one|offload-multiple)
+    if test -z "${OFFLOAD_TARGET}" && ! test -f "${PWD}/offload_targets"; then
+      echo "Missing param: OFFLOAD_TARGET=${OFFLOAD_TARGET} and ${PWD}/offload_targets does not exist"
+      exit 1
+    fi
+    exec /factory/offload-multiple
+    ;;
+  *)
+    echo "Usage: chunk|pack|upload|offload"
+    exit 1
+    ;;
+esac
diff --git a/du-all b/du-all
new file mode 100755
index 0000000..2c94605
--- /dev/null
+++ b/du-all
@@ -0,0 +1,14 @@
+#!/bin/bash
+# This shows du -hs for the important directories.
+source ./config.sh || exit 1
+
+du -hs \
+  "${INCOMING_UPLOADS_DIR}" \
+  "${CHUNKER_WORKING_DIR}" \
+  "${PACKING_QUEUE_DIR}/"* \
+  "${PACKER_WORKING_CHUNKS_DIR}/"* \
+  "${PACKER_WORKING_MEGAWARC_DIR}/"* \
+  "${UPLOAD_QUEUE_DIR}/"* \
+  "${UPLOADER_WORKING_DIR}/"* \
+2> >(grep -v 'du: cannot \(access\|read\)' >&2)
+
diff --git a/upload-multiple b/upload-multiple
new file mode 100755
index 0000000..5b1fc6f
--- /dev/null
+++ b/upload-multiple
@@ -0,0 +1,17 @@
+#!/bin/bash
+# This loops the upload-one script while the RUN file exists.
+# See upload-one for details.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+while [[ -f RUN ]]
+do
+  "${SCRIPT_DIR}/upload-one"
+  result="${?}"
+  if [[ "${result}" -ne 0 ]]
+  then
+    date
+    echo "uploader exited with ${result}"
+    exit "${result}"
+  fi
+done
+
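
The *-multiple wrappers above loop only while a file named RUN exists in their working directory (docker-boot.sh creates /factory/RUN at startup), so a graceful shutdown is just a matter of removing that file; a sketch, assuming the container's /factory working directory:

    rm /factory/RUN    # the wrapper finishes its current iteration, then exits
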
diff --git a/upload-one b/upload-one
new file mode 100755
index 0000000..c256911
--- /dev/null
+++ b/upload-one
@@ -0,0 +1,137 @@
+#!/bin/bash
+# Uploads tar items from the upload queue.
+# (Needs a config.sh in the working directory.)
+#
+# ./upload-one
+#
+# 1. Grabs an item from UPLOAD_QUEUE_DIR
+# 2. Reserves the item by moving the directory to the
+#    UPLOADER_WORKING_DIR
+# 3. Uploads the item to s3.us.archive.org
+# 4. Removes the source files from the working directory
+#    If COMPLETED_DIR is set, uploaded files are moved there.
+#
+# The program exits with 1 on any nontransient error.
+#
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+source ./config.sh || exit 1
+
+mkdir -p "${UPLOAD_QUEUE_DIR}" || exit 1
+mkdir -p "${UPLOADER_WORKING_DIR}" || exit 1
+
+if [ ! -z "${COMPLETED_DIR}" ]
+then
+  mkdir -p "${COMPLETED_DIR}" || exit 1
+fi
+
+function mayicontinue {
+  echo
+# echo "May I continue?"
+# read
+# echo
+}
+
+mayicontinue
+
+
+# try to grab an item from UPLOAD_QUEUE_DIR
+ITEM=none
+while [[ "${ITEM}" = none ]]
+do
+  possible_item=$( ls -1 "${UPLOAD_QUEUE_DIR}" | grep -E '[0-9]{14}_[a-f0-9]{8}$' | sort | head -n 1 )
+  if test -n "${possible_item}"
+  then
+    echo "Trying to grab ${possible_item}"
+    if mv "${UPLOAD_QUEUE_DIR}/${possible_item}" "${UPLOADER_WORKING_DIR}/"
+    then
+      ITEM="${possible_item}"
+    else
+      echo "Failed to move ${possible_item}"
+      sleep 5
+    fi
+  else
+    date
+    echo "No current item found!"
+    sleep 30
+    exit 0
+  fi
+done
+
+
+echo "$( date ): Start uploading for item ${ITEM}" >> uploader.log
+
+# upload the tars in the item
+size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )
+# (the size hint lets IA place the item on a server with enough space)
+
+find "${UPLOADER_WORKING_DIR}/${ITEM}" -type f -regextype posix-egrep -regex ".+\.tar$" -printf "%f\n" \
+| while read -r filename
+do
+  result=1
+  while [[ "${result}" -ne 0 ]]
+  do
+    curl -v --location --fail \
+      --speed-limit 1 --speed-time 900 \
+      --header "x-archive-queue-derive:1" \
+      --header "x-amz-auto-make-bucket:1" \
+      --header "x-archive-keep-old-version:1" \
+      --header "x-archive-meta-collection:${IA_COLLECTION}" \
+      --header "x-archive-meta-mediatype:data" \
+      --header "x-archive-meta-title:${IA_ITEM_TITLE} ${ITEM}" \
+      --header "x-archive-meta-date:${IA_ITEM_DATE}" \
+      --header "x-archive-meta-language:eng" \
+      --header "x-archive-meta-noarchivetorrent:true" \
+      --header "x-archive-size-hint:${size_hint}" \
+      --header "authorization: LOW ${IA_AUTH}" \
+      --upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \
+      "https://${LOAD_BALANCER:-s3}.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \
+      > /dev/null
+    result="${?}"
+    if [[ "${result}" -ne 0 ]]
+    then
+      date
+      echo "Error while uploading ${ITEM}, curl said ${result}"
+      echo "Will retry in 30 seconds"
+      sleep 30
+    fi
+  done
+done
+
+echo "Uploaded ${ITEM}"
+
+echo "$( date ): Completed uploading for item ${ITEM}" >> uploader.log
+
+
+mayicontinue
+
+
+# move or remove the uploaded item
+if [ -z "${COMPLETED_DIR}" ]
+then
+  # remove
+  rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}"
+  result="${?}"
+
+  if [[ "${result}" -ne 0 ]]
+  then
+    date
+    echo "rm -rf exited with ${result} for ${ITEM}"
+    exit 1
+  fi
+else
+  # move
+  mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/"
+  result="${?}"
+
+  if [[ "${result}" -ne 0 ]]
+  then
+    date
+    echo "mv exited with ${result} for ${ITEM}"
+    exit 1
+  fi
+fi
+
+exit 0
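
For reference, the item names that upload-one greps for are exactly those produced by the chunker; a minimal sketch of the same naming scheme:

    timestamp=$( date +'%Y%m%d%H%M%S' )                  # 14 digits
    uuid=$( cut -d- -f1 /proc/sys/kernel/random/uuid )   # first 8 hex chars
    echo "${timestamp}_${uuid}"                          # matches [0-9]{14}_[a-f0-9]{8}$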