commit e42f241a00798ac3aabc2651e9afd198436b895b
Author: arkiver
Date:   Mon Feb 27 01:37:50 2023 +0100

    initial

diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 0000000..fc6c019
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,18 @@
+---
+kind: pipeline
+name: default
+steps:
+- name: docker
+  image: plugins/docker
+  settings:
+    registry: atdr-writer.meo.ws
+    username:
+      from_secret: atdr_user
+    password:
+      from_secret: atdr_pass
+    repo: atdr-writer.meo.ws/archiveteam/archiveteam-tar-uploader
+    dockerfile: Dockerfile
+    purge: true
+    auto_tag: false
+    tags:
+    - latest
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b25c15b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*~
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d12e82a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,12 @@
+FROM debian:stretch-slim
+RUN echo 'deb http://ftp.de.debian.org/debian buster-backports main' >> /etc/apt/sources.list
+RUN DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io update \
+ && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io install python python2.7 rsync git ca-certificates curl python-pip \
+ && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io -t buster-backports install zstd libzstd-dev libzstd1 \
+ && pip install zstandard && pip install requests
+COPY * factory/
+RUN rm -rf /factory/megawarc && git clone https://github.com/archiveteam/megawarc.git /factory/megawarc
+WORKDIR /factory
+COPY docker-boot.sh /
+RUN chmod +x /docker-boot.sh
+ENTRYPOINT ["/docker-boot.sh"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1add16a
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+This is a roughly edited version of https://github.com/ArchiveTeam/archiveteam-megawarc-factory that uploads tar files instead of packing and uploading WARCs.
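
A minimal build-and-run sketch for the image defined above (the image tag, host path, and chunk size are illustrative placeholders; the trailing argument must be one of the modes docker-boot.sh accepts, see below):

    docker build -t archiveteam-tar-uploader .
    docker run -d \
        -e MEGABYTES_PER_CHUNK=25600 \
        -v /srv/tar-uploader:/data \
        archiveteam-tar-uploader chunk
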
diff --git a/chunk-multiple b/chunk-multiple
new file mode 100755
index 0000000..d7357b0
--- /dev/null
+++ b/chunk-multiple
@@ -0,0 +1,21 @@
+#!/bin/bash
+# This loops the chunker script while the RUN file exists.
+# See chunker for details.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+while [[ -f RUN ]]
+do
+  date
+  "${SCRIPT_DIR}/chunker"
+  result="${?}"
+  if [[ "${result}" -ne 0 ]]
+  then
+    date
+    echo "chunker exited with ${result}"
+    exit "${result}"
+  fi
+
+  echo "Sleeping..."
+  sleep 1
+done
+
diff --git a/chunker b/chunker
new file mode 100755
index 0000000..8206480
--- /dev/null
+++ b/chunker
@@ -0,0 +1,58 @@
+#!/bin/bash
+# Move uploaded .tar files to an archive directory.
+# When the archive is large enough, move it to the upload
+# queue and start a new archive.
+#
+# Be careful: this script assumes that any file in the upload directory
+# whose name ends with .tar is a fully uploaded file and can be moved
+# somewhere else. Remember this when running rsync.
+#
+
+INCOMING_UPLOADS_DIR="${1}"   # /home/archiveteam/uploads
+CHUNKER_WORKING_DIR="${2}"    # /home/archiveteam/processed
+PACKING_QUEUE_DIR="${CHUNKER_WORKING_DIR}/archive"
+MEGABYTES_PER_CHUNK=$((1024*25))
+
+# if not specified in command-line arguments
+if [ -z "${INCOMING_UPLOADS_DIR}" ]
+then
+  source ./config.sh || exit 1
+fi
+
+BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK))
+
+mkdir -p "${CHUNKER_WORKING_DIR}" || exit 1
+mkdir -p "${PACKING_QUEUE_DIR}" || exit 1
+
+mkdir -p "${CHUNKER_WORKING_DIR}/current" || exit 1
+cur_size=$( du -B1 -s "${CHUNKER_WORKING_DIR}/current" | grep -oE "^[0-9]+" )
+
+# find every .tar in the upload directory
+find "${INCOMING_UPLOADS_DIR}" -type f -regex ".+\.tar$" \
+| while read -r filename
+do
+  # skip partial uploads
+  if [[ "${filename}" =~ rsync-tmp ]]
+  then
+    continue
+  fi
+
+  cur_size=$((cur_size + $( du -B1 -s "${filename}" | grep -oE "^[0-9]+" )))
+
+  # move to the current/ directory
+  echo "Moving ${filename}"
+  mkdir -p "${CHUNKER_WORKING_DIR}/current"
+  mv "${filename}" "${CHUNKER_WORKING_DIR}/current/"
+
+  # if the current/ directory is large enough, move it to the
+  # upload queue as timestamp_uuid and start a new current/
+  if [[ "${cur_size}" -gt "${BYTES_PER_CHUNK}" ]]
+  then
+    timestamp=$( date +'%Y%m%d%H%M%S' )
+    uuid=$( cat /proc/sys/kernel/random/uuid | cut -d- -f1 )
+    echo "Current archive is full, moving to ${timestamp}_${uuid}."
+    mv "${CHUNKER_WORKING_DIR}/current" "${UPLOAD_QUEUE_DIR}/${timestamp}_${uuid}"
+    cur_size=0
+  fi
+done
+
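
The default chunk size above works out to 25 GiB: MEGABYTES_PER_CHUNK = 1024*25 = 25600 MiB, so BYTES_PER_CHUNK = 1024*1024*25600 = 26843545600 bytes. A standalone invocation sketch, using the example paths from the script's own comments; note that when the arguments are given, config.sh is not sourced, so UPLOAD_QUEUE_DIR must already be set in the environment (the path below is a placeholder):

    export UPLOAD_QUEUE_DIR=/home/archiveteam/upload-queue
    ./chunker /home/archiveteam/uploads /home/archiveteam/processed
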
diff --git a/config.example.sh b/config.example.sh
new file mode 100755
index 0000000..357a8ac
--- /dev/null
+++ b/config.example.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+# Copy this file to config.sh, customise it, and place it in the
+# working directory of the packing and upload scripts.
+
+####################
+# CHUNKER SETTINGS #
+####################
+# start a new chunk when the current chunk is at least this large
+MEGABYTES_PER_CHUNK=$((1024*25))
+
+###################
+# UPLOAD METADATA #
+###################
+# your Archive.org S3 keys
+IA_AUTH="ACCESS_KEY:SECRET"
+
+# the name of the collection to add the uploads to
+IA_COLLECTION="archiveteam_TODO"
+
+# the title of the items (" ${item_timestamp}" will be appended)
+IA_ITEM_TITLE="Archive Team TODO:"
+
+# the prefix of the item name ("${item_timestamp}" is appended)
+IA_ITEM_PREFIX="archiveteam_todo_"
+
+# the prefix of the megawarc filename ("${item_timestamp}" is appended)
+FILE_PREFIX="todo_"
+
+# the date field for the item
+IA_ITEM_DATE=$( date +"%Y-%m" )
+
+# offload items to another rsync storage instead of uploading to IA
+OFFLOAD_TARGET="rsync://somewhere-far-away:portnum/module-name/directory/"
+# It is also possible to create a list of targets; the offloader will pick one at random and retry the others on failure.
+# Simply comment out the line above and put all rsync target URLs, separated by newlines, in a file called "offload_targets".
+
+# the API for requesting the ZSTD dictionaries
+ZST_DICTIONARY_API="API_URL"
+
+###############
+# DIRECTORIES #
+###############
+# Put your directories on one or two filesystems (see README).
+FS1_BASE_DIR="/archiveteam/ssd/project"
+FS2_BASE_DIR="/archiveteam/disk/project"
+
+## THESE DIRECTORIES ON FILESYSTEM 1: for tars
+
+# the rsync upload directory
+# (the chunker will package the .tar files in this directory)
+INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads"
+
+# the chunker working directory
+# (this directory will hold the current in-progress chunk)
+CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work"
+
+# the chunker output directory / the packer queue
+# (this directory will hold the completed chunks)
+PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue"
+
+# the packer working directory - input side
+# (this directory will hold the current chunk)
+PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in"
+
+## THESE DIRECTORIES ON FILESYSTEM 2: for megawarcs
+
+# the packer working directory - megawarc side
+# (this directory will hold the current megawarc)
+PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out"
+
+# the packer output directory / the upload queue
+# (this directory will hold the completed megawarcs)
+UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue"
+
+# the uploader working directory
+# (this directory will hold the current item)
+UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work"
+
+# the final destination for uploaded items
+# leave this empty to remove items after uploading
+COMPLETED_DIR="${FS2_BASE_DIR}/uploaded"
+
+
+# remove these lines once you have customised the config
+echo "config.sh not customised."
+exit 1
+
+
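
A sketch of the optional offload_targets file described above, with one rsync target URL per line (hostnames, ports, and module names are placeholders):

    rsync://offload-a.example.net:873/tar-staging/incoming/
    rsync://offload-b.example.net:873/tar-staging/incoming/
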
diff --git a/docker-boot.sh b/docker-boot.sh
new file mode 100755
index 0000000..2afc811
--- /dev/null
+++ b/docker-boot.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+set -e
+
+test -d /data || {
+  echo "No /data mount found"
+  exit 1
+}
+
+mkdir -p /data/incoming /data/chunker-work /data/packing-queue /data/packer-work-in /data/packer-work-out /data/upload-queue /data/uploader-work
+
+# keep the date command unexpanded so config.sh evaluates it when sourced
+IA_ITEM_DATE_LIT='$( date +"%Y-%m" )'
+cat > /factory/config.sh << EOF
+#!/bin/bash
+MEGABYTES_PER_CHUNK="${MEGABYTES_PER_CHUNK}"
+IA_AUTH="${IA_AUTH}"
+IA_COLLECTION="${IA_COLLECTION}"
+IA_ITEM_TITLE="${IA_ITEM_TITLE}"
+IA_ITEM_PREFIX="${IA_ITEM_PREFIX}"
+FILE_PREFIX="${FILE_PREFIX}"
+IA_ITEM_DATE="${IA_ITEM_DATE_LIT}"
+OFFLOAD_TARGET="${OFFLOAD_TARGET}"
+ZST_DICTIONARY_API="${ZST_DICTIONARY_API}"
+LOAD_BALANCER="${LOAD_BALANCER}"
+INCOMING_UPLOADS_DIR="/data/incoming"
+CHUNKER_WORKING_DIR="/data/chunker-work"
+PACKING_QUEUE_DIR="/data/packing-queue"
+PACKER_WORKING_CHUNKS_DIR="/data/packer-work-in"
+PACKER_WORKING_MEGAWARC_DIR="/data/packer-work-out"
+UPLOAD_QUEUE_DIR="/data/upload-queue"
+UPLOADER_WORKING_DIR="/data/uploader-work"
+COMPLETED_DIR=""
+EOF
+
+touch /factory/RUN
+
+case "${1}" in
+  chunk|chunker|chunk-multiple)
+    if test -z "${MEGABYTES_PER_CHUNK}"; then
+      echo "Missing param: MEGABYTES_PER_CHUNK=${MEGABYTES_PER_CHUNK}"
+      exit 1
+    fi
+    exec /factory/chunk-multiple
+    ;;
+  pack|pack-one|packer|pack-multiple)
+    if test -z "${FILE_PREFIX}" || test -z "${ZST_DICTIONARY_API}"; then
+      echo "Missing param: FILE_PREFIX=${FILE_PREFIX} ZST_DICTIONARY_API=${ZST_DICTIONARY_API}"
+      exit 1
+    fi
+    exec /factory/pack-multiple
+    ;;
+  upload|upload-one|upload-multiple)
+    if test -z "${IA_AUTH}" || test -z "${IA_COLLECTION}" || test -z "${IA_ITEM_TITLE}" || test -z "${IA_ITEM_PREFIX}" || test -z "${FILE_PREFIX}"; then
+      echo "Missing param: IA_AUTH=${IA_AUTH} IA_COLLECTION=${IA_COLLECTION} IA_ITEM_TITLE=${IA_ITEM_TITLE} IA_ITEM_PREFIX=${IA_ITEM_PREFIX} FILE_PREFIX=${FILE_PREFIX}"
+      exit 1
+    fi
+    exec /factory/upload-multiple
+    ;;
+  offload|offload-one|offload-multiple)
+    if test -z "${OFFLOAD_TARGET}" && ! test -f "${PWD}/offload_targets"; then
+      echo "Missing param: OFFLOAD_TARGET=${OFFLOAD_TARGET} and ${PWD}/offload_targets does not exist"
+      exit 1
+    fi
+    exec /factory/offload-multiple
+    ;;
+  *)
+    echo "Usage: chunk|pack|upload|offload"
+    exit 1
+    ;;
+esac
diff --git a/du-all b/du-all
new file mode 100755
index 0000000..2c94605
--- /dev/null
+++ b/du-all
@@ -0,0 +1,14 @@
+#!/bin/bash
+# This shows du -hs for the important directories.
+source ./config.sh || exit 1
+
+du -hs \
+  "${INCOMING_UPLOADS_DIR}" \
+  "${CHUNKER_WORKING_DIR}" \
+  "${PACKING_QUEUE_DIR}/"* \
+  "${PACKER_WORKING_CHUNKS_DIR}/"* \
+  "${PACKER_WORKING_MEGAWARC_DIR}/"* \
+  "${UPLOAD_QUEUE_DIR}/"* \
+  "${UPLOADER_WORKING_DIR}/"* \
+2> >(grep -v 'du: cannot \(access\|read\)' >&2)
+
diff --git a/upload-multiple b/upload-multiple
new file mode 100755
index 0000000..5b1fc6f
--- /dev/null
+++ b/upload-multiple
@@ -0,0 +1,17 @@
+#!/bin/bash
+# This loops the upload-one script while the RUN file exists.
+# See upload-one for details.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+while [[ -f RUN ]]
+do
+  "${SCRIPT_DIR}/upload-one"
+  result="${?}"
+  if [[ "${result}" -ne 0 ]]
+  then
+    date
+    echo "uploader exited with ${result}"
+    exit "${result}"
+  fi
+done
+
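
The *-multiple wrappers above loop only while a file named RUN exists in their working directory (docker-boot.sh creates /factory/RUN at startup), so a graceful shutdown is just a matter of removing that file; a sketch, assuming the container's /factory working directory:

    rm /factory/RUN    # the wrapper finishes its current iteration, then exits
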
diff --git a/upload-one b/upload-one
new file mode 100755
index 0000000..c256911
--- /dev/null
+++ b/upload-one
@@ -0,0 +1,137 @@
+#!/bin/bash
+# Uploads tar items from the upload queue.
+# (Needs a config.sh in the working directory.)
+#
+# ./upload-one
+#
+# 1. Grabs an item from UPLOAD_QUEUE_DIR
+# 2. Reserves the item by moving the directory to the
+#    UPLOADER_WORKING_DIR
+# 3. Uploads the item to s3.us.archive.org
+# 4. Removes the source files from the working directory
+#    If COMPLETED_DIR is set, uploaded files are moved there.
+#
+# The program exits with 1 on any nontransient error.
+#
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+source ./config.sh || exit 1
+
+mkdir -p "${UPLOAD_QUEUE_DIR}" || exit 1
+mkdir -p "${UPLOADER_WORKING_DIR}" || exit 1
+
+if [ ! -z "${COMPLETED_DIR}" ]
+then
+  mkdir -p "${COMPLETED_DIR}" || exit 1
+fi
+
+function mayicontinue {
+  echo
+# echo "May I continue?"
+# read
+# echo
+}
+
+mayicontinue
+
+
+# try to grab an item from UPLOAD_QUEUE_DIR
+ITEM=none
+while [[ "${ITEM}" = none ]]
+do
+  possible_item=$( ls -1 "${UPLOAD_QUEUE_DIR}" | grep -E '[0-9]{14}_[a-f0-9]{8}$' | sort | head -n 1 )
+  if test -n "${possible_item}"
+  then
+    echo "Trying to grab ${possible_item}"
+    if mv "${UPLOAD_QUEUE_DIR}/${possible_item}" "${UPLOADER_WORKING_DIR}/"
+    then
+      ITEM="${possible_item}"
+    else
+      echo "Failed to move ${possible_item}"
+      sleep 5
+    fi
+  else
+    date
+    echo "No current item found!"
+    sleep 30
+    exit 0
+  fi
+done
+
+
+echo "$( date ): Start uploading for item ${ITEM}" >> uploader.log
+
+# upload the tars in the item
+size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )
+# (the size hint lets IA place the item on a server with enough space)
+
+find "${UPLOADER_WORKING_DIR}/${ITEM}" -type f -regextype posix-egrep -regex ".+\.tar$" -printf "%f\n" \
+| while read -r filename
+do
+  result=1
+  while [[ "${result}" -ne 0 ]]
+  do
+    curl -v --location --fail \
+      --speed-limit 1 --speed-time 900 \
+      --header "x-archive-queue-derive:1" \
+      --header "x-amz-auto-make-bucket:1" \
+      --header "x-archive-keep-old-version:1" \
+      --header "x-archive-meta-collection:${IA_COLLECTION}" \
+      --header "x-archive-meta-mediatype:data" \
+      --header "x-archive-meta-title:${IA_ITEM_TITLE} ${ITEM}" \
+      --header "x-archive-meta-date:${IA_ITEM_DATE}" \
+      --header "x-archive-meta-language:eng" \
+      --header "x-archive-meta-noarchivetorrent:true" \
+      --header "x-archive-size-hint:${size_hint}" \
+      --header "authorization: LOW ${IA_AUTH}" \
+      --upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \
+      "https://${LOAD_BALANCER:-s3}.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \
+      > /dev/null
+    result="${?}"
+    if [[ "${result}" -ne 0 ]]
+    then
+      date
+      echo "Error while uploading ${ITEM}, curl said ${result}"
+      echo "Will retry in 30 seconds"
+      sleep 30
+    fi
+  done
+done
+
+echo "Uploaded ${ITEM}"
+
+echo "$( date ): Completed uploading for item ${ITEM}" >> uploader.log
+
+
+mayicontinue
+
+
+# move or remove the uploaded item
+if [ -z "${COMPLETED_DIR}" ]
+then
+  # remove
+  rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}"
+  result="${?}"
+
+  if [[ "${result}" -ne 0 ]]
+  then
+    date
+    echo "rm -rf exited with ${result} for ${ITEM}"
+    exit 1
+  fi
+else
+  # move
+  mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/"
+  result="${?}"
+
+  if [[ "${result}" -ne 0 ]]
+  then
+    date
+    echo "mv exited with ${result} for ${ITEM}"
+    exit 1
+  fi
+fi
+
+exit 0
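
For reference, the item names that upload-one greps for are exactly those produced by the chunker; a minimal sketch of the same naming scheme:

    timestamp=$( date +'%Y%m%d%H%M%S' )                  # 14 digits
    uuid=$( cut -d- -f1 /proc/sys/kernel/random/uuid )   # first 8 hex chars
    echo "${timestamp}_${uuid}"                          # matches [0-9]{14}_[a-f0-9]{8}$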