@@ -0,0 +1,18 @@ | |||||
---
# Drone CI pipeline: build the Docker image from ./Dockerfile and push it
# to the ArchiveTeam Docker registry as the "latest" tag only
# (auto_tag disabled, intermediate layers purged after the push).
kind: pipeline
name: default

steps:
- name: docker
  image: plugins/docker
  settings:
    registry: atdr-writer.meo.ws
    # registry credentials come from Drone secrets, never from the repo
    username:
      from_secret: atdr_user
    password:
      from_secret: atdr_pass
    repo: atdr-writer.meo.ws/archiveteam/archiveteam-tar-uploader
    dockerfile: Dockerfile
    purge: true
    auto_tag: false
    tags:
    - latest
@@ -0,0 +1 @@ | |||||
*~ |
@@ -0,0 +1,12 @@ | |||||
# Image for the ArchiveTeam tar uploader: a rough variant of the
# megawarc factory that uploads .tar chunks to archive.org.
FROM debian:stretch-slim

# zstd in stretch is too old; pull it from buster-backports instead.
RUN echo 'deb http://ftp.de.debian.org/debian buster-backports main' >> /etc/apt/sources.list

# One layer: apt update + install (python2 toolchain, rsync/git/curl for the
# factory scripts), newer zstd from backports, the two Python libraries the
# scripts need, then drop the apt lists so they do not bloat the image.
RUN DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io update \
 && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io install python python2.7 rsync git ca-certificates curl python-pip \
 && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io -t buster-backports install zstd libzstd-dev libzstd1 \
 && pip install zstandard requests \
 && rm -rf /var/lib/apt/lists/*

# Factory scripts from the build context end up in /factory
# (NOTE(review): `COPY *` flattens subdirectories — presumably intended,
# since megawarc is re-cloned fresh below).
COPY * factory/
RUN rm -rf /factory/megawarc && git clone https://github.com/archiveteam/megawarc.git /factory/megawarc

WORKDIR /factory

COPY docker-boot.sh /
RUN chmod +x /docker-boot.sh
ENTRYPOINT ["/docker-boot.sh"]
@@ -0,0 +1 @@ | |||||
This is a roughly edited version of https://github.com/ArchiveTeam/archiveteam-megawarc-factory that uploads tar files instead of packing and uploading WARCs.
@@ -0,0 +1,21 @@ | |||||
#!/bin/bash
# Keep running the chunker script for as long as the RUN sentinel file
# exists in the current directory. See chunker for details.
here="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
  date
  "${here}/chunker"
  status=$?
  # A non-zero exit from the chunker is fatal: log it and propagate it.
  if (( status != 0 ))
  then
    date
    echo "chunker exited with ${status}"
    exit "${status}"
  fi
  echo "Sleeping..."
  sleep 1
done
@@ -0,0 +1,58 @@ | |||||
#!/bin/bash
# Move uploaded .tar files to an archive directory.
# When the archive is large enough, rename it so the uploader can pick
# it up, and start a new archive.
#
# Be careful: this script assumes that any file in the upload directory
# that has a name that ends with .tar is a fully uploaded file and
# can be moved somewhere else. Remember this when running rsync.
#
# Usage: chunker [INCOMING_UPLOADS_DIR [CHUNKER_WORKING_DIR]]
# Without arguments, all settings are read from ./config.sh.

INCOMING_UPLOADS_DIR="${1}"    # e.g. /home/archiveteam/uploads
CHUNKER_WORKING_DIR="${2}"     # e.g. /home/archiveteam/processed
PACKING_QUEUE_DIR="${CHUNKER_WORKING_DIR}/archive"
MEGABYTES_PER_CHUNK=$((1024*25))

# if not specified in command-line arguments, load everything from config.sh
if [ -z "${INCOMING_UPLOADS_DIR}" ]
then
  source ./config.sh || exit 1
fi

# Completed chunks are handed to the uploader via UPLOAD_QUEUE_DIR.
# Fall back to the local archive directory when config.sh was not sourced
# (previously an unset UPLOAD_QUEUE_DIR silently moved chunks to "/").
UPLOAD_QUEUE_DIR="${UPLOAD_QUEUE_DIR:-${PACKING_QUEUE_DIR}}"

BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK))

mkdir -p "${CHUNKER_WORKING_DIR}" || exit 1
mkdir -p "${PACKING_QUEUE_DIR}" || exit 1
mkdir -p "${UPLOAD_QUEUE_DIR}" || exit 1
mkdir -p "${CHUNKER_WORKING_DIR}/current" || exit 1

# size in bytes of the chunk currently being filled
cur_size=$( du -B1 -s "${CHUNKER_WORKING_DIR}/current" | grep -oE "^[0-9]+" )

# find every .tar in the upload directory
# (note: the loop body runs in a pipeline subshell, so cur_size updates
# only need to persist across iterations, which they do)
find "${INCOMING_UPLOADS_DIR}" -type f -regex ".+\.tar$" \
| while read -r filename
do
  # skip partial uploads
  if [[ "${filename}" =~ rsync-tmp ]]
  then
    continue
  fi

  cur_size=$((cur_size + $( du -B1 -s "${filename}" | grep -oE "^[0-9]+" )))

  # move to the current/ directory
  echo "Moving ${filename}"
  mkdir -p "${CHUNKER_WORKING_DIR}/current"
  mv "${filename}" "${CHUNKER_WORKING_DIR}/current/"

  # if the current/ directory is large enough,
  # rename it to <timestamp>_<uuid> and start a new current/
  if [[ "${cur_size}" -gt "${BYTES_PER_CHUNK}" ]]
  then
    timestamp=$( date +'%Y%m%d%H%M%S' )
    # first uuid segment is enough to avoid collisions within one second
    uuid=$( cut -d- -f1 < /proc/sys/kernel/random/uuid )
    echo "Current archive is full, moving to ${timestamp}_${uuid}."
    mv "${CHUNKER_WORKING_DIR}/current" "${UPLOAD_QUEUE_DIR}/${timestamp}_${uuid}"
    cur_size=0
  fi
done
@@ -0,0 +1,88 @@ | |||||
#!/bin/bash
# TEMPLATE configuration for the tar-uploader factory.
# Create a copy of this config.sh, customise it and place it in the
# working directory of the packing and upload scripts.
# (The "config.sh not customised" guard at the bottom aborts every script
# that sources this file until it is removed.)

####################
# CHUNKER SETTINGS #
####################

# start a new chunk when the current chunk is at least this large
MEGABYTES_PER_CHUNK=$((1024*25))

###################
# UPLOAD METADATA #
###################

# your Archive.org S3 keys
IA_AUTH="ACCESS_KEY:SECRET"
# the name of the collection to add the uploads to
IA_COLLECTION="archiveteam_TODO"
# the title of the items (" ${item_timestamp}" will be appended)
IA_ITEM_TITLE="Archive Team TODO:"
# the prefix of the item name ("${item_timestamp}" is appended)
IA_ITEM_PREFIX="archiveteam_todo_"
# the prefix of the megawarc filename ("${item_timestamp}" is appended)
FILE_PREFIX="todo_"
# the date field for the item (evaluated when this file is sourced)
IA_ITEM_DATE=$( date +"%Y-%m" )
# offload items to another rsync storage instead of uploading to IA
OFFLOAD_TARGET="rsync://somewhere-far-away:portnum/module-name/directory/"
# it is also possible to create a list of targets and the offloader will pick one at random and retry others on failure
# simply comment out the line above and put all rsync target urls separated by newline in a file called "offload_targets"
# the API for requesting the ZSTD dictionaries
ZST_DICTIONARY_API="API_URL"

###############
# DIRECTORIES #
###############

# Put your directories on one or two filesystems (see README).
FS1_BASE_DIR="/archiveteam/ssd/project"
FS2_BASE_DIR="/archiveteam/disk/project"

## THESE DIRECTORIES ON FILESYSTEM 1: for the incoming tar files
# the rsync upload directory
# (the chunker will package the uploaded files in this directory)
INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads"
# the chunker working directory
# (this directory will hold the current in-progress chunk)
CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work"
# the chunker output directory / the packer queue
# (this directory will hold the completed chunks)
PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue"
# the packer working directory - input side
# (this directory will hold the current chunk)
PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in"

## THESE DIRECTORIES ON FILESYSTEM 2: for the packed output
# the packer working directory - output side
# (this directory will hold the current megawarc)
PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out"
# the packer output directory / the upload queue
# (this directory will hold the completed megawarcs)
UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue"
# the uploader working directory
# (this directory will hold the current megawarc)
UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work"
# the final destination for uploaded megawarcs
# leave this empty to remove megawarcs after uploading
COMPLETED_DIR="${FS2_BASE_DIR}/uploaded"

# remove this once you have customised the settings above
echo "config.sh not customised."
exit 1
@@ -0,0 +1,70 @@ | |||||
#!/bin/bash
# Container entrypoint: generates /factory/config.sh from the container's
# environment variables, then execs the requested factory mode
# (chunk | pack | upload | offload).
set -e

# all state lives on a bind-mounted /data volume; refuse to run without it
test -d /data || {
  echo "No /data mount found"
  exit 1
}

mkdir -p /data/incoming /data/chunker-work /data/packing-queue /data/packer-work-in /data/packer-work-out /data/upload-queue /data/uploader-work

# Single-quoted on purpose: this writes the literal command substitution
# into config.sh, so the date is evaluated each time config.sh is sourced
# (not once at container start).
IA_ITEM_DATE_LIT='$( date +"%Y-%m" )'

# The heredof is unquoted, so every ${VAR} below is expanded NOW from the
# container environment; only IA_ITEM_DATE stays dynamic via the literal above.
cat > /factory/config.sh << EOF
#!/bin/bash
MEGABYTES_PER_CHUNK="${MEGABYTES_PER_CHUNK}"
IA_AUTH="${IA_AUTH}"
IA_COLLECTION="${IA_COLLECTION}"
IA_ITEM_TITLE="${IA_ITEM_TITLE}"
IA_ITEM_PREFIX="${IA_ITEM_PREFIX}"
FILE_PREFIX="${FILE_PREFIX}"
IA_ITEM_DATE="${IA_ITEM_DATE_LIT}"
OFFLOAD_TARGET="${OFFLOAD_TARGET}"
ZST_DICTIONARY_API="${ZST_DICTIONARY_API}"
LOAD_BALANCER="${LOAD_BALANCER}"
INCOMING_UPLOADS_DIR="/data/incoming"
CHUNKER_WORKING_DIR="/data/chunker-work"
PACKING_QUEUE_DIR="/data/packing-queue"
PACKER_WORKING_CHUNKS_DIR="/data/packer-work-in"
PACKER_WORKING_MEGAWARC_DIR="/data/packer-work-out"
UPLOAD_QUEUE_DIR="/data/upload-queue"
UPLOADER_WORKING_DIR="/data/uploader-work"
COMPLETED_DIR=""
EOF

# the *-multiple loops run while this sentinel file exists
touch /factory/RUN

# Dispatch on the first container argument; each mode validates the env
# vars it needs before exec-ing the long-running loop script (exec keeps
# the loop as PID 1 so it receives docker stop signals).
case "${1}" in
  chunk|chunker|chunk-multiple)
    if test -z "${MEGABYTES_PER_CHUNK}"; then
      echo "Missing param: MEGABYTES_PER_CHUNK=${MEGABYTES_PER_CHUNK}"
      exit 1
    fi
    exec /factory/chunk-multiple
    ;;
  pack|pack-one|packer|pack-multiple)
    if test -z "${FILE_PREFIX}" || test -z "${ZST_DICTIONARY_API}"; then
      echo "Missing param: FILE_PREFIX=${FILE_PREFIX} ZST_DICTIONARY_API=${ZST_DICTIONARY_API}"
      exit 1
    fi
    exec /factory/pack-multiple
    ;;
  upload|upload-one|upload-multiple)
    if test -z "${IA_AUTH}" || test -z "${IA_COLLECTION}" || test -z "${IA_ITEM_TITLE}" || test -z "${IA_ITEM_PREFIX}" || test -z "${FILE_PREFIX}"; then
      echo "Missing param: IA_AUTH=${IA_AUTH} IA_COLLECTION=${IA_COLLECTION} IA_ITEM_TITLE=${IA_ITEM_TITLE} IA_ITEM_PREFIX=${IA_ITEM_PREFIX} FILE_PREFIX=${FILE_PREFIX}"
      exit 1
    fi
    exec /factory/upload-multiple
    ;;
  offload|offload-one|offload-multiple)
    if test -z "${OFFLOAD_TARGET}" && ! test -f "${PWD}/offload_targets"; then
      echo "Missing param: OFFLOAD_TARGET=${OFFLOAD_TARGET} and no ${PWD}/offload_targets existing"
      exit 1
    fi
    exec /factory/offload-multiple
    ;;
  *)
    echo "Usage: chunk|pack|upload|offload"
    exit 1
    ;;
esac
@@ -0,0 +1,14 @@ | |||||
#!/bin/bash
# This shows du -hs for the important directories.
# (Needs a config.sh in the working directory.)
source ./config.sh || exit 1

# The trailing process substitution filters du's "cannot access/read"
# noise (globs that match nothing) off stderr while letting any other
# du error pass through.
du -hs \
  "${INCOMING_UPLOADS_DIR}" \
  "${CHUNKER_WORKING_DIR}" \
  "${PACKING_QUEUE_DIR}/"* \
  "${PACKER_WORKING_CHUNKS_DIR}/"* \
  "${PACKER_WORKING_MEGAWARC_DIR}/"* \
  "${UPLOAD_QUEUE_DIR}/"* \
  "${UPLOADER_WORKING_DIR}/"* \
  2> >(grep -v 'du: cannot \(access\|read\)' >&2)
@@ -0,0 +1,17 @@ | |||||
#!/bin/bash
# Keep running the upload-one script for as long as the RUN sentinel file
# exists in the current directory. See upload-one for details.
here="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
  "${here}/upload-one"
  status=$?
  # A non-zero exit from the uploader is fatal: log it and propagate it.
  if (( status != 0 ))
  then
    date
    echo "uploader exited with ${status}"
    exit "${status}"
  fi
done
@@ -0,0 +1,137 @@ | |||||
#!/bin/bash
# Uploads tar chunks from the upload queue.
# (Needs a config.sh in the working directory.)
#
# ./upload-one
#
# 1. Grabs an item from UPLOAD_QUEUE_DIR
# 2. Reserves the item by moving the directory to the
#    UPLOADER_WORKING_DIR
# 3. Uploads the item to s3.us.archive.org
# 4. Removes the source files from the working directory
#    If COMPLETED_DIR is set, uploaded files are moved there.
#
# The program exits with 1 on any nontransient error.
#
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ./config.sh || exit 1

mkdir -p "${UPLOAD_QUEUE_DIR}" || exit 1
mkdir -p "${UPLOADER_WORKING_DIR}" || exit 1
if [ -n "${COMPLETED_DIR}" ]
then
  mkdir -p "${COMPLETED_DIR}" || exit 1
fi

# debugging hook: uncomment the read to single-step through the script
function mayicontinue {
  echo
#  echo "May I continue?"
#  read
#  echo
}

mayicontinue

# try to grab an item from UPLOAD_QUEUE_DIR
# (chunk directories are named <14-digit timestamp>_<8 hex chars>,
# so sorting picks the oldest chunk first)
ITEM=none
while [[ "${ITEM}" = none ]]
do
  possible_item=$( ls -1 "${UPLOAD_QUEUE_DIR}" | grep -E '[0-9]{14}_[a-f0-9]{8}$' | sort | head -n 1 )
  if test -n "${possible_item}"
  then
    echo "Trying to grab ${possible_item}"
    # mv on the same filesystem is atomic: succeeding means we own the item
    if mv "${UPLOAD_QUEUE_DIR}/${possible_item}" "${UPLOADER_WORKING_DIR}/"
    then
      ITEM="${possible_item}"
    else
      echo "Failed to move ${possible_item}"
      sleep 5
    fi
  else
    date
    echo "No current item found!"
    sleep 30
    exit 0
  fi
done

echo "$( date ): Start uploading for item ${ITEM}" >> uploader.log

# total item size, sent as x-archive-size-hint so IA can pick a server
size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )

# upload every tar in the item, retrying each file until curl succeeds
# (NOTE(review): find does not order by size, despite the original
# "large files first" intent — confirm whether ordering matters)
find "${UPLOADER_WORKING_DIR}/${ITEM}" -type f -regextype posix-egrep -regex ".+\.tar$" -printf "%f\n" \
| while read -r filename
do
  result=1
  while [[ "${result}" -ne 0 ]]
  do
    curl -v --location --fail \
      --speed-limit 1 --speed-time 900 \
      --header "x-archive-queue-derive:1" \
      --header "x-amz-auto-make-bucket:1" \
      --header "x-archive-keep-old-version:1" \
      --header "x-archive-meta-collection:${IA_COLLECTION}" \
      --header "x-archive-meta-mediatype:data" \
      --header "x-archive-meta-title:${IA_ITEM_TITLE} ${ITEM}" \
      --header "x-archive-meta-date:${IA_ITEM_DATE}" \
      --header "x-archive-meta-language:eng" \
      --header "x-archive-meta-noarchivetorrent:true" \
      --header "x-archive-size-hint:${size_hint}" \
      --header "authorization: LOW ${IA_AUTH}" \
      --upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \
      "https://${LOAD_BALANCER:-s3}.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \
      > /dev/null
    result="${?}"
    if [[ "${result}" -ne 0 ]]
    then
      date
      echo "Error while uploading ${ITEM}, curl said ${result}"
      echo "Will retry in 30 seconds"
      sleep 30
    fi
  done
done

echo "Uploaded ${ITEM}"
echo "$( date ): Completed uploading for item ${ITEM}" >> uploader.log

mayicontinue

# move or remove the uploaded item
if [ -z "${COMPLETED_DIR}" ]
then
  # remove
  rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}"
  result="${?}"
  if [[ "${result}" -ne 0 ]]
  then
    date
    echo "rm -rf megawarc exited with ${result} for ${ITEM}"
    exit 1
  fi
else
  # move to the completed directory
  mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/"
  result="${?}"
  if [[ "${result}" -ne 0 ]]
  then
    date
    # fixed: this branch previously reported "rm -rf megawarc exited"
    echo "mv megawarc exited with ${result} for ${ITEM}"
    exit 1
  fi
fi

exit 0