initial

commit e42f241a00 to master · arkiver, 1 year ago

11 changed files with 437 additions and 0 deletions
  1. .drone.yml         (+18, -0)
  2. .gitignore          (+1, -0)
  3. Dockerfile         (+12, -0)
  4. README.md           (+1, -0)
  5. chunk-multiple     (+21, -0)
  6. chunker            (+58, -0)
  7. config.example.sh  (+88, -0)
  8. docker-boot.sh     (+70, -0)
  9. du-all             (+14, -0)
 10. upload-multiple    (+17, -0)
 11. upload-one        (+137, -0)

.drone.yml (+18, -0)

@@ -0,0 +1,18 @@
---
kind: pipeline
name: default

steps:
- name: docker
  image: plugins/docker
  settings:
    registry: atdr-writer.meo.ws
    username:
      from_secret: atdr_user
    password:
      from_secret: atdr_pass
    repo: atdr-writer.meo.ws/archiveteam/archiveteam-tar-uploader
    dockerfile: Dockerfile
    purge: true
    auto_tag: false
    tags:
    - latest
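
The registry credentials are pulled from Drone secrets (atdr_user, atdr_pass). A hypothetical way to set them with the Drone CLI — repository slug and values illustrative:

    drone secret add --repository archiveteam/archiveteam-tar-uploader --name atdr_user --data 'writer-user'
    drone secret add --repository archiveteam/archiveteam-tar-uploader --name atdr_pass --data 'writer-pass'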

.gitignore (+1, -0)

@@ -0,0 +1 @@
*~

Dockerfile (+12, -0)

@@ -0,0 +1,12 @@
FROM debian:stretch-slim
RUN echo 'deb http://ftp.de.debian.org/debian buster-backports main' >> /etc/apt/sources.list
RUN DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io update \
    && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io install python python2.7 rsync git ca-certificates curl python-pip \
    && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io -t buster-backports install zstd libzstd-dev libzstd1 \
    && pip install zstandard && pip install requests
COPY * factory/
RUN rm -rf /factory/megawarc && git clone https://github.com/archiveteam/megawarc.git /factory/megawarc
WORKDIR /factory
COPY docker-boot.sh /
RUN chmod +x /docker-boot.sh
ENTRYPOINT ["/docker-boot.sh"]
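
To build the image locally instead of through the pipeline above (tag illustrative):

    docker build -t archiveteam-tar-uploader .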

README.md (+1, -0)

@@ -0,0 +1 @@
This is a roughly edited version of https://github.com/ArchiveTeam/archiveteam-megawarc-factory, changed to upload tar files instead of packing and uploading WARCs.

chunk-multiple (+21, -0)

@@ -0,0 +1,21 @@
#!/bin/bash
# This loops the chunker script while the RUN file exists.
# See chunker for details.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
  date
  "${SCRIPT_DIR}/chunker"
  result="${?}"
  if [[ "${result}" -ne 0 ]]
  then
    date
    echo "chunker exited with ${result}"
    exit "${result}"
  fi

  echo "Sleeping..."
  sleep 1
done
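
The loop expects a RUN file (and, since chunker is called without arguments, a config.sh) in the working directory; a minimal sketch:

    touch RUN
    ./chunk-multiple
    # stop it after the current pass:
    rm RUN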


chunker (+58, -0)

@@ -0,0 +1,58 @@
#!/bin/bash
# Move uploaded .tar files to an archive directory.
# When the current chunk is large enough, move it to the upload
# queue and start a new chunk.
#
# Be careful: this script assumes that any file in the upload directory
# that has a name that ends with .tar is a fully uploaded file and
# can be moved somewhere else. Remember this when running rsync.
#

INCOMING_UPLOADS_DIR="${1}"   # /home/archiveteam/uploads
CHUNKER_WORKING_DIR="${2}"    # /home/archiveteam/processed
PACKING_QUEUE_DIR="${CHUNKER_WORKING_DIR}/archive"
MEGABYTES_PER_CHUNK=$((1024*25))

# if not specified in command-line arguments
if [ -z "${INCOMING_UPLOADS_DIR}" ]
then
  source ./config.sh || exit 1
fi

BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK))

mkdir -p "${CHUNKER_WORKING_DIR}" || exit 1
mkdir -p "${PACKING_QUEUE_DIR}" || exit 1

mkdir -p "${CHUNKER_WORKING_DIR}/current" || exit 1
cur_size=$( du -B1 -s "${CHUNKER_WORKING_DIR}/current" | grep -oE "^[0-9]+" )

# find every .tar in the upload directory
find "${INCOMING_UPLOADS_DIR}" -type f -regex ".+\.tar$" \
| while read -r filename
do
  # skip partial uploads
  if [[ "${filename}" =~ rsync-tmp ]]
  then
    continue
  fi

  cur_size=$((cur_size + $( du -B1 -s "${filename}" | grep -oE "^[0-9]+" )))

  # move to the current/ directory
  echo "Moving ${filename}"
  mkdir -p "${CHUNKER_WORKING_DIR}/current"
  mv "${filename}" "${CHUNKER_WORKING_DIR}/current/"

  # if the current/ directory is large enough, move it to the
  # upload queue as <timestamp>_<uuid> and start a new current/
  if [[ "${cur_size}" -gt "${BYTES_PER_CHUNK}" ]]
  then
    timestamp=$( date +'%Y%m%d%H%M%S' )
    uuid=$(cat /proc/sys/kernel/random/uuid | cut -d- -f1)
    echo "Current archive is full, moving to ${timestamp}_${uuid}."
    mv "${CHUNKER_WORKING_DIR}/current" "${UPLOAD_QUEUE_DIR}/${timestamp}_${uuid}"
    cur_size=0
  fi
done
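
Run with explicit directories as in the inline comments, or with no arguments to read ./config.sh:

    ./chunker /home/archiveteam/uploads /home/archiveteam/processed

A full chunk lands in the upload queue under a name like 20240101123456_1a2b3c4d (hypothetical timestamp plus the first UUID segment). Because any *.tar path is treated as fully uploaded, writers pushing into INCOMING_UPLOADS_DIR should keep partial files on a path containing rsync-tmp so the skip check above catches them — for example (an assumption based on that skip pattern, not from this repo) via rsync's --temp-dir:

    rsync -av --temp-dir=.rsync-tmp ./*.tar rsync://upload-host.example.net/incoming/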


config.example.sh (+88, -0)

@@ -0,0 +1,88 @@
#!/bin/bash
# Create a copy of this config.sh, customise it and place it in the
# working directory of the packing and upload scripts.

####################
# CHUNKER SETTINGS #
####################
# start a new chunk when the current chunk is at least this large
MEGABYTES_PER_CHUNK=$((1024*25))

###################
# UPLOAD METADATA #
###################
# your Archive.org S3 keys
IA_AUTH="ACCESS_KEY:SECRET"

# the name of the collection to add the uploads to
IA_COLLECTION="archiveteam_TODO"

# the title of the items (" ${item_timestamp}" will be appended)
IA_ITEM_TITLE="Archive Team TODO:"

# the prefix of the item name ("${item_timestamp}" is appended)
IA_ITEM_PREFIX="archiveteam_todo_"

# the prefix of the megawarc filename ("${item_timestamp}" is appended)
FILE_PREFIX="todo_"

# the date field for the item
IA_ITEM_DATE=$( date +"%Y-%m" )

# offload items to another rsync storage instead of uploading to IA
OFFLOAD_TARGET="rsync://somewhere-far-away:portnum/module-name/directory/"
# It is also possible to provide a list of targets: the offloader will pick one
# at random and retry the others on failure. Simply comment out the line above
# and put the rsync target URLs, one per line, in a file called "offload_targets".

# the API for requesting the ZSTD dictionaries
ZST_DICTIONARY_API="API_URL"

###############
# DIRECTORIES #
###############
# Put your directories on one or two filesystems (see README).
FS1_BASE_DIR="/archiveteam/ssd/project"
FS2_BASE_DIR="/archiveteam/disk/project"

## THESE DIRECTORIES ON FILESYSTEM 1: for tars

# the rsync upload directory
# (the chunker will collect the .tar files in this directory)
INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads"

# the chunker working directory
# (this directory will hold the current in-progress chunk)
CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work"

# the chunker output directory / the packer queue
# (this directory will hold the completed chunks)
PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue"

# the packer working directory - warc side
# (this directory will hold the current chunk)
PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in"

## THESE DIRECTORIES ON FILESYSTEM 2: for megawarcs

# the packer working directory - megawarc side
# (this directory will hold the current megawarc)
PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out"

# the packer output directory / the upload queue
# (this directory will hold the completed megawarcs)
UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue"

# the uploader working directory
# (this directory will hold the current megawarc)
UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work"

# the final destination for uploaded megawarcs
# leave this empty to remove megawarcs after uploading
COMPLETED_DIR="${FS2_BASE_DIR}/uploaded"


# remove this
echo "config.sh not customised."
exit 1
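
If you use the target list instead of a single OFFLOAD_TARGET, the offload_targets file is plain rsync URLs, one per line; a hypothetical example:

    rsync://target-one.example.net:8000/tars/dir/
    rsync://target-two.example.net:8000/tars/dir/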



docker-boot.sh (+70, -0)

@@ -0,0 +1,70 @@
#!/bin/bash

set -e

test -d /data || {
  echo "No /data mount found"
  exit 1
}

mkdir -p /data/incoming /data/chunker-work /data/packing-queue /data/packer-work-in /data/packer-work-out /data/upload-queue /data/uploader-work

# keep IA_ITEM_DATE as a literal command substitution so the date is
# evaluated each time the generated config.sh is sourced
IA_ITEM_DATE_LIT='$( date +"%Y-%m" )'
cat > /factory/config.sh << EOF
#!/bin/bash
MEGABYTES_PER_CHUNK="${MEGABYTES_PER_CHUNK}"
IA_AUTH="${IA_AUTH}"
IA_COLLECTION="${IA_COLLECTION}"
IA_ITEM_TITLE="${IA_ITEM_TITLE}"
IA_ITEM_PREFIX="${IA_ITEM_PREFIX}"
FILE_PREFIX="${FILE_PREFIX}"
IA_ITEM_DATE="${IA_ITEM_DATE_LIT}"
OFFLOAD_TARGET="${OFFLOAD_TARGET}"
ZST_DICTIONARY_API="${ZST_DICTIONARY_API}"
LOAD_BALANCER="${LOAD_BALANCER}"
INCOMING_UPLOADS_DIR="/data/incoming"
CHUNKER_WORKING_DIR="/data/chunker-work"
PACKING_QUEUE_DIR="/data/packing-queue"
PACKER_WORKING_CHUNKS_DIR="/data/packer-work-in"
PACKER_WORKING_MEGAWARC_DIR="/data/packer-work-out"
UPLOAD_QUEUE_DIR="/data/upload-queue"
UPLOADER_WORKING_DIR="/data/uploader-work"
COMPLETED_DIR=""
EOF

touch /factory/RUN

case "${1}" in
  chunk|chunker|chunk-multiple)
    if test -z "${MEGABYTES_PER_CHUNK}"; then
      echo "Missing param: MEGABYTES_PER_CHUNK=${MEGABYTES_PER_CHUNK}"
      exit 1
    fi
    exec /factory/chunk-multiple
    ;;
  pack|pack-one|packer|pack-multiple)
    if test -z "${FILE_PREFIX}" || test -z "${ZST_DICTIONARY_API}"; then
      echo "Missing param: FILE_PREFIX=${FILE_PREFIX} ZST_DICTIONARY_API=${ZST_DICTIONARY_API}"
      exit 1
    fi
    exec /factory/pack-multiple
    ;;
  upload|upload-one|upload-multiple)
    if test -z "${IA_AUTH}" || test -z "${IA_COLLECTION}" || test -z "${IA_ITEM_TITLE}" || test -z "${IA_ITEM_PREFIX}" || test -z "${FILE_PREFIX}"; then
      echo "Missing param: IA_AUTH=${IA_AUTH} IA_COLLECTION=${IA_COLLECTION} IA_ITEM_TITLE=${IA_ITEM_TITLE} IA_ITEM_PREFIX=${IA_ITEM_PREFIX} FILE_PREFIX=${FILE_PREFIX}"
      exit 1
    fi
    exec /factory/upload-multiple
    ;;
  offload|offload-one|offload-multiple)
    if test -z "${OFFLOAD_TARGET}" && ! test -f "${PWD}/offload_targets"; then
      echo "Missing param: OFFLOAD_TARGET=${OFFLOAD_TARGET} and no ${PWD}/offload_targets existing"
      exit 1
    fi
    exec /factory/offload-multiple
    ;;
  *)
    echo "Usage: chunk|pack|upload|offload"
    exit 1
    ;;
esac
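
A hypothetical run of the uploader role (registry path from .drone.yml; mount and credentials illustrative):

    docker run -d --name tar-uploader \
        -v /srv/project:/data \
        -e IA_AUTH='ACCESS_KEY:SECRET' \
        -e IA_COLLECTION='archiveteam_todo' \
        -e IA_ITEM_TITLE='Archive Team TODO:' \
        -e IA_ITEM_PREFIX='archiveteam_todo_' \
        -e FILE_PREFIX='todo_' \
        atdr-writer.meo.ws/archiveteam/archiveteam-tar-uploader upload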

du-all (+14, -0)

@@ -0,0 +1,14 @@
#!/bin/bash
# This shows du -hs for the important directories.
source ./config.sh || exit 1

du -hs \
  "${INCOMING_UPLOADS_DIR}" \
  "${CHUNKER_WORKING_DIR}" \
  "${PACKING_QUEUE_DIR}/"* \
  "${PACKER_WORKING_CHUNKS_DIR}/"* \
  "${PACKER_WORKING_MEGAWARC_DIR}/"* \
  "${UPLOAD_QUEUE_DIR}/"* \
  "${UPLOADER_WORKING_DIR}/"* \
  2> >(grep -v 'du: cannot \(access\|read\)' >&2)


upload-multiple (+17, -0)

@@ -0,0 +1,17 @@
#!/bin/bash
# This loops the upload-one script while the RUN file exists.
# See upload-one for details.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
  "${SCRIPT_DIR}/upload-one"
  result="${?}"
  if [[ "${result}" -ne 0 ]]
  then
    date
    echo "uploader exited with ${result}"
    exit "${result}"
  fi
done


upload-one (+137, -0)

@@ -0,0 +1,137 @@
#!/bin/bash
# Uploads tar items from the upload queue.
# (Needs a config.sh in the working directory.)
#
# ./upload-one
#
# 1. Grabs an item from UPLOAD_QUEUE_DIR
# 2. Reserves the item by moving the directory to the
#    UPLOADER_WORKING_DIR
# 3. Uploads the item to s3.us.archive.org
# 4. Removes the source files from the working directory
#    If COMPLETED_DIR is set, uploaded files are moved there.
#
# The program exits with 1 on any non-transient error.
#

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

source ./config.sh || exit 1

mkdir -p "${UPLOAD_QUEUE_DIR}" || exit 1
mkdir -p "${UPLOADER_WORKING_DIR}" || exit 1

if [ ! -z "${COMPLETED_DIR}" ]
then
  mkdir -p "${COMPLETED_DIR}" || exit 1
fi

function mayicontinue {
  echo
  # echo "May I continue?"
  # read
  # echo
}

mayicontinue


# try to grab an item from UPLOAD_QUEUE_DIR
ITEM=none
while [[ "${ITEM}" = none ]]
do
  possible_item=$( ls -1 "${UPLOAD_QUEUE_DIR}" | grep -E '[0-9]{14}_[a-f0-9]{8}$' | sort | head -n 1 )
  if test -n "${possible_item}"
  then
    echo "Trying to grab ${possible_item}"
    if mv "${UPLOAD_QUEUE_DIR}/${possible_item}" "${UPLOADER_WORKING_DIR}/"
    then
      ITEM="${possible_item}"
    else
      echo "Failed to move ${possible_item}"
      sleep 5
    fi
  else
    date
    echo "No current item found!"
    # nothing queued; back off and let upload-multiple call us again
    sleep 30
    exit 0
  fi
done


echo "$( date ): Start uploading for item ${ITEM}" >> uploader.log

# upload the tar files
# (x-archive-size-hint tells IA up front how large the whole item will be)
size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )

find "${UPLOADER_WORKING_DIR}/${ITEM}" -type f -regextype posix-egrep -regex ".+\.tar$" -printf "%f\n" \
| while read -r filename
do
  result=1
  while [[ "${result}" -ne 0 ]]
  do
    curl -v --location --fail \
      --speed-limit 1 --speed-time 900 \
      --header "x-archive-queue-derive:1" \
      --header "x-amz-auto-make-bucket:1" \
      --header "x-archive-keep-old-version:1" \
      --header "x-archive-meta-collection:${IA_COLLECTION}" \
      --header "x-archive-meta-mediatype:data" \
      --header "x-archive-meta-title:${IA_ITEM_TITLE} ${ITEM}" \
      --header "x-archive-meta-date:${IA_ITEM_DATE}" \
      --header "x-archive-meta-language:eng" \
      --header "x-archive-meta-noarchivetorrent:true" \
      --header "x-archive-size-hint:${size_hint}" \
      --header "authorization: LOW ${IA_AUTH}" \
      --upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \
      "https://${LOAD_BALANCER:-s3}.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \
      > /dev/null
    result="${?}"
    if [[ "${result}" -ne 0 ]]
    then
      date
      echo "Error while uploading ${ITEM}, curl said ${result}"
      echo "Will retry in 30 seconds"
      sleep 30
    fi
  done
done

echo "Uploaded ${ITEM}"

echo "$( date ): Completed uploading for item ${ITEM}" >> uploader.log


mayicontinue


# move or remove the uploaded item
if [ -z "${COMPLETED_DIR}" ]
then
  # remove
  rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}"
  result="${?}"

  if [[ "${result}" -ne 0 ]]
  then
    date
    echo "rm -rf exited with ${result} for ${ITEM}"
    exit 1
  fi
else
  # move
  mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/"
  result="${?}"

  if [[ "${result}" -ne 0 ]]
  then
    date
    echo "mv exited with ${result} for ${ITEM}"
    exit 1
  fi
fi

exit 0
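
Once curl returns success for every file, the item should exist on archive.org; a quick check against the metadata API (identifier hypothetical, built as ${IA_ITEM_PREFIX}${ITEM}):

    curl -s 'https://archive.org/metadata/archiveteam_todo_20240101123456_1a2b3c4d'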

