commit 8a650cc865dc80ea7130464427382c34134ece68
Author: Alard
Date:   Fri Apr 5 22:18:07 2013 +0200

    First commit.

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7fb73b0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,50 @@
+Archive Team megawarc factory
+=============================
+Some scripts to process ArchiveTeam uploads. Use at your own risk; the scripts will need per-project adjustment.
+
+These scripts make batches of uploaded warc.gz files, combine them into megawarcs and upload them to their permanent home on Archive.org.
+
+Three processes work together to make this happen:
+
+1. The chunker
+--------------
+The chunker moves uploaded warc.gz files from the upload directory to a batch directory. When this directory has grown to 50GB, the chunker begins a new directory and moves the completed directory to the packing queue.
+
+There can only be one chunker per upload directory. Chunking doesn't take long, if the files are not moving to a different filesystem.
+
+2. The packer
+-------------
+The packer monitors the packing queue. When the chunker brings a new directory, the packer removes the directory from the queue and starts converting it into a megawarc (using the megawarc utility). When that is done, the packer moves the megawarc to the upload queue and removes the original warc files.
+
+If necessary, multiple packers can work the same queue. Packing involves lots of gzipping and takes some time.
+
+3. The uploader
+---------------
+The uploader monitors the upload queue. When the packer brings a new megawarc, the uploader removes the megawarc from the queue and uploads it to Archive.org. If the upload is successful, the uploader removes the megawarc.
+
+If necessary, multiple uploaders can work the same queue.
+
+
+Filesystems
+-----------
+From the chunker to the uploader, the chunks move through the system as timestamped directories (e.g., 20130401213900). This timestamp will also be used in the name of the uploaded item on Archive.org. The queues are directories. Processes "claim" a chunk by moving it from the queue directory to their working directory. This assumes that `mv` is an atomic operation.
+
+For efficiency and to maintain the atomicity of `mv`, the filesystem of the directories is very important:
+
+1. The Rsync upload directory, the chunker working directory, the packing queue and that side of the packer's working directory should all be on the same filesystem. This ensures that the uploaded warc.gz files never move to a different filesystem.
+2. The megawarc side of the packer's working directory, the upload queue and the uploader's working directory should also share a filesystem.
+
+Filesystems 1 and 2 do not have to be the same.
+
+
+Configuration
+-------------
+See each script for the configuration options and arguments. Part of the configuration is in the `config.sh` file that you should place in the working directory of each script. Another part of the configuration is in the script's arguments. The working directory itself is also important for some of the scripts.
+
+
+Requirements
+------------
+These scripts use Bash and Curl.
+
+You should clone https://github.com/alard/megawarc to the megawarc/ subdirectory of these scripts. The megawarc utility requires Python and Gzip.
+
diff --git a/chunker b/chunker
new file mode 100755
index 0000000..eaafb1c
--- /dev/null
+++ b/chunker
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Move uploaded .warc.gz files to an archive directory.
+# When the archive is large enough, move it to archive/TIMESTAMP and
+# start with a new archive.
+#
+# Be careful: this script assumes that any file in the upload directory
+# that has a name that ends with *.warc.gz is a fully uploaded file and
+# can be moved somewhere else. Remember this when running Rsync.
+#
+
+PATH_TO_UPLOADS=$1 # /home/archiveteam/uploads
+PATH_TO_TARGET=$2 # /home/archiveteam/processed
+MAX_MEGABYTES=$((1024*50))
+
+# find every .warc.gz in the upload directory
+find "$PATH_TO_UPLOADS" -type f -name "*.warc.gz" \
+| while IFS= read -r filename
+do
+  # skip partial uploads
+  if [[ $filename =~ rsync-tmp ]]
+  then
+    continue
+  fi
+
+  # move to the current/ directory
+  echo "Moving ${filename}"
+  mkdir -p "${PATH_TO_TARGET}/current"
+  mv -- "${filename}" "${PATH_TO_TARGET}/current/"
+
+  # if the current/ directory is large enough,
+  # rename it to archive/TIMESTAMP and start a new current/
+  cur_size=$( du -BM -s "${PATH_TO_TARGET}/current" | grep -oE "^[0-9]+" )
+  if [[ $cur_size -gt $MAX_MEGABYTES ]]
+  then
+    timestamp=$( date +'%Y%m%d%H%M%S' )
+    echo "Current archive is full, moving to ${timestamp}."
+    mkdir -p "${PATH_TO_TARGET}/archive"
+    mv "${PATH_TO_TARGET}/current" "${PATH_TO_TARGET}/archive/${timestamp}"
+
+    # perhaps do something to the ${PATH_TO_TARGET}/archive/${timestamp}
+    # e.g. ./make-tar-and-upload.sh ${timestamp}
+  fi
+done
+
diff --git a/config.example.sh b/config.example.sh
new file mode 100755
index 0000000..784d768
--- /dev/null
+++ b/config.example.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copy this file to config.sh, customise it (remove the echo/exit guard) and
+# place it in the working directories of the packing and upload scripts.
+
+echo "config.sh not customised."
+exit 1
+
+# your Archive.org S3 keys
+IA_AUTH="ACCESS_KEY:SECRET"
+
+# the name of the collection to add the uploads to
+IA_COLLECTION="archiveteam_TODO"
+
+# the title of the items (" ${item_timestamp}" will be appended)
+IA_ITEM_TITLE="Archive Team TODO:"
+
+# the prefix of the item name ("${item_timestamp}" is appended)
+IA_ITEM_PREFIX="archiveteam_todo_"
+
+# the prefix of the megawarc filename ("${item_timestamp}" is appended)
+FILE_PREFIX="todo_"
+
+# the date field for the item
+IA_ITEM_DATE="2013-04"
+
diff --git a/pack-multiple-without-upload b/pack-multiple-without-upload
new file mode 100755
index 0000000..0ec18e7
--- /dev/null
+++ b/pack-multiple-without-upload
@@ -0,0 +1,17 @@
+#!/bin/bash
+# This loops the pack-one-without-upload script while the RUN file exists.
+# See pack-one-without-upload for details.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+while [[ -f RUN ]]
+do
+  "$SCRIPT_DIR/pack-one-without-upload" "$1" "$2" "$3"
+  result=$?
+  if [[ $result -ne 0 ]]
+  then
+    date
+    echo "packer exited with $result"
+    exit "$result"
+  fi
+done
+
diff --git a/pack-one-without-upload b/pack-one-without-upload
new file mode 100755
index 0000000..b4bd748
--- /dev/null
+++ b/pack-one-without-upload
@@ -0,0 +1,121 @@
+#!/bin/bash
+# Feeds the upload queue with megawarcs.
+#
+# ./pack-one-without-upload PROCESSED_DIR TARGET_DIR UPLOAD_QUEUE_DIR
+#
+# 1. Grabs an item from PROCESSED_DIR
+# 2. Reserves the item by moving the directory to the working directory
+# 3. Makes a megawarc in the TARGET_DIR
+# 4. Removes the source files from the working directory
+# 5. Moves the megawarc to UPLOAD_QUEUE_DIR
+#
+# The program exits with 1 on any nontransient error.
+#
+# run from the packer directory /archiveteam/packer-1/
+#
+# ./pack-one-without-upload /archiveteam/processed/archive /archiveteam/ssd1/packer-1 /archiveteam/ssd1/upload-queue
+#
+
+PROCESSED_DIR=$1
+TARGET_DIR=$2
+UPLOAD_QUEUE_DIR=$3
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+MEGAWARC=$SCRIPT_DIR/megawarc/megawarc
+
+if [ ! -f ./config.sh ] ; then
+  echo "config.sh not found in current directory."
+  exit 1
+fi
+source ./config.sh
+
+
+function mayicontinue {
+  echo
+# echo "May I continue?"
+# read
+# echo
+}
+
+mkdir -p "$TARGET_DIR"
+mkdir -p "$UPLOAD_QUEUE_DIR"
+
+
+# check if the upload queue is empty
+# if [ "$( ls -A $UPLOAD_QUEUE_DIR )" ]
+# then
+#   echo "Upload queue not empty. Wait."
+#   sleep 30
+#   exit 0
+# fi
+
+
+mayicontinue
+
+
+# try to grab a directory from /archiveteam/processed/archive/
+ITEM=none
+while [[ $ITEM = none ]]
+do
+  possible_item=$( ls -1 "$PROCESSED_DIR/" | grep 201 | sort | head -n 1 )
+  if [[ $possible_item =~ 201 ]]
+  then
+    echo "Trying to grab $possible_item"
+    if mv "$PROCESSED_DIR/$possible_item" .
+    then
+      ITEM=$possible_item
+    else
+      echo "Failed to move $possible_item"
+      sleep 5
+    fi
+  else
+    date
+    echo "No current item found!"
+    sleep 30
+    exit 0
+  fi
+done
+
+
+mayicontinue
+
+
+echo "$( date ): Starting megawarc for item $ITEM" >> packer.log
+
+# construct a megawarc
+mkdir -p "$TARGET_DIR/$ITEM"
+"$MEGAWARC" --verbose pack "$TARGET_DIR/$ITEM/${FILE_PREFIX}${ITEM}" "$ITEM"
+result=$?
+
+if [[ $result -ne 0 ]]
+then
+  date
+  echo "megawarc exited with $result for $ITEM"
+  exit 1
+fi
+
+echo "$( date ): Completed megawarc for item $ITEM" >> packer.log
+
+
+mayicontinue
+
+
+# remove files
+echo "megawarc OK, removing source files"
+rm -rf -- "$ITEM"
+result=$?
+
+if [[ $result -ne 0 ]]
+then
+  date
+  echo "rm -rf source files exited with $result for $ITEM"
+  exit 1
+fi
+
+
+echo "add to upload queue"
+# a failed move must stop the packer, or the megawarc is stranded in TARGET_DIR
+mv "$TARGET_DIR/$ITEM" "$UPLOAD_QUEUE_DIR" || exit 1
+
+exit 0
+
diff --git a/upload-multiple b/upload-multiple
new file mode 100755
index 0000000..c2707fa
--- /dev/null
+++ b/upload-multiple
@@ -0,0 +1,17 @@
+#!/bin/bash
+# This loops the upload-one script while the RUN file exists.
+# See upload-one for details.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+while [[ -f RUN ]]
+do
+  "$SCRIPT_DIR/upload-one" "$1"
+  result=$?
+  if [[ $result -ne 0 ]]
+  then
+    date
+    echo "uploader exited with $result"
+    exit "$result"
+  fi
+done
+
diff --git a/upload-one b/upload-one
new file mode 100755
index 0000000..7e2ed88
--- /dev/null
+++ b/upload-one
@@ -0,0 +1,121 @@
+#!/bin/bash
+# Uploads megawarcs from the upload queue.
+#
+# ./upload-one UPLOAD_QUEUE_DIR
+#
+# 1. Grabs an item from UPLOAD_QUEUE_DIR
+# 2. Reserves the item by moving the directory to the working directory
+# 3. Uploads the item to s3.us.archive.org
+# 4. Removes the source files from the working directory
+#
+# The program exits with 1 on any nontransient error.
+#
+# run from the upload directory /archiveteam/ssd1/uploader-1/
+#
+# ./upload-one /archiveteam/ssd1/upload-queue
+#
+#
+
+UPLOAD_QUEUE_DIR=$1
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+if [ ! -f ./config.sh ] ; then
+  echo "config.sh not found in current directory."
+  exit 1
+fi
+source ./config.sh
+
+
+function mayicontinue {
+  echo
+# echo "May I continue?"
+# read
+# echo
+}
+
+mayicontinue
+
+
+# try to grab an item from UPLOAD_QUEUE_DIR
+ITEM=none
+while [[ $ITEM = none ]]
+do
+  possible_item=$( ls -1 "$UPLOAD_QUEUE_DIR" | grep 201 | sort | head -n 1 )
+  if [[ $possible_item =~ 201 ]]
+  then
+    echo "Trying to grab $possible_item"
+    if mv "$UPLOAD_QUEUE_DIR/$possible_item" .
+    then
+      ITEM=$possible_item
+    else
+      echo "Failed to move $possible_item"
+      sleep 5
+    fi
+  else
+    date
+    echo "No current item found!"
+    sleep 30
+    exit 0
+  fi
+done
+
+
+echo "$( date ): Start uploading for item $ITEM" >> uploader.log
+
+# upload megawarc over HTTPS so the S3 keys are not sent in cleartext
+# (upload the large files first to optimise S3 snowballing)
+for ext in warc.gz tar json.gz
+do
+  result=1
+  while [[ $result -ne 0 ]]
+  do
+    filename=${FILE_PREFIX}${ITEM}.megawarc.${ext}
+    size_hint=$( du --bytes -s "${ITEM}/${filename}" | grep -oE "^[0-9]+" )
+    curl -v --location --fail \
+      --speed-limit 1 --speed-time 900 \
+      --header "x-archive-queue-derive:1" \
+      --header "x-amz-auto-make-bucket:1" \
+      --header "x-archive-meta-collection:${IA_COLLECTION}" \
+      --header "x-archive-meta-mediatype:web" \
+      --header "x-archive-meta-title:${IA_ITEM_TITLE} ${ITEM}" \
+      --header "x-archive-meta-date:${IA_ITEM_DATE}" \
+      --header "x-archive-meta-language:eng" \
+      --header "x-archive-size-hint:$size_hint" \
+      --header "authorization: LOW ${IA_AUTH}" \
+      --upload-file "${ITEM}/${filename}" \
+      "https://s3.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \
+      > /dev/null
+    result=$?
+    if [[ $result -ne 0 ]]
+    then
+      date
+      echo "Error while uploading $ITEM, curl said $result"
+      echo "Will retry in 30 seconds"
+      sleep 30
+    fi
+  done
+done
+
+echo "Uploaded $ITEM"
+
+echo "$( date ): Completed uploading for item $ITEM" >> uploader.log
+
+
+mayicontinue
+
+
+# remove megawarc
+rm -rf -- "${ITEM}"
+result=$?
+
+if [[ $result -ne 0 ]]
+then
+  date
+  echo "rm -rf megawarc exited with $result for $ITEM"
+  exit 1
+fi
+
+
+exit 0
+