initial

commit e42f241a00 to master · arkiver, 1 year ago

11 changed files with 437 additions and 0 deletions
  1. .drone.yml         (+18, -0)
  2. .gitignore          (+1, -0)
  3. Dockerfile         (+12, -0)
  4. README.md           (+1, -0)
  5. chunk-multiple     (+21, -0)
  6. chunker            (+58, -0)
  7. config.example.sh  (+88, -0)
  8. docker-boot.sh     (+70, -0)
  9. du-all             (+14, -0)
 10. upload-multiple    (+17, -0)
 11. upload-one        (+137, -0)

.drone.yml (+18, -0)

@@ -0,0 +1,18 @@
---
kind: pipeline
name: default

steps:
- name: docker
  image: plugins/docker
  settings:
    registry: atdr-writer.meo.ws
    username:
      from_secret: atdr_user
    password:
      from_secret: atdr_pass
    repo: atdr-writer.meo.ws/archiveteam/archiveteam-tar-uploader
    dockerfile: Dockerfile
    purge: true
    auto_tag: false
    tags:
    - latest
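
The registry credentials are pulled from Drone secrets (atdr_user, atdr_pass). A hypothetical way to set them with the Drone CLI — repository slug and values illustrative:

    drone secret add --repository archiveteam/archiveteam-tar-uploader --name atdr_user --data 'writer-user'
    drone secret add --repository archiveteam/archiveteam-tar-uploader --name atdr_pass --data 'writer-pass'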

.gitignore (+1, -0)

@@ -0,0 +1 @@
*~

Dockerfile (+12, -0)

@@ -0,0 +1,12 @@
FROM debian:stretch-slim
RUN echo 'deb http://ftp.de.debian.org/debian buster-backports main' >> /etc/apt/sources.list
RUN DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io update \
    && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io install python python2.7 rsync git ca-certificates curl python-pip \
    && DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get -qqy --no-install-recommends -o Dpkg::Options::=--force-confdef -o Dpkg::Options::=--force-confold -o Dpkg::Options::=--force-unsafe-io -t buster-backports install zstd libzstd-dev libzstd1 \
    && pip install zstandard && pip install requests
COPY * factory/
RUN rm -rf /factory/megawarc && git clone https://github.com/archiveteam/megawarc.git /factory/megawarc
WORKDIR /factory
COPY docker-boot.sh /
RUN chmod +x /docker-boot.sh
ENTRYPOINT ["/docker-boot.sh"]
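
To build the image locally instead of through the pipeline above (tag illustrative):

    docker build -t archiveteam-tar-uploader .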

README.md (+1, -0)

@@ -0,0 +1 @@
This is a roughly edited version of https://github.com/ArchiveTeam/archiveteam-megawarc-factory, changed to upload tar files instead of packing and uploading WARCs.

chunk-multiple (+21, -0)

@@ -0,0 +1,21 @@
#!/bin/bash
# This loops the chunker script while the RUN file exists.
# See chunker for details.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
  date
  "${SCRIPT_DIR}/chunker"
  result="${?}"
  if [[ "${result}" -ne 0 ]]
  then
    date
    echo "chunker exited with ${result}"
    exit "${result}"
  fi

  echo "Sleeping..."
  sleep 1
done
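
The loop expects a RUN file (and, since chunker is called without arguments, a config.sh) in the working directory; a minimal sketch:

    touch RUN
    ./chunk-multiple
    # stop it after the current pass:
    rm RUN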


chunker (+58, -0)

@@ -0,0 +1,58 @@
#!/bin/bash
# Move uploaded .tar files to an archive directory.
# When the current chunk is large enough, move it to the upload
# queue and start a new chunk.
#
# Be careful: this script assumes that any file in the upload directory
# that has a name that ends with .tar is a fully uploaded file and
# can be moved somewhere else. Remember this when running rsync.
#

INCOMING_UPLOADS_DIR="${1}"   # /home/archiveteam/uploads
CHUNKER_WORKING_DIR="${2}"    # /home/archiveteam/processed
PACKING_QUEUE_DIR="${CHUNKER_WORKING_DIR}/archive"
MEGABYTES_PER_CHUNK=$((1024*25))

# if not specified in command-line arguments
if [ -z "${INCOMING_UPLOADS_DIR}" ]
then
  source ./config.sh || exit 1
fi

BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK))

mkdir -p "${CHUNKER_WORKING_DIR}" || exit 1
mkdir -p "${PACKING_QUEUE_DIR}" || exit 1

mkdir -p "${CHUNKER_WORKING_DIR}/current" || exit 1
cur_size=$( du -B1 -s "${CHUNKER_WORKING_DIR}/current" | grep -oE "^[0-9]+" )

# find every .tar in the upload directory
find "${INCOMING_UPLOADS_DIR}" -type f -regex ".+\.tar$" \
| while read -r filename
do
  # skip partial uploads
  if [[ "${filename}" =~ rsync-tmp ]]
  then
    continue
  fi

  cur_size=$((cur_size + $( du -B1 -s "${filename}" | grep -oE "^[0-9]+" )))

  # move to the current/ directory
  echo "Moving ${filename}"
  mkdir -p "${CHUNKER_WORKING_DIR}/current"
  mv "${filename}" "${CHUNKER_WORKING_DIR}/current/"

  # if the current/ directory is large enough, move it to the
  # upload queue as <timestamp>_<uuid> and start a new current/
  if [[ "${cur_size}" -gt "${BYTES_PER_CHUNK}" ]]
  then
    timestamp=$( date +'%Y%m%d%H%M%S' )
    uuid=$(cat /proc/sys/kernel/random/uuid | cut -d- -f1)
    echo "Current archive is full, moving to ${timestamp}_${uuid}."
    mv "${CHUNKER_WORKING_DIR}/current" "${UPLOAD_QUEUE_DIR}/${timestamp}_${uuid}"
    cur_size=0
  fi
done
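
Run with explicit directories as in the inline comments, or with no arguments to read ./config.sh:

    ./chunker /home/archiveteam/uploads /home/archiveteam/processed

A full chunk lands in the upload queue under a name like 20240101123456_1a2b3c4d (hypothetical timestamp plus the first UUID segment). Because any *.tar path is treated as fully uploaded, writers pushing into INCOMING_UPLOADS_DIR should keep partial files on a path containing rsync-tmp so the skip check above catches them — for example (an assumption based on that skip pattern, not from this repo) via rsync's --temp-dir:

    rsync -av --temp-dir=.rsync-tmp ./*.tar rsync://upload-host.example.net/incoming/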


config.example.sh (+88, -0)

@@ -0,0 +1,88 @@
#!/bin/bash
# Create a copy of this config.sh, customise it and place it in the
# working directory of the packing and upload scripts.

####################
# CHUNKER SETTINGS #
####################
# start a new chunk when the current chunk is at least this large
MEGABYTES_PER_CHUNK=$((1024*25))

###################
# UPLOAD METADATA #
###################
# your Archive.org S3 keys
IA_AUTH="ACCESS_KEY:SECRET"

# the name of the collection to add the uploads to
IA_COLLECTION="archiveteam_TODO"

# the title of the items (" ${item_timestamp}" will be appended)
IA_ITEM_TITLE="Archive Team TODO:"

# the prefix of the item name ("${item_timestamp}" is appended)
IA_ITEM_PREFIX="archiveteam_todo_"

# the prefix of the megawarc filename ("${item_timestamp}" is appended)
FILE_PREFIX="todo_"

# the date field for the item
IA_ITEM_DATE=$( date +"%Y-%m" )

# offload items to another rsync storage instead of uploading to IA
OFFLOAD_TARGET="rsync://somewhere-far-away:portnum/module-name/directory/"
# It is also possible to provide a list of targets: the offloader will pick one
# at random and retry the others on failure. Simply comment out the line above
# and put the rsync target URLs, one per line, in a file called "offload_targets".

# the API for requesting the ZSTD dictionaries
ZST_DICTIONARY_API="API_URL"

###############
# DIRECTORIES #
###############
# Put your directories on one or two filesystems (see README).
FS1_BASE_DIR="/archiveteam/ssd/project"
FS2_BASE_DIR="/archiveteam/disk/project"

## THESE DIRECTORIES ON FILESYSTEM 1: for tars

# the rsync upload directory
# (the chunker will collect the .tar files in this directory)
INCOMING_UPLOADS_DIR="${FS1_BASE_DIR}/incoming-uploads"

# the chunker working directory
# (this directory will hold the current in-progress chunk)
CHUNKER_WORKING_DIR="${FS1_BASE_DIR}/chunker-work"

# the chunker output directory / the packer queue
# (this directory will hold the completed chunks)
PACKING_QUEUE_DIR="${FS1_BASE_DIR}/packing-queue"

# the packer working directory - warc side
# (this directory will hold the current chunk)
PACKER_WORKING_CHUNKS_DIR="${FS1_BASE_DIR}/packer-work-in"

## THESE DIRECTORIES ON FILESYSTEM 2: for megawarcs

# the packer working directory - megawarc side
# (this directory will hold the current megawarc)
PACKER_WORKING_MEGAWARC_DIR="${FS2_BASE_DIR}/packer-work-out"

# the packer output directory / the upload queue
# (this directory will hold the completed megawarcs)
UPLOAD_QUEUE_DIR="${FS2_BASE_DIR}/upload-queue"

# the uploader working directory
# (this directory will hold the current megawarc)
UPLOADER_WORKING_DIR="${FS2_BASE_DIR}/uploader-work"

# the final destination for uploaded megawarcs
# leave this empty to remove megawarcs after uploading
COMPLETED_DIR="${FS2_BASE_DIR}/uploaded"


# remove this
echo "config.sh not customised."
exit 1
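
If you use the target list instead of a single OFFLOAD_TARGET, the offload_targets file is plain rsync URLs, one per line; a hypothetical example:

    rsync://target-one.example.net:8000/tars/dir/
    rsync://target-two.example.net:8000/tars/dir/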



docker-boot.sh (+70, -0)

@@ -0,0 +1,70 @@
#!/bin/bash

set -e

test -d /data || {
  echo "No /data mount found"
  exit 1
}

mkdir -p /data/incoming /data/chunker-work /data/packing-queue /data/packer-work-in /data/packer-work-out /data/upload-queue /data/uploader-work

# keep IA_ITEM_DATE as a literal command substitution so the date is
# evaluated each time the generated config.sh is sourced
IA_ITEM_DATE_LIT='$( date +"%Y-%m" )'
cat > /factory/config.sh << EOF
#!/bin/bash
MEGABYTES_PER_CHUNK="${MEGABYTES_PER_CHUNK}"
IA_AUTH="${IA_AUTH}"
IA_COLLECTION="${IA_COLLECTION}"
IA_ITEM_TITLE="${IA_ITEM_TITLE}"
IA_ITEM_PREFIX="${IA_ITEM_PREFIX}"
FILE_PREFIX="${FILE_PREFIX}"
IA_ITEM_DATE="${IA_ITEM_DATE_LIT}"
OFFLOAD_TARGET="${OFFLOAD_TARGET}"
ZST_DICTIONARY_API="${ZST_DICTIONARY_API}"
LOAD_BALANCER="${LOAD_BALANCER}"
INCOMING_UPLOADS_DIR="/data/incoming"
CHUNKER_WORKING_DIR="/data/chunker-work"
PACKING_QUEUE_DIR="/data/packing-queue"
PACKER_WORKING_CHUNKS_DIR="/data/packer-work-in"
PACKER_WORKING_MEGAWARC_DIR="/data/packer-work-out"
UPLOAD_QUEUE_DIR="/data/upload-queue"
UPLOADER_WORKING_DIR="/data/uploader-work"
COMPLETED_DIR=""
EOF

touch /factory/RUN

case "${1}" in
  chunk|chunker|chunk-multiple)
    if test -z "${MEGABYTES_PER_CHUNK}"; then
      echo "Missing param: MEGABYTES_PER_CHUNK=${MEGABYTES_PER_CHUNK}"
      exit 1
    fi
    exec /factory/chunk-multiple
    ;;
  pack|pack-one|packer|pack-multiple)
    if test -z "${FILE_PREFIX}" || test -z "${ZST_DICTIONARY_API}"; then
      echo "Missing param: FILE_PREFIX=${FILE_PREFIX} ZST_DICTIONARY_API=${ZST_DICTIONARY_API}"
      exit 1
    fi
    exec /factory/pack-multiple
    ;;
  upload|upload-one|upload-multiple)
    if test -z "${IA_AUTH}" || test -z "${IA_COLLECTION}" || test -z "${IA_ITEM_TITLE}" || test -z "${IA_ITEM_PREFIX}" || test -z "${FILE_PREFIX}"; then
      echo "Missing param: IA_AUTH=${IA_AUTH} IA_COLLECTION=${IA_COLLECTION} IA_ITEM_TITLE=${IA_ITEM_TITLE} IA_ITEM_PREFIX=${IA_ITEM_PREFIX} FILE_PREFIX=${FILE_PREFIX}"
      exit 1
    fi
    exec /factory/upload-multiple
    ;;
  offload|offload-one|offload-multiple)
    if test -z "${OFFLOAD_TARGET}" && ! test -f "${PWD}/offload_targets"; then
      echo "Missing param: OFFLOAD_TARGET=${OFFLOAD_TARGET} and no ${PWD}/offload_targets existing"
      exit 1
    fi
    exec /factory/offload-multiple
    ;;
  *)
    echo "Usage: chunk|pack|upload|offload"
    exit 1
    ;;
esac
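
A hypothetical run of the uploader role (registry path from .drone.yml; mount and credentials illustrative):

    docker run -d --name tar-uploader \
        -v /srv/project:/data \
        -e IA_AUTH='ACCESS_KEY:SECRET' \
        -e IA_COLLECTION='archiveteam_todo' \
        -e IA_ITEM_TITLE='Archive Team TODO:' \
        -e IA_ITEM_PREFIX='archiveteam_todo_' \
        -e FILE_PREFIX='todo_' \
        atdr-writer.meo.ws/archiveteam/archiveteam-tar-uploader upload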

du-all (+14, -0)

@@ -0,0 +1,14 @@
#!/bin/bash
# This shows du -hs for the important directories.
source ./config.sh || exit 1

du -hs \
  "${INCOMING_UPLOADS_DIR}" \
  "${CHUNKER_WORKING_DIR}" \
  "${PACKING_QUEUE_DIR}/"* \
  "${PACKER_WORKING_CHUNKS_DIR}/"* \
  "${PACKER_WORKING_MEGAWARC_DIR}/"* \
  "${UPLOAD_QUEUE_DIR}/"* \
  "${UPLOADER_WORKING_DIR}/"* \
  2> >(grep -v 'du: cannot \(access\|read\)' >&2)


upload-multiple (+17, -0)

@@ -0,0 +1,17 @@
#!/bin/bash
# This loops the upload-one script while the RUN file exists.
# See upload-one for details.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

while [[ -f RUN ]]
do
  "${SCRIPT_DIR}/upload-one"
  result="${?}"
  if [[ "${result}" -ne 0 ]]
  then
    date
    echo "uploader exited with ${result}"
    exit "${result}"
  fi
done


upload-one (+137, -0)

@@ -0,0 +1,137 @@
#!/bin/bash
# Uploads tar items from the upload queue.
# (Needs a config.sh in the working directory.)
#
# ./upload-one
#
# 1. Grabs an item from UPLOAD_QUEUE_DIR
# 2. Reserves the item by moving the directory to the
#    UPLOADER_WORKING_DIR
# 3. Uploads the item to s3.us.archive.org
# 4. Removes the source files from the working directory
#    If COMPLETED_DIR is set, uploaded files are moved there.
#
# The program exits with 1 on any non-transient error.
#

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

source ./config.sh || exit 1

mkdir -p "${UPLOAD_QUEUE_DIR}" || exit 1
mkdir -p "${UPLOADER_WORKING_DIR}" || exit 1

if [ ! -z "${COMPLETED_DIR}" ]
then
  mkdir -p "${COMPLETED_DIR}" || exit 1
fi

function mayicontinue {
  echo
  # echo "May I continue?"
  # read
  # echo
}

mayicontinue


# try to grab an item from UPLOAD_QUEUE_DIR
ITEM=none
while [[ "${ITEM}" = none ]]
do
  possible_item=$( ls -1 "${UPLOAD_QUEUE_DIR}" | grep -E '[0-9]{14}_[a-f0-9]{8}$' | sort | head -n 1 )
  if test -n "${possible_item}"
  then
    echo "Trying to grab ${possible_item}"
    if mv "${UPLOAD_QUEUE_DIR}/${possible_item}" "${UPLOADER_WORKING_DIR}/"
    then
      ITEM="${possible_item}"
    else
      echo "Failed to move ${possible_item}"
      sleep 5
    fi
  else
    date
    echo "No current item found!"
    # nothing queued; back off and let upload-multiple call us again
    sleep 30
    exit 0
  fi
done


echo "$( date ): Start uploading for item ${ITEM}" >> uploader.log

# upload the tar files
# (x-archive-size-hint tells IA up front how large the whole item will be)
size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )

find "${UPLOADER_WORKING_DIR}/${ITEM}" -type f -regextype posix-egrep -regex ".+\.tar$" -printf "%f\n" \
| while read -r filename
do
  result=1
  while [[ "${result}" -ne 0 ]]
  do
    curl -v --location --fail \
      --speed-limit 1 --speed-time 900 \
      --header "x-archive-queue-derive:1" \
      --header "x-amz-auto-make-bucket:1" \
      --header "x-archive-keep-old-version:1" \
      --header "x-archive-meta-collection:${IA_COLLECTION}" \
      --header "x-archive-meta-mediatype:data" \
      --header "x-archive-meta-title:${IA_ITEM_TITLE} ${ITEM}" \
      --header "x-archive-meta-date:${IA_ITEM_DATE}" \
      --header "x-archive-meta-language:eng" \
      --header "x-archive-meta-noarchivetorrent:true" \
      --header "x-archive-size-hint:${size_hint}" \
      --header "authorization: LOW ${IA_AUTH}" \
      --upload-file "${UPLOADER_WORKING_DIR}/${ITEM}/${filename}" \
      "https://${LOAD_BALANCER:-s3}.us.archive.org/${IA_ITEM_PREFIX}${ITEM}/${filename}" \
      > /dev/null
    result="${?}"
    if [[ "${result}" -ne 0 ]]
    then
      date
      echo "Error while uploading ${ITEM}, curl said ${result}"
      echo "Will retry in 30 seconds"
      sleep 30
    fi
  done
done

echo "Uploaded ${ITEM}"

echo "$( date ): Completed uploading for item ${ITEM}" >> uploader.log


mayicontinue


# move or remove the uploaded item
if [ -z "${COMPLETED_DIR}" ]
then
  # remove
  rm -rf "${UPLOADER_WORKING_DIR}/${ITEM}"
  result="${?}"

  if [[ "${result}" -ne 0 ]]
  then
    date
    echo "rm -rf exited with ${result} for ${ITEM}"
    exit 1
  fi
else
  # move
  mv "${UPLOADER_WORKING_DIR}/${ITEM}" "${COMPLETED_DIR}/"
  result="${?}"

  if [[ "${result}" -ne 0 ]]
  then
    date
    echo "mv exited with ${result} for ${ITEM}"
    exit 1
  fi
fi

exit 0
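
Once curl returns success for every file, the item should exist on archive.org; a quick check against the metadata API (identifier hypothetical, built as ${IA_ITEM_PREFIX}${ITEM}):

    curl -s 'https://archive.org/metadata/archiveteam_todo_20240101123456_1a2b3c4d'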

