Browse Source

Convert mover into a chunker.

master
Roelf Wichertjes 2 years ago
parent
commit
b6f0cf2217
Signed by: rewby GPG Key ID: 4C2B6D2972EE5423
4 changed files with 37 additions and 14 deletions
  1. +6
    -6
      Dockerfile
  2. +9
    -4
      entrypoint.sh
  3. +22
    -3
      mover.sh
  4. +0
    -1
      uploader.sh

+ 6
- 6
Dockerfile View File

@@ -5,9 +5,9 @@ ENV TINI_VERSION v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini

RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \
build-essential dpkg-dev devscripts cmake git python3 python3-dev python3-pip \
&& rm -rf /var/lib/apt/lists/*
#RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \
# build-essential dpkg-dev devscripts cmake git python3 python3-dev python3-pip \
# && rm -rf /var/lib/apt/lists/*

# Create data mount
RUN mkdir -p /data
@@ -18,9 +18,9 @@ WORKDIR /
COPY mover.sh /mover.sh
COPY entrypoint.sh /entrypoint.sh
COPY uploader.sh /uploader.sh
ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/pipeline/requirements.txt /requirements.txt
ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/uploader/uploader.py /uploader.py
#ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/pipeline/requirements.txt /requirements.txt
#ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/uploader/uploader.py /uploader.py

RUN pip3 install -r /requirements.txt
#RUN pip3 install -r /requirements.txt

ENTRYPOINT [ "/tini", "--", "/entrypoint.sh" ]

+ 9
- 4
entrypoint.sh View File

@@ -7,11 +7,16 @@ trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT
export SHARED_WARCS_DIR="${SHARED_WARCS_DIR:-/data/}"
export INCOMING="${INCOMING:-${SHARED_WARCS_DIR}/incoming/}"
export UPLOAD_QUEUE="${UPLOAD_QUEUE:-${SHARED_WARCS_DIR}/upload-queue/}"
export MOVER_WORKING_DIR="${MOVER_WORKING_DIR:-${SHARED_WARCS_DIR}/mover-work/}"

mkdir -pv "${INCOMING}"
chown nobody:nogroup "${INCOMING}"
mkdir -pv "${UPLOAD_QUEUE}"
chown nobody:nogroup "${UPLOAD_QUEUE}"
# makedir DIR
# Ensure DIR exists (creating any missing parents, verbosely) and then
# assign it to the nobody user and the nogroup group.
makedir() {
    local target_dir="$1"

    mkdir -pv "${target_dir}"
    chown nobody:nogroup "${target_dir}"
}

makedir "${INCOMING}"
makedir "${UPLOAD_QUEUE}"
makedir "${MOVER_WORKING_DIR}"

case "$1" in
"mover")


+ 22
- 3
mover.sh View File

@@ -3,10 +3,19 @@
set -euo pipefail

export INCOMING="${INCOMING:-/data/incoming/}"
export MOVER_WORKING_DIR="${MOVER_WORKING_DIR:-/data/mover-work/}"
export UPLOAD_QUEUE="${UPLOAD_QUEUE:-/data/upload-queue/}"
export MEGABYTES_PER_CHUNK="${MEGABYTES_PER_CHUNK:-$((1024*500))}"
export BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK))

mkdir -p "${MOVER_WORKING_DIR}"
mkdir -p "${UPLOAD_QUEUE}"
mkdir -p "${INCOMING}"

while [[ 1 ]] ; do
# find every .warc.gz in the rsync directory
mkdir -p "${MOVER_WORKING_DIR}/current"
export cur_size=$( du -B1 -s "${MOVER_WORKING_DIR}/current" | grep -oE "^[0-9]+" )

find "${INCOMING}" -type f -not -name ".*"\
| while read filename
do
@@ -15,9 +24,19 @@ while [[ 1 ]] ; do
then
continue
fi

echo "Moving ${filename}"
mkdir -vp "${UPLOAD_QUEUE}"
mv -v "${filename}" "${UPLOAD_QUEUE}/"
export cur_size=$((cur_size + $( du -B1 -s "${filename}" | grep -oE "^[0-9]+" )))
mkdir -p "${MOVER_WORKING_DIR}/current"
mv -v "${filename}" "${MOVER_WORKING_DIR}/current/"

if [[ "${cur_size}" -gt "${BYTES_PER_CHUNK}" ]]; then
timestamp=$( date +'%Y%m%d%H%M%S' )
uuid=$(cat /proc/sys/kernel/random/uuid | cut -d- -f1)
echo "Current archive is full, moving to ${timestamp}_${uuid}."
mv "${MOVER_WORKING_DIR}/current" "${UPLOAD_QUEUE}/${timestamp}_${uuid}"
export cur_size=0
fi
done

echo "Sleeping 30 seconds..."


+ 0
- 1
uploader.sh View File

@@ -1,3 +1,2 @@
#!/bin/bash
set -exuo pipefail
exec python3 -u /uploader.py "${UPLOAD_QUEUE}"

Loading…
Cancel
Save