@@ -5,9 +5,9 @@ ENV TINI_VERSION v0.19.0 | |||||
# Download the tini binary for the release selected by TINI_VERSION to /tini and mark it executable.
# NOTE(review): this ADD fetches from a URL with no checksum verification; consider
# `ADD --checksum=sha256:...` so a changed or tampered release asset fails the build.
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini | ||||
RUN chmod +x /tini | RUN chmod +x /tini | ||||
# Install the build toolchain, git, cmake and Python 3 (with headers and pip), then remove the
# apt package lists in the same layer.
# NOTE(review): the identical instruction also appears commented out immediately below; this text
# looks like both sides of a diff, so confirm which variant is the current one.
RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \ | |||||
build-essential dpkg-dev devscripts cmake git python3 python3-dev python3-pip \ | |||||
&& rm -rf /var/lib/apt/lists/* | |||||
#RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \ | |||||
# build-essential dpkg-dev devscripts cmake git python3 python3-dev python3-pip \ | |||||
# && rm -rf /var/lib/apt/lists/* | |||||
# Create data mount | # Create data mount | ||||
RUN mkdir -p /data | RUN mkdir -p /data | ||||
@@ -18,9 +18,9 @@ WORKDIR / | |||||
# Copy the mover, entrypoint and uploader shell scripts into the image root.
COPY mover.sh /mover.sh | COPY mover.sh /mover.sh | ||||
COPY entrypoint.sh /entrypoint.sh | COPY entrypoint.sh /entrypoint.sh | ||||
COPY uploader.sh /uploader.sh | COPY uploader.sh /uploader.sh | ||||
# Fetch the ArchiveBot pipeline requirements file and uploader script to /requirements.txt and /uploader.py.
# NOTE(review): both URLs track the `master` branch and carry no checksum, so builds are not
# reproducible; consider pinning a commit and using `ADD --checksum`.
# NOTE(review): the same two ADD lines also appear commented out below — confirm which variant is
# current. uploader.sh executes /uploader.py, so something must still provide that file.
ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/pipeline/requirements.txt /requirements.txt | |||||
ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/uploader/uploader.py /uploader.py | |||||
#ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/pipeline/requirements.txt /requirements.txt | |||||
#ADD https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/uploader/uploader.py /uploader.py | |||||
# Install the Python packages listed in /requirements.txt.
# NOTE(review): this step needs /requirements.txt from the ADD above; it also appears commented
# out on the next line — confirm which variant is current.
RUN pip3 install -r /requirements.txt | |||||
#RUN pip3 install -r /requirements.txt | |||||
# Exec-form entrypoint: /tini is the container's first process and launches /entrypoint.sh.
ENTRYPOINT [ "/tini", "--", "/entrypoint.sh" ] | ENTRYPOINT [ "/tini", "--", "/entrypoint.sh" ] |
@@ -7,11 +7,16 @@ trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT | |||||
# Resolve the directory layout. Every path can be overridden from the environment;
# unset or empty values fall back to locations underneath SHARED_WARCS_DIR.
: "${SHARED_WARCS_DIR:=/data/}"
: "${INCOMING:=${SHARED_WARCS_DIR}/incoming/}"
: "${UPLOAD_QUEUE:=${SHARED_WARCS_DIR}/upload-queue/}"
: "${MOVER_WORKING_DIR:=${SHARED_WARCS_DIR}/mover-work/}"
# Export the resolved values so child processes see them.
export SHARED_WARCS_DIR INCOMING UPLOAD_QUEUE MOVER_WORKING_DIR
# Create the incoming and upload-queue directories (with parents) and hand each to nobody:nogroup.
# NOTE(review): the makedir calls further down do the same work for these two paths; these four
# lines look like the pre-change side of a diff — confirm whether they are still meant to be present.
mkdir -pv "${INCOMING}" | |||||
chown nobody:nogroup "${INCOMING}" | |||||
mkdir -pv "${UPLOAD_QUEUE}" | |||||
chown nobody:nogroup "${UPLOAD_QUEUE}" | |||||
# makedir PATH
#   Create PATH (and any missing parent directories), logging each directory
#   created, then change its owner to nobody:nogroup.
#   $1 - directory to create
makedir() {
    local target_dir="$1"
    mkdir -pv "${target_dir}"
    chown nobody:nogroup "${target_dir}"
}

# Prepare each working directory named by the environment.
makedir "${INCOMING}"
makedir "${UPLOAD_QUEUE}"
makedir "${MOVER_WORKING_DIR}"
case "$1" in | case "$1" in | ||||
"mover") | "mover") | ||||
@@ -3,10 +3,19 @@ | |||||
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# Directory layout; each path may be overridden from the environment.
: "${INCOMING:=/data/incoming/}"
: "${MOVER_WORKING_DIR:=/data/mover-work/}"
: "${UPLOAD_QUEUE:=/data/upload-queue/}"

# Chunk size threshold: configured in megabytes, with the byte value derived from it.
: "${MEGABYTES_PER_CHUNK:=$((1024 * 500))}"
BYTES_PER_CHUNK=$((1024 * 1024 * MEGABYTES_PER_CHUNK))

export INCOMING MOVER_WORKING_DIR UPLOAD_QUEUE MEGABYTES_PER_CHUNK BYTES_PER_CHUNK

# Make sure all three directories exist before the main loop starts.
mkdir -p "${MOVER_WORKING_DIR}"
mkdir -p "${UPLOAD_QUEUE}"
mkdir -p "${INCOMING}"
while [[ 1 ]] ; do | while [[ 1 ]] ; do | ||||
# find every .warc.gz in the rsync directory | |||||
mkdir -p "${MOVER_WORKING_DIR}/current" | |||||
export cur_size=$( du -B1 -s "${MOVER_WORKING_DIR}/current" | grep -oE "^[0-9]+" ) | |||||
find "${INCOMING}" -type f -not -name ".*"\ | find "${INCOMING}" -type f -not -name ".*"\ | ||||
| while read filename | | while read filename | ||||
do | do | ||||
@@ -15,9 +24,19 @@ while [[ 1 ]] ; do | |||||
then | then | ||||
continue | continue | ||||
fi | fi | ||||
echo "Moving ${filename}" | echo "Moving ${filename}" | ||||
mkdir -vp "${UPLOAD_QUEUE}" | |||||
mv -v "${filename}" "${UPLOAD_QUEUE}/" | |||||
export cur_size=$((cur_size + $( du -B1 -s "${filename}" | grep -oE "^[0-9]+" ))) | |||||
mkdir -p "${MOVER_WORKING_DIR}/current" | |||||
mv -v "${filename}" "${MOVER_WORKING_DIR}/current/" | |||||
if [[ "${cur_size}" -gt "${BYTES_PER_CHUNK}" ]]; then | |||||
timestamp=$( date +'%Y%m%d%H%M%S' ) | |||||
uuid=$(cat /proc/sys/kernel/random/uuid | cut -d- -f1) | |||||
echo "Current archive is full, moving to ${timestamp}_${uuid}." | |||||
mv "${MOVER_WORKING_DIR}/current" "${UPLOAD_QUEUE}/${timestamp}_${uuid}" | |||||
export cur_size=0 | |||||
fi | |||||
done | done | ||||
echo "Sleeping 30 seconds..." | echo "Sleeping 30 seconds..." | ||||
@@ -1,3 +1,2 @@ | |||||
#!/bin/bash
# Run /uploader.py with the upload queue directory as its only argument.
# Strict mode plus xtrace: abort on errors, unset variables and pipeline
# failures, and echo each command before it runs.
set -o errexit -o xtrace -o nounset -o pipefail

# exec replaces this shell with the Python process; -u keeps its output unbuffered.
exec python3 -u /uploader.py "${UPLOAD_QUEUE}"