first commit

Commit b90506b1a0 to master by Roelf Wichertjes, 2 years ago
Signed by: rewby (GPG Key ID: 4C2B6D2972EE5423)
4 changed files with 264 additions and 0 deletions

  1. Dockerfile (+82, -0)
  2. config.patch (+20, -0)
  3. docker-compose.yml (+70, -0)
  4. entrypoint.sh (+92, -0)

Dockerfile (+82, -0)

@@ -0,0 +1,82 @@
FROM debian:stretch as builder
RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \
        build-essential dpkg-dev devscripts cmake git libmnl-dev \
    && rm -rf /var/lib/apt/lists/*

# Build tcp_closer and package it as a .deb for the runtime stage
RUN git clone https://github.com/kristrev/tcp_closer.git /tcp_closer
RUN mkdir -p /tcp_closer/src/build
WORKDIR /tcp_closer/src/build
RUN cmake ..
RUN make package
RUN find . -type f

FROM python:3.6-stretch
RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \
        build-essential \
        libxml2-dev libxslt-dev zlib1g-dev libssl-dev libsqlite3-dev \
        libffi-dev git tmux fontconfig-config fonts-dejavu-core curl \
        libfontconfig1 libjpeg62-turbo-dev libjpeg62-turbo libjpeg-dev libjpeg-turbo-progs lsof ffmpeg \
        autossh rsync bundler python3 libtool pkg-config \
        autoconf automake libzmq3-dev libmnl0 \
    && rm -rf /var/lib/apt/lists/*

# Install tcp-closer
COPY --from=builder /tcp_closer/src/build/tcp-closer-0.1.1-Linux.deb /tcp-closer.deb
RUN dpkg -i /tcp-closer.deb

# Add user
RUN groupadd -g 1337 ab
RUN groupadd -r psudo
RUN useradd -rm -d /home/ab -s /bin/bash -g ab -G psudo -u 1337 ab

# Install Python dependencies and youtube-dl
RUN pip install websockets requests
RUN curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
RUN chmod a+rx /usr/local/bin/youtube-dl

# Setup output directory
RUN mkdir /data
RUN chown ab:ab /data

# Install tini
ENV TINI_VERSION v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini

# Give ab sudo for a bit
RUN echo "%psudo ALL=(ALL:ALL) NOPASSWD: ALL" > /etc/sudoers.d/psudo
RUN chmod 0440 /etc/sudoers.d/psudo

USER ab

# Clone the repo
RUN git clone --recursive https://github.com/ArchiveTeam/ArchiveBot.git /home/ab/ArchiveBot
WORKDIR /home/ab/ArchiveBot

# Patch shared_config.py to pass an explicit Loader to yaml.load (see config.patch)
COPY config.patch /home/ab/config.patch
RUN patch /home/ab/ArchiveBot/pipeline/archivebot/shared_config.py /home/ab/config.patch

# Setup symlinks for pipeline
RUN ln -s /usr/local/bin/wpull /home/ab/ArchiveBot/pipeline/wpull
RUN rm /home/ab/ArchiveBot/pipeline/youtube-dl
RUN ln -s /usr/local/bin/youtube-dl /home/ab/ArchiveBot/pipeline/youtube-dl

# Setup env var
ENV OPENSSL_CONF=/home/ab/ArchiveBot/ops/openssl-less-secure.cnf

# Bundle install
RUN bundle install
RUN cd plumbing && bundle install && cd ..

# Install pip dependencies
USER root
RUN pip install -r pipeline/requirements.txt

# Copy in entrypoint
COPY entrypoint.sh /entrypoint.sh

# Button up image
USER ab
ENTRYPOINT [ "/tini", "--", "/entrypoint.sh" ]


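The image is normally built through the compose file below, but a standalone build/run sketch looks like this (the image tag and Redis URL are illustrative, not part of the commit):

docker build -t archivebot-dev .
docker run --rm -p 8080:8080 -e REDIS_URL=redis://my-redis:6379/0 archivebot-dev dashboard
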
config.patch (+20, -0)

@@ -0,0 +1,20 @@
--- shared_config.py.orig 2022-02-24 20:30:36.509450521 +0100
@@ -1,12 +1,17 @@
 import os
 import yaml
+try:
+    from yaml import CLoader as Loader, CDumper as Dumper
+except ImportError:
+    from yaml import Loader, Dumper
+
 
 
 def config():
     my_dir = os.path.dirname(__file__)
     config_file = os.path.join(my_dir, '../../lib/shared_config.yml')
     with open(config_file, 'r') as f:
-        return yaml.load(f.read())
+        return yaml.load(f.read(), Loader=Loader)
 
 def log_channel():
     c = config()

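A quick way to sanity-check the patch against a fresh checkout before building the image (the paths are illustrative):

git clone --recursive https://github.com/ArchiveTeam/ArchiveBot.git /tmp/ArchiveBot
patch --dry-run /tmp/ArchiveBot/pipeline/archivebot/shared_config.py config.patch
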
docker-compose.yml (+70, -0)

@@ -0,0 +1,70 @@
version: "3.9"
services:
  redis:
    image: "redis:alpine"
  couchdb:
    image: "couchdb:2"
  bot:
    build: .
    command: "bot"
    environment:
      - IRC_URL=ircs://irc.hackint.org:6697
      - REDIS_URL=redis://redis:6379/0
      - IRC_CHANNEL=#archivebot-rewby-dev
      - IRC_NICK=ArchiveBotRewbyDev
      - COUCHDB_URL=http://couchdb:5984
  cogs:
    build: .
    command: "cogs"
    environment:
      - REDIS_URL=redis://redis:6379/0
      - COUCHDB_URL=http://couchdb:5984
  firehose:
    build: .
    command: "firehose"
    environment:
      - REDIS_URL=redis://redis:6379/0
  websocket:
    build: .
    command: "websocket"
    ports:
      - 4568:4568
    environment:
      - REDIS_URL=redis://redis:6379/0
      - FIREHOSE_SOCKET_URL=tcp://firehose:12345
  dashboard:
    build: .
    command: "dashboard"
    ports:
      - 8080:8080
    environment:
      - REDIS_URL=redis://redis:6379/0
  uploader:
    build: .
    command: "uploader"
    volumes:
      - warcs:/data
    environment:
      - RSYNC_URL=rsync://at-offload.hawc.eu/abtest/
  pipeline:
    build: .
    command: "pipeline"
    volumes:
      - warcs:/data
    environment:
      - REDIS_URL=redis://redis:6379/0
      - PIPELINE_CONCURRENT=10
      - PIPELINE_PREFIX=testpipeline
  analyzer:
    build: .
    command: "analyzer"
    environment:
      - REDIS_URL=redis://redis:6379/0
  trimmer:
    build: .
    command: "trimmer"
    environment:
      - REDIS_URL=redis://redis:6379/0
volumes:
  warcs:


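Not part of the commit, but a rough usage sketch for this compose file (service and volume names as defined above):

docker-compose build
docker-compose up -d redis couchdb
docker-compose up -d bot cogs firehose websocket dashboard pipeline uploader analyzer trimmer
# dashboard on http://localhost:8080, websocket feed on port 4568; finished WARCs land in the "warcs" volume
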
entrypoint.sh (+92, -0)

@@ -0,0 +1,92 @@
#!/bin/bash
set -exuo pipefail

if [ -z "${COUCHDB_URL+x}" ]; then
    echo "Skipping couchdb init"
else
    # Wait for CouchDB to come up, then create the databases and design documents.
    timeout 300 bash -c "while [[ \"\$(curl -s -o /dev/null -w ''%{http_code}'' ${COUCHDB_URL})\" != \"200\" ]]; do sleep 5; done" || false

    if [ -z "${COUCHDB_USER+x}" ]; then
        export COUCHDB_CURL_ARGS=""
    else
        export COUCHDB_CURL_ARGS="-u \"$COUCHDB_USER:$COUCHDB_PASSWORD\""
    fi

    pushd "db/design_docs"
    curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/_users
    curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot
    curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot_logs
    grep -v _rev archive_urls.json > /tmp/archive_urls.json
    grep -v _rev ignore_patterns.json > /tmp/ignore_patterns.json
    grep -v _rev jobs.json > /tmp/jobs.json
    grep -v _rev user_agents.json > /tmp/user_agents.json
    curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/archive_urls -d @/tmp/archive_urls.json
    curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/ignore_patterns -d @/tmp/ignore_patterns.json
    curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/jobs -d @/tmp/jobs.json
    curl -s $COUCHDB_CURL_ARGS -X PUT $COUCHDB_URL/archivebot/_design/user_agents -d @/tmp/user_agents.json
    popd
fi

export FINISHED_WARCS_DIR=${FINISHED_WARCS_DIR:-/data/}

case "$1" in
    "bot")
        cd bot
        if [ -z "${COUCHDB_USER+x}" ]; then
            export COUCHDB_ARGS=""
        else
            export COUCHDB_ARGS="--db-credentials \"$COUCHDB_USER:$COUCHDB_PASSWORD\""
        fi
        bundle exec ruby bot.rb \
            -s "$IRC_URL" \
            -r "$REDIS_URL" \
            -c "$IRC_CHANNEL" \
            -n "$IRC_NICK" \
            --db "$COUCHDB_URL/archivebot" $COUCHDB_ARGS
        ;;
    "cogs")
        if [ -z "${COUCHDB_USER+x}" ]; then
            export COUCHDB_ARGS=""
        else
            export COUCHDB_ARGS="--db-credentials \"$COUCHDB_USER:$COUCHDB_PASSWORD\" --log-db-credentials \"$COUCHDB_USER:$COUCHDB_PASSWORD\""
        fi
        bundle exec ruby cogs/start.rb \
            -r "$REDIS_URL" \
            --db "$COUCHDB_URL/archivebot" \
            --log-db "$COUCHDB_URL/archivebot_logs" $COUCHDB_ARGS
        ;;
    "firehose")
        export UPDATES_CHANNEL=updates
        export FIREHOSE_SOCKET_URL=tcp://0.0.0.0:12345
        plumbing/updates-listener | plumbing/log-firehose
        ;;
    "dashboard")
        bundle exec ruby dashboard/app.rb -u http://0.0.0.0:8080 -r "$REDIS_URL"
        ;;
    "websocket")
        plumbing/firehose-client | python3 dashboard/websocket.py
        ;;
    "pipeline")
        cd pipeline
        export PIPELINE_NAME="${PIPELINE_NAME:-${PIPELINE_PREFIX}-$(hostname -s)}"
        export NO_SCREEN=1
        # Periodically kill port-443 connections that have been idle too long (tcp-closer built in the first stage)
        sudo /usr/sbin/tcp-closer -4 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
        sudo /usr/sbin/tcp-closer -6 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
        run-pipeline3 pipeline.py --disable-web-server \
            --concurrent "$PIPELINE_CONCURRENT" "$PIPELINE_NAME"
        ;;
    "uploader")
        python ./uploader/uploader.py "$FINISHED_WARCS_DIR"
        ;;
    "analyzer")
        export UPDATES_CHANNEL=updates
        cd plumbing
        ./analyzer
        ;;
    "trimmer")
        export UPDATES_CHANNEL=updates
        cd plumbing
        ./trimmer > /dev/null
        ;;
esac

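Each compose service's command: value ends up as "$1" here, so e.g. the pipeline container effectively runs /tini -- /entrypoint.sh pipeline. A hand-run sketch (the credentials are illustrative and not set anywhere in this commit):

docker-compose run --rm -e COUCHDB_USER=admin -e COUCHDB_PASSWORD=secret bot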
