commit b90506b1a047797b95ddad7ec6ad7595f54e9c26
Author: Roelf Wichertjes
Date:   Thu Feb 24 23:16:14 2022 +0100

    first commit

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5aab934
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,82 @@
+FROM debian:stretch as builder
+RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \
+    build-essential dpkg-dev devscripts cmake git libmnl-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN git clone https://github.com/kristrev/tcp_closer.git /tcp_closer
+RUN mkdir -p tcp_closer/src/build
+WORKDIR /tcp_closer/src/build
+RUN cmake ..
+RUN make package
+RUN find . -type f
+
+FROM python:3.6-stretch
+RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \
+    build-essential \
+    libxml2-dev libxslt-dev zlib1g-dev libssl-dev libsqlite3-dev \
+    libffi-dev git tmux fontconfig-config fonts-dejavu-core curl \
+    libfontconfig1 libjpeg62-turbo-dev libjpeg62-turbo libjpeg-dev libjpeg-turbo-progs lsof ffmpeg \
+    autossh rsync bundler git tmux python3 libtool pkg-config \
+    build-essential autoconf automake libzmq3-dev libmnl0 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install tcp-closer
+COPY --from=builder /tcp_closer/src/build/tcp-closer-0.1.1-Linux.deb /tcp-closer.deb
+RUN dpkg -i /tcp-closer.deb
+
+# Add user
+RUN groupadd -g 1337 ab
+RUN groupadd -r psudo
+RUN useradd -rm -d /home/ab -s /bin/bash -g ab -G psudo -u 1337 ab
+
+# Install python dependencies
+RUN pip install websockets requests
+RUN curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
+
+# Setup output directory
+RUN mkdir /data
+RUN chown ab:ab /data
+RUN chmod a+rx /usr/local/bin/youtube-dl
+
+# Install tini
+ENV TINI_VERSION v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+
+# Give ab sudo for a bit
+RUN echo "%psudo ALL=(ALL:ALL) NOPASSWD: ALL" > /etc/sudoers.d/psudo
+RUN chmod 0440 /etc/sudoers.d/psudo
+
+USER ab
+
+# Clone the repo
+RUN git clone --recursive https://github.com/ArchiveTeam/ArchiveBot.git /home/ab/ArchiveBot
+WORKDIR /home/ab/ArchiveBot
+
+# Patch a file
+COPY config.patch /home/ab/config.patch
+RUN patch /home/ab/ArchiveBot/pipeline/archivebot/shared_config.py /home/ab/config.patch
+
+# Setup symlinks for pipeline
+RUN ln -s /usr/local/bin/wpull /home/ab/ArchiveBot/pipeline/wpull
+RUN rm /home/ab/ArchiveBot/pipeline/youtube-dl
+RUN ln -s /usr/local/bin/youtube-dl /home/ab/ArchiveBot/pipeline/youtube-dl
+
+# Setup env var
+ENV OPENSSL_CONF=/home/ab/ArchiveBot/ops/openssl-less-secure.cnf
+
+# Bundle install
+RUN bundle install
+RUN cd plumbing && bundle install && cd ..
+
+# Install pip dependencies
+USER root
+RUN pip install -r pipeline/requirements.txt
+
+# Copy in entrypoint
+COPY entrypoint.sh /entrypoint.sh
+
+# Button up image
+USER ab
+ENTRYPOINT [ "/tini", "--", "/entrypoint.sh" ]
+
diff --git a/config.patch b/config.patch
new file mode 100644
index 0000000..cc42c48
--- /dev/null
+++ b/config.patch
@@ -0,0 +1,21 @@
+--- shared_config.py.orig 2022-02-24 20:30:36.509450521 +0100
++++ shared_config.py 2022-02-24 20:31:13.029431468 +0100
+@@ -1,12 +1,17 @@
+ import os
+ import yaml
+ 
++try:
++    from yaml import CLoader as Loader, CDumper as Dumper
++except ImportError:
++    from yaml import Loader, Dumper
++
+ def config():
+     my_dir = os.path.dirname(__file__)
+     config_file = os.path.join(my_dir, '../../lib/shared_config.yml')
+ 
+     with open(config_file, 'r') as f:
+-        return yaml.load(f.read())
++        return yaml.load(f.read(), Loader=Loader)
+ 
+ def log_channel():
+     c = config()
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..d658f97
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,70 @@
+version: "3.9"
+services:
+  redis:
+    image: "redis:alpine"
+  couchdb:
+    image: "couchdb:2"
+  bot:
+    build: .
+    command: "bot"
+    environment:
+      - IRC_URL=ircs://irc.hackint.org:6697
+      - REDIS_URL=redis://redis:6379/0
+      - IRC_CHANNEL=#archivebot-rewby-dev
+      - IRC_NICK=ArchiveBotRewbyDev
+      - COUCHDB_URL=http://couchdb:5984
+  cogs:
+    build: .
+    command: "cogs"
+    environment:
+      - REDIS_URL=redis://redis:6379/0
+      - COUCHDB_URL=http://couchdb:5984
+  firehose:
+    build: .
+    command: "firehose"
+    environment:
+      - REDIS_URL=redis://redis:6379/0
+  websocket:
+    build: .
+    command: "websocket"
+    ports:
+      - 4568:4568
+    environment:
+      - REDIS_URL=redis://redis:6379/0
+      - FIREHOSE_SOCKET_URL=tcp://firehose:12345
+  dashboard:
+    build: .
+    command: "dashboard"
+    ports:
+      - 8080:8080
+    environment:
+      - REDIS_URL=redis://redis:6379/0
+  uploader:
+    build: .
+    command: "uploader"
+    volumes:
+      - warcs:/data
+    environment:
+      - RSYNC_URL=rsync://at-offload.hawc.eu/abtest/
+  pipeline:
+    build: .
+    command: "pipeline"
+    volumes:
+      - warcs:/data
+    environment:
+      - REDIS_URL=redis://redis:6379/0
+      - PIPELINE_CONCURRENT=10
+      - PIPELINE_PREFIX=testpipeline
+  analyzer:
+    build: .
+    command: "analyzer"
+    environment:
+      - REDIS_URL=redis://redis:6379/0
+  trimmer:
+    build: .
+    command: "trimmer"
+    environment:
+      - REDIS_URL=redis://redis:6379/0
+volumes:
+  warcs:
+
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100755
index 0000000..9562675
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+# Entrypoint for the ArchiveBot container image.
+#
+# Usage: entrypoint.sh <component>
+# where <component> is one of: bot, cogs, firehose, dashboard, websocket,
+# pipeline, uploader, analyzer, trimmer.
+#
+# When COUCHDB_URL is set, the script first waits for CouchDB and creates the
+# databases and design documents, then starts the requested component.
+set -exuo pipefail
+
+if [ -z "${COUCHDB_URL+x}" ]; then
+    echo "Skipping couchdb init"
+else
+    # Poll every 5 seconds, for at most 300 seconds, until CouchDB answers 200.
+    timeout 300 bash -c "while [[ \"\$(curl -s -o /dev/null -w ''%{http_code}'' ${COUCHDB_URL})\" != \"200\" ]]; do sleep 5; done" || false
+
+    # Optional credentials are kept in an array: quote characters stored inside
+    # a plain string reach curl literally once the string is word-split.
+    couchdb_curl_args=()
+    if [ -n "${COUCHDB_USER+x}" ]; then
+        couchdb_curl_args=(-u "$COUCHDB_USER:$COUCHDB_PASSWORD")
+    fi
+
+    # PUT to CouchDB; all arguments are handed to curl unchanged. The
+    # ${array[@]+...} form keeps "set -u" from failing on an empty array.
+    couchdb_put() {
+        curl -s ${couchdb_curl_args[@]+"${couchdb_curl_args[@]}"} -X PUT "$@"
+    }
+
+    pushd "db/design_docs"
+    couchdb_put "$COUCHDB_URL/_users"
+    couchdb_put "$COUCHDB_URL/archivebot"
+    couchdb_put "$COUCHDB_URL/archivebot_logs"
+    # Remove every line mentioning _rev before uploading the design documents.
+    grep -v _rev archive_urls.json > /tmp/archive_urls.json
+    grep -v _rev ignore_patterns.json > /tmp/ignore_patterns.json
+    grep -v _rev jobs.json > /tmp/jobs.json
+    grep -v _rev user_agents.json > /tmp/user_agents.json
+    couchdb_put "$COUCHDB_URL/archivebot/_design/archive_urls" -d @/tmp/archive_urls.json
+    couchdb_put "$COUCHDB_URL/archivebot/_design/ignore_patterns" -d @/tmp/ignore_patterns.json
+    couchdb_put "$COUCHDB_URL/archivebot/_design/jobs" -d @/tmp/jobs.json
+    couchdb_put "$COUCHDB_URL/archivebot/_design/user_agents" -d @/tmp/user_agents.json
+    popd
+fi
+
+export FINISHED_WARCS_DIR="${FINISHED_WARCS_DIR:-/data/}"
+
+case "${1:-}" in
+    "bot")
+        cd bot
+        couchdb_args=()
+        if [ -n "${COUCHDB_USER+x}" ]; then
+            couchdb_args=(--db-credentials "$COUCHDB_USER:$COUCHDB_PASSWORD")
+        fi
+        bundle exec ruby bot.rb \
+            -s "$IRC_URL" \
+            -r "$REDIS_URL" \
+            -c "$IRC_CHANNEL" \
+            -n "$IRC_NICK" \
+            --db "$COUCHDB_URL/archivebot" ${couchdb_args[@]+"${couchdb_args[@]}"}
+        ;;
+    "cogs")
+        couchdb_args=()
+        if [ -n "${COUCHDB_USER+x}" ]; then
+            couchdb_args=(
+                --db-credentials "$COUCHDB_USER:$COUCHDB_PASSWORD"
+                --log-db-credentials "$COUCHDB_USER:$COUCHDB_PASSWORD"
+            )
+        fi
+        bundle exec ruby cogs/start.rb \
+            -r "$REDIS_URL" \
+            --db "$COUCHDB_URL/archivebot" \
+            --log-db "$COUCHDB_URL/archivebot_logs" ${couchdb_args[@]+"${couchdb_args[@]}"}
+        ;;
+    "firehose")
+        export UPDATES_CHANNEL=updates
+        export FIREHOSE_SOCKET_URL=tcp://0.0.0.0:12345
+        plumbing/updates-listener | plumbing/log-firehose
+        ;;
+    "dashboard")
+        bundle exec ruby dashboard/app.rb -u http://0.0.0.0:8080 -r "$REDIS_URL"
+        ;;
+    "websocket")
+        plumbing/firehose-client | python3 dashboard/websocket.py
+        ;;
+    "pipeline")
+        cd pipeline
+        export PIPELINE_NAME="${PIPELINE_NAME:-${PIPELINE_PREFIX}-$(hostname -s)}"
+        export NO_SCREEN=1
+        # Background tcp-closer instances (-4/-6 presumably select IPv4/IPv6).
+        sudo /usr/sbin/tcp-closer -4 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
+        sudo /usr/sbin/tcp-closer -6 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
+        run-pipeline3 pipeline.py --disable-web-server \
+            --concurrent "$PIPELINE_CONCURRENT" "$PIPELINE_NAME"
+        ;;
+    "uploader")
+        python ./uploader/uploader.py "$FINISHED_WARCS_DIR"
+        ;;
+    "analyzer")
+        export UPDATES_CHANNEL=updates
+        cd plumbing
+        ./analyzer
+        ;;
+    "trimmer")
+        export UPDATES_CHANNEL=updates
+        cd plumbing
+        ./trimmer > /dev/null
+        ;;
+    *)
+        # Fail loudly instead of exiting 0 on a missing or misspelled component.
+        echo "Unknown component: '${1:-}'" >&2
+        exit 1
+        ;;
+esac
+