@@ -0,0 +1,82 @@ | |||||
# Build stage: fetches the tcp_closer sources and packages them with CPack.
# Only the package produced in /tcp_closer/src/build is consumed by the
# runtime stage; everything else in this stage is discarded.
FROM debian:stretch AS builder

# Toolchain and headers needed to configure, compile and package tcp_closer.
RUN apt-get update \
    && apt-get dist-upgrade -y \
    && apt-get install -y \
        build-essential \
        cmake \
        devscripts \
        dpkg-dev \
        git \
        libmnl-dev \
    && rm -rf /var/lib/apt/lists/*

RUN git clone https://github.com/kristrev/tcp_closer.git /tcp_closer

# WORKDIR creates the out-of-tree build directory when it does not exist yet.
WORKDIR /tcp_closer/src/build

# Configure, build the package, then list the generated files in the build
# log so the name of the produced package is visible.
RUN cmake .. \
    && make package \
    && find . -type f
# Runtime stage: Python 3.6 base plus the system packages ArchiveBot needs.
FROM python:3.6-stretch

# System dependencies, one per line and sorted so duplicates are easy to spot.
# Recommended packages are intentionally still installed (no
# --no-install-recommends): later build steps may rely on tools that only
# arrive that way.
# sudo is listed explicitly because this image writes /etc/sudoers.d/psudo and
# entrypoint.sh invokes `sudo`; it must not depend on sudo being pulled in as
# a side effect of some other package.
RUN apt-get update && apt-get dist-upgrade -y && apt-get install -y \
        autoconf \
        automake \
        autossh \
        build-essential \
        bundler \
        curl \
        ffmpeg \
        fontconfig-config \
        fonts-dejavu-core \
        git \
        libffi-dev \
        libfontconfig1 \
        libjpeg-dev \
        libjpeg-turbo-progs \
        libjpeg62-turbo \
        libjpeg62-turbo-dev \
        libmnl0 \
        libsqlite3-dev \
        libssl-dev \
        libtool \
        libxml2-dev \
        libxslt-dev \
        libzmq3-dev \
        lsof \
        pkg-config \
        python3 \
        rsync \
        sudo \
        tmux \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install tcp-closer
# The builder clones an unpinned branch, so the version embedded in the
# package file name can change; the wildcard keeps this step working as long
# as exactly one tcp-closer package is produced.
COPY --from=builder /tcp_closer/src/build/tcp-closer-*-Linux.deb /tcp-closer.deb
RUN dpkg -i /tcp-closer.deb
# Add user
# Unprivileged account "ab" (uid/gid 1337, home /home/ab) that is also a
# member of the system group "psudo", which is granted sudo rights below.
RUN groupadd -g 1337 ab \
    && groupadd -r psudo \
    && useradd -rm -d /home/ab -s /bin/bash -g ab -G psudo -u 1337 ab

# Install python dependencies
RUN pip install --no-cache-dir websockets requests

# Install youtube-dl
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page as the "binary"; the download and chmod share one layer.
RUN curl -fsSL https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl \
    && chmod a+rx /usr/local/bin/youtube-dl

# Setup output directory
RUN mkdir /data \
    && chown ab:ab /data

# Install tini
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini

# Give ab sudo
# Members of psudo may run any command as any user without a password.
# NOTE(review): nothing later in this file removes the rule, so the final
# image keeps it; entrypoint.sh uses sudo to start tcp-closer.
RUN echo "%psudo ALL=(ALL:ALL) NOPASSWD: ALL" > /etc/sudoers.d/psudo \
    && chmod 0440 /etc/sudoers.d/psudo
# Everything up to the pip install below runs as the unprivileged user so the
# checkout under /home/ab is owned by ab.
USER ab

# Clone the repo
RUN git clone --recursive https://github.com/ArchiveTeam/ArchiveBot.git /home/ab/ArchiveBot
WORKDIR /home/ab/ArchiveBot

# Patch a file
# config.patch is applied to the pipeline's shared_config.py.
COPY config.patch /home/ab/config.patch
RUN patch /home/ab/ArchiveBot/pipeline/archivebot/shared_config.py /home/ab/config.patch

# Setup symlinks for pipeline
# Point the pipeline at the system-wide wpull and youtube-dl executables.
# rm -f tolerates the bundled youtube-dl file being absent from the checkout.
RUN ln -s /usr/local/bin/wpull /home/ab/ArchiveBot/pipeline/wpull \
    && rm -f /home/ab/ArchiveBot/pipeline/youtube-dl \
    && ln -s /usr/local/bin/youtube-dl /home/ab/ArchiveBot/pipeline/youtube-dl

# Setup env var
ENV OPENSSL_CONF=/home/ab/ArchiveBot/ops/openssl-less-secure.cnf

# Bundle install
# Once for the top-level Gemfile and once for plumbing/; WORKDIR is switched
# instead of using `cd` inside RUN, then restored.
RUN bundle install
WORKDIR /home/ab/ArchiveBot/plumbing
RUN bundle install
WORKDIR /home/ab/ArchiveBot

# Install pip dependencies
# Installed as root into the image-wide Python environment.
USER root
RUN pip install --no-cache-dir -r pipeline/requirements.txt

# Copy in entrypoint
# The explicit chmod means the image does not depend on the executable bit
# surviving in the build context.
COPY entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh

# Button up image
# Drop back to the unprivileged user; tini runs as PID 1 and starts the
# entrypoint script.
USER ab
ENTRYPOINT [ "/tini", "--", "/entrypoint.sh" ]
@@ -0,0 +1,21 @@ | |||||
--- shared_config.py.orig 2022-02-24 20:30:36.509450521 +0100 | |||||
@@ -1,12 +1,17 @@ | |||||
import os | |||||
import yaml | |||||
+try: | |||||
+ from yaml import CLoader as Loader, CDumper as Dumper | |||||
+except ImportError: | |||||
+ from yaml import Loader, Dumper | |||||
+ | |||||
def config(): | |||||
my_dir = os.path.dirname(__file__) | |||||
config_file = os.path.join(my_dir, '../../lib/shared_config.yml') | |||||
with open(config_file, 'r') as f: | |||||
- return yaml.load(f.read()) | |||||
+ return yaml.load(f.read(), Loader=Loader) | |||||
def log_channel(): | |||||
c = config() |
@@ -0,0 +1,70 @@ | |||||
# Development stack for ArchiveBot. Every ArchiveBot service is built from the
# Dockerfile in this directory and differs only in the command handed to the
# image's entrypoint and in its environment.
version: "3.9"

services:
  redis:
    image: "redis:alpine"

  couchdb:
    image: "couchdb:2"

  # IRC bot.
  bot:
    build: .
    command: "bot"
    # depends_on only orders container start-up; it does not wait for the
    # dependency to be ready to accept connections.
    depends_on:
      - redis
      - couchdb
    environment:
      - IRC_URL=ircs://irc.hackint.org:6697
      - REDIS_URL=redis://redis:6379/0
      # Quoted so the '#' can never be read as the start of a YAML comment.
      - "IRC_CHANNEL=#archivebot-rewby-dev"
      - IRC_NICK=ArchiveBotRewbyDev
      - COUCHDB_URL=http://couchdb:5984

  cogs:
    build: .
    command: "cogs"
    depends_on:
      - redis
      - couchdb
    environment:
      - REDIS_URL=redis://redis:6379/0
      - COUCHDB_URL=http://couchdb:5984

  firehose:
    build: .
    command: "firehose"
    depends_on:
      - redis
    environment:
      - REDIS_URL=redis://redis:6379/0

  # Connects to the firehose service and serves it on port 4568.
  websocket:
    build: .
    command: "websocket"
    depends_on:
      - redis
      - firehose
    ports:
      # Port mappings are quoted so YAML always parses them as strings.
      - "4568:4568"
    environment:
      - REDIS_URL=redis://redis:6379/0
      - FIREHOSE_SOCKET_URL=tcp://firehose:12345

  dashboard:
    build: .
    command: "dashboard"
    depends_on:
      - redis
    ports:
      - "8080:8080"
    environment:
      - REDIS_URL=redis://redis:6379/0

  # Shares the "warcs" volume with the pipeline service.
  uploader:
    build: .
    command: "uploader"
    volumes:
      - warcs:/data
    environment:
      - RSYNC_URL=rsync://at-offload.hawc.eu/abtest/

  pipeline:
    build: .
    command: "pipeline"
    depends_on:
      - redis
    volumes:
      - warcs:/data
    environment:
      - REDIS_URL=redis://redis:6379/0
      - PIPELINE_CONCURRENT=10
      - PIPELINE_PREFIX=testpipeline

  analyzer:
    build: .
    command: "analyzer"
    depends_on:
      - redis
    environment:
      - REDIS_URL=redis://redis:6379/0

  trimmer:
    build: .
    command: "trimmer"
    depends_on:
      - redis
    environment:
      - REDIS_URL=redis://redis:6379/0

volumes:
  # Named volume mounted at /data in both the pipeline and uploader services.
  warcs:
@@ -0,0 +1,92 @@ | |||||
#!/bin/bash
# Container entrypoint: optionally initialises CouchDB, then starts the
# service named by the first argument.
#
# NOTE(review): -x traces every command to the container log, including the
# curl calls that carry COUCHDB_USER/COUCHDB_PASSWORD. Drop -x if the logs
# must not contain credentials.
set -exuo pipefail

# CouchDB initialisation runs only when COUCHDB_URL is set (even if empty).
if [[ -z "${COUCHDB_URL+x}" ]]; then
    echo "Skipping couchdb init"
else
    # Poll until CouchDB answers with HTTP 200, giving up after 300 seconds.
    # The URL is passed as a positional argument rather than pasted into the
    # command string, so its contents are never re-parsed as shell code.
    timeout 300 bash -c \
        'until [[ "$(curl -s -o /dev/null -w "%{http_code}" "$1")" == "200" ]]; do sleep 5; done' \
        _ "$COUCHDB_URL" \
        || { echo "CouchDB at $COUCHDB_URL did not return HTTP 200 within 300s" >&2; exit 1; }

    # Base curl command as an array. Credentials are appended as one properly
    # quoted word; building them into a string with embedded \" and expanding
    # it unquoted would pass the literal quote characters to curl and split
    # the value on whitespace.
    couchdb_curl=(curl -s)
    if [[ -n "${COUCHDB_USER+x}" ]]; then
        couchdb_curl+=(-u "$COUCHDB_USER:$COUCHDB_PASSWORD")
    fi

    pushd "db/design_docs"

    # Create the databases. curl runs without -f, so an HTTP error response
    # (for example when a database already exists) does not abort the script.
    for db in _users archivebot archivebot_logs; do
        "${couchdb_curl[@]}" -X PUT "$COUCHDB_URL/$db"
    done

    # Upload each design document with every line mentioning _rev removed.
    for doc in archive_urls ignore_patterns jobs user_agents; do
        grep -v _rev "$doc.json" > "/tmp/$doc.json"
        "${couchdb_curl[@]}" -X PUT "$COUCHDB_URL/archivebot/_design/$doc" -d "@/tmp/$doc.json"
    done

    popd
fi
# Directory handed to the uploader; defaults to /data/.
export FINISHED_WARCS_DIR="${FINISHED_WARCS_DIR:-/data/}"

# Start the service selected by the first argument. Arms that run a single
# process use `exec` so that process replaces this shell and receives signals
# directly. Commands are assembled as arrays so optional CouchDB credentials
# are passed as one correctly quoted argument.
case "${1:-}" in
    "bot")
        cd bot
        bot_cmd=(bundle exec ruby bot.rb
            -s "$IRC_URL"
            -r "$REDIS_URL"
            -c "$IRC_CHANNEL"
            -n "$IRC_NICK"
            --db "$COUCHDB_URL/archivebot")
        # Credentials are added only when COUCHDB_USER is set.
        if [[ -n "${COUCHDB_USER+x}" ]]; then
            bot_cmd+=(--db-credentials "$COUCHDB_USER:$COUCHDB_PASSWORD")
        fi
        exec "${bot_cmd[@]}"
        ;;
    "cogs")
        cogs_cmd=(bundle exec ruby cogs/start.rb
            -r "$REDIS_URL"
            --db "$COUCHDB_URL/archivebot"
            --log-db "$COUCHDB_URL/archivebot_logs")
        # The same credentials are used for the main and the log database.
        if [[ -n "${COUCHDB_USER+x}" ]]; then
            cogs_cmd+=(--db-credentials "$COUCHDB_USER:$COUCHDB_PASSWORD"
                       --log-db-credentials "$COUCHDB_USER:$COUCHDB_PASSWORD")
        fi
        exec "${cogs_cmd[@]}"
        ;;
    "firehose")
        export UPDATES_CHANNEL=updates
        # Overrides any FIREHOSE_SOCKET_URL from the environment with the
        # all-interfaces address on port 12345.
        export FIREHOSE_SOCKET_URL=tcp://0.0.0.0:12345
        # A pipeline of two processes; it cannot be exec'd.
        plumbing/updates-listener | plumbing/log-firehose
        ;;
    "dashboard")
        exec bundle exec ruby dashboard/app.rb -u http://0.0.0.0:8080 -r "$REDIS_URL"
        ;;
    "websocket")
        plumbing/firehose-client | python3 dashboard/websocket.py
        ;;
    "pipeline")
        cd pipeline
        # An explicit PIPELINE_NAME wins; otherwise "<PIPELINE_PREFIX>-<short hostname>".
        export PIPELINE_NAME="${PIPELINE_NAME:-${PIPELINE_PREFIX}-$(hostname -s)}"
        export NO_SCREEN=1
        # One tcp-closer per IP version, started in the background via sudo.
        sudo /usr/sbin/tcp-closer -4 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
        sudo /usr/sbin/tcp-closer -6 --dport 443 --idle_time 21601000 --last_recv_limit 43200000 --interval 300 &
        # Not exec'd: this shell stays the parent of the background jobs above.
        run-pipeline3 pipeline.py --disable-web-server \
            --concurrent "$PIPELINE_CONCURRENT" "$PIPELINE_NAME"
        ;;
    "uploader")
        exec python ./uploader/uploader.py "$FINISHED_WARCS_DIR"
        ;;
    "analyzer")
        export UPDATES_CHANNEL=updates
        cd plumbing
        exec ./analyzer
        ;;
    "trimmer")
        export UPDATES_CHANNEL=updates
        cd plumbing
        # Standard output is discarded; standard error still reaches the log.
        exec ./trimmer > /dev/null
        ;;
    *)
        # A missing or unknown command is reported instead of exiting
        # successfully without starting anything.
        echo "Usage: $0 {bot|cogs|firehose|dashboard|websocket|pipeline|uploader|analyzer|trimmer}" >&2
        exit 1
        ;;
esac