commit 398726f73e84233a584fe096d916799fa3c90006
Author: JustAnotherArchivist
Date:   Mon May 9 02:19:05 2022 +0000

    Initial commit

# NOTE(review): lines that start with "# " at column 0 outside the hunks are
# review annotations, not part of the commit. Line breaks and leading
# whitespace were reconstructed from a whitespace-collapsed copy of this dump;
# the line counts match every hunk header, but the exact indentation (tabs vs.
# spaces) inside the hunks is a best guess -- confirm against the repository
# before applying.
#
# Dockerfile: two-stage build.
#   Stage "torsocks_build" (alpine:3.13) clones torsocks, checks out the commit
#   pinned by TORSOCKS_VERSION and builds it with autogen.sh/configure/make.
#   The final stage (python:${PYTHON_VERSION}-alpine${ALPINE_VERSION}) installs
#   the listed apk packages, creates the grab-site user and group, copies the
#   torsocks script, libraries and torsocks.conf out of the first stage, then
#   (as user grab-site) clones grab-site into /app, checks out
#   GRAB_SITE_VERSION, applies every *.patch from the build context and
#   pip-installs the result. /app is prepended to PATH, entrypoint-jaa is the
#   ENTRYPOINT and the final working directory is /data.
# NOTE(review): VOLUME [ "/data" ] is declared before the RUN step that creates
#   and chowns /data. The Dockerfile reference states that build-step changes
#   to a declared volume are discarded, so that chown may not end up in the
#   image -- confirm with the builder in use.
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..07f0b51
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,59 @@
+ARG PYTHON_VERSION=3.7
+ARG ALPINE_VERSION=3.14
+
+# Build torsocks from source for the syscall fixes
+# Use Alpine 3.13 for compatibility with old Docker daemons (https://wiki.alpinelinux.org/wiki/Draft_Release_Notes_for_Alpine_3.14.0#faccessat2)
+FROM alpine:3.13 AS torsocks_build
+ARG TORSOCKS_VERSION=f9721f38aa78bcd249237c1777fde0e6743351ce
+RUN apk add --no-cache autoconf automake build-base gcc git libc-dev libtool make \
+    && git clone https://gitlab.torproject.org/tpo/core/torsocks.git /torsocks \
+    && cd /torsocks \
+    && git checkout ${TORSOCKS_VERSION} \
+    && ./autogen.sh \
+    && ./configure \
+    && make \
+    && chmod 755 src/bin/torsocks
+
+# Based on https://github.com/ArchiveTeam/grab-site/pull/195
+FROM python:${PYTHON_VERSION}-alpine${ALPINE_VERSION}
+
+ARG GRAB_SITE_VERSION=2.2.3
+
+WORKDIR /app
+VOLUME [ "/data" ]
+
+RUN apk add --no-cache \
+    git \
+    gcc \
+    libxml2-dev \
+    musl-dev \
+    libxslt-dev \
+    g++ \
+    re2-dev \
+    libffi-dev \
+    openssl-dev \
+    patch \
+    && ln -s /usr/include/libxml2/libxml /usr/include/libxml \
+    && addgroup -S grab-site \
+    && adduser -S -G grab-site grab-site \
+    && chown -R grab-site:grab-site $(pwd) \
+    && mkdir -p /data \
+    && chown -R grab-site:grab-site /data
+
+COPY --from=torsocks_build /torsocks/src/bin/torsocks /usr/local/bin/torsocks
+COPY --from=torsocks_build /torsocks/src/lib/.libs/*.a /usr/local/lib/torsocks/
+COPY --from=torsocks_build /torsocks/src/lib/.libs/*.so* /usr/local/lib/torsocks/
+COPY --from=torsocks_build /torsocks/doc/torsocks.conf /usr/local/etc/tor/torsocks.conf
+
+USER grab-site:grab-site
+COPY --chown=grab-site:grab-site *.patch /tmp/grab-site-patches/
+RUN git clone https://github.com/ArchiveTeam/grab-site.git /app \
+    && git checkout ${GRAB_SITE_VERSION} \
+    && git apply --verbose /tmp/grab-site-patches/*.patch \
+    && pip install --no-cache-dir . \
+    && rm -rf /tmp/grab-site-patches
+COPY --chown=grab-site:grab-site entrypoint-jaa /app/entrypoint-jaa
+ENV PATH="/app:$PATH"
+ENTRYPOINT ["entrypoint-jaa"]
+
+WORKDIR /data
# README.md: states the image's purpose, gives the build and run commands, and
#   lists the three steps for grabbing through a Tor proxy container.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..81ebdc6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+A grab-site Docker image by JAA
+
+Intended for use without a gs-server, so the WebSocket connection is disabled entirely.
+
+    docker build -t grab-site-jaa:latest .
+    docker run --rm -d -v $(pwd)/data:/data:rw grab-site-jaa:latest --1 https://example.org/
+
+# Tor
+This image includes support for grabbing through Tor:
+
+1. Create a Docker bridge network for connecting to an isolated Tor proxy image: `docker network create --driver bridge --subnet 10.91.50.0/24 tor`
+2. Run a Tor proxy in that network with a specified IP, e.g. `docker run -d --restart=always --name tor-socks-proxy --network tor --ip 10.91.50.2 peterdavehello/tor-socks-proxy:latest`
+3. Run this image while specifying the relevant environment variables: `docker run --rm -d -v $(pwd)/data:/data:rw -e TORSOCKS_TOR_ADDRESS=10.91.50.2 -e TORSOCKS_TOR_PORT=9150 --network tor grab-site-jaa:latest --1 https://check.torproject.org/`
# disable-dnspython.patch: patch against libgrabsite/wpull_tweaks.py (applied by
#   the Dockerfile's `git apply` step). It imports wpull's Resolver, adds the
#   subclass NoDnspythonResolver whose __init__ calls the parent constructor and
#   then sets dns_python_enabled = False, and registers that subclass under the
#   'Resolver' key of app_session.factory.class_map inside activate().
#   Presumably this keeps name resolution on a path torsocks can intercept --
#   TODO confirm against wpull's Resolver implementation.
diff --git a/disable-dnspython.patch b/disable-dnspython.patch
new file mode 100644
index 0000000..7d0966d
--- /dev/null
+++ b/disable-dnspython.patch
@@ -0,0 +1,28 @@
+diff --git a/libgrabsite/wpull_tweaks.py b/libgrabsite/wpull_tweaks.py
+index 6bef92d..401b3d9 100644
+--- a/libgrabsite/wpull_tweaks.py
++++ b/libgrabsite/wpull_tweaks.py
+@@ -4,6 +4,7 @@ import functools
+ 
+ from wpull.database.sqltable import SQLiteURLTable
+ from wpull.document.html import HTMLReader
++from wpull.network.dns import Resolver
+ from wpull.processor.rule import ProcessingRule
+ 
+ from libgrabsite import dupespotter
+@@ -55,8 +56,15 @@ class DupeSpottingProcessingRule(ProcessingRule):
+ 		super().scrape_document(item_session)
+ 
+ 
++class NoDnspythonResolver(Resolver):
++	def __init__(self, *args, **kwargs):
++		super().__init__(*args, **kwargs)
++		self.dns_python_enabled = False
++
++
+ def activate(app_session):
+ 	app_session.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable
++	app_session.factory.class_map['Resolver'] = NoDnspythonResolver
+ 
+ 	if int(os.environ["DUPESPOTTER_ENABLED"]):
+ 		dupes_db_location = os.path.join(os.environ["GRAB_SITE_WORKING_DIR"], "dupes_db")
# disable-ws-client.patch: patch against libgrabsite/wpull_hooks.py, inside
#   class GrabSitePlugin (per the hunk header). It comments out the
#   self.loop.create_task(dashboard_client.sender(self, ws_url)) call; the
#   README in this commit says the WebSocket connection is disabled because the
#   image is intended for use without a gs-server.
diff --git a/disable-ws-client.patch b/disable-ws-client.patch
new file mode 100644
index 0000000..58699f4
--- /dev/null
+++ b/disable-ws-client.patch
@@ -0,0 +1,13 @@
+diff --git a/libgrabsite/wpull_hooks.py b/libgrabsite/wpull_hooks.py
+index 41a6fbe..b8240d7 100644
+--- a/libgrabsite/wpull_hooks.py
++++ b/libgrabsite/wpull_hooks.py
+@@ -247,7 +247,7 @@ class GrabSitePlugin(WpullPlugin):
+ 		ws_port = int(os.environ.get("GRAB_SITE_PORT", 29000))
+ 		ws_url = f"ws://{ws_host}:{ws_port}"
+ 
+-		self.loop.create_task(dashboard_client.sender(self, ws_url))
++		#self.loop.create_task(dashboard_client.sender(self, ws_url))
+ 
+ 	@swallow_exception
+ 	def update_max_content_length(self):
# entrypoint-jaa (mode 100755): sh wrapper used as the image ENTRYPOINT. When
#   TORSOCKS_TOR_ADDRESS is non-empty it execs `torsocks grab-site "$@"`;
#   otherwise it execs `grab-site "$@"` directly. Only the address variable is
#   tested; TORSOCKS_TOR_PORT (used in the README example) is not checked here.
diff --git a/entrypoint-jaa b/entrypoint-jaa
new file mode 100755
index 0000000..c6dd760
--- /dev/null
+++ b/entrypoint-jaa
@@ -0,0 +1,7 @@
+#!/usr/bin/env sh
+if [ "${TORSOCKS_TOR_ADDRESS}" ]
+then
+	exec torsocks grab-site "$@"
+else
+	exec grab-site "$@"
+fi