Browse Source

Initial commit

master
JustAnotherArchivist 1 year ago
commit
398726f73e
5 changed files with 118 additions and 0 deletions
  1. +59
    -0
      Dockerfile
  2. +13
    -0
      README.md
  3. +27
    -0
      disable-dnspython.patch
  4. +12
    -0
      disable-ws-client.patch
  5. +7
    -0
      entrypoint-jaa

+ 59
- 0
Dockerfile View File

@@ -0,0 +1,59 @@
# Global build arguments: together they select the base image tag of the final
# stage (python:<PYTHON_VERSION>-alpine<ALPINE_VERSION>).
ARG PYTHON_VERSION=3.7
ARG ALPINE_VERSION=3.14

# Build torsocks from source for the syscall fixes
# Use Alpine 3.13 for compatibility with old Docker daemons (https://wiki.alpinelinux.org/wiki/Draft_Release_Notes_for_Alpine_3.14.0#faccessat2)
FROM alpine:3.13 AS torsocks_build

# Commit of the torsocks repository that gets checked out and built.
ARG TORSOCKS_VERSION=f9721f38aa78bcd249237c1777fde0e6743351ce

# Install the build toolchain and fetch the torsocks sources into /torsocks.
RUN apk add --no-cache \
        autoconf \
        automake \
        build-base \
        gcc \
        git \
        libc-dev \
        libtool \
        make \
    && git clone https://gitlab.torproject.org/tpo/core/torsocks.git /torsocks

# Use WORKDIR instead of `cd` so the build directory is explicit for the
# following instruction.
WORKDIR /torsocks

# Check out the pinned commit and build it in-tree. The final stage copies the
# results from src/bin, src/lib/.libs and doc, so nothing is installed here.
RUN git checkout "${TORSOCKS_VERSION}" \
    && ./autogen.sh \
    && ./configure \
    && make \
    && chmod 755 src/bin/torsocks

# Based on https://github.com/ArchiveTeam/grab-site/pull/195
FROM python:${PYTHON_VERSION}-alpine${ALPINE_VERSION}

# Git ref of grab-site that gets checked out and installed.
ARG GRAB_SITE_VERSION=2.2.3

WORKDIR /app

# Install the packages needed to fetch, patch and build grab-site, expose the
# libxml2 headers under /usr/include/libxml as well, and create the
# unprivileged grab-site user that owns /app and /data.
RUN apk add --no-cache \
        g++ \
        gcc \
        git \
        libffi-dev \
        libxml2-dev \
        libxslt-dev \
        musl-dev \
        openssl-dev \
        patch \
        re2-dev \
    && ln -s /usr/include/libxml2/libxml /usr/include/libxml \
    && addgroup -S grab-site \
    && adduser -S -G grab-site grab-site \
    && chown -R grab-site:grab-site /app \
    && mkdir -p /data \
    && chown -R grab-site:grab-site /data

# Declare the volume only after /data has been created and chowned: with the
# legacy builder, changes made to a path after its VOLUME declaration are
# discarded, which would leave /data owned by root.
VOLUME [ "/data" ]

# torsocks artifacts built in the torsocks_build stage.
COPY --from=torsocks_build /torsocks/src/bin/torsocks /usr/local/bin/torsocks
COPY --from=torsocks_build /torsocks/src/lib/.libs/*.a /usr/local/lib/torsocks/
COPY --from=torsocks_build /torsocks/src/lib/.libs/*.so* /usr/local/lib/torsocks/
COPY --from=torsocks_build /torsocks/doc/torsocks.conf /usr/local/etc/tor/torsocks.conf

# Everything from here on runs as the unprivileged user.
USER grab-site:grab-site

# Fetch grab-site into /app (the current WORKDIR), check out the requested
# version, apply the patches from the build context, install it, and remove the
# patches again.
COPY --chown=grab-site:grab-site *.patch /tmp/grab-site-patches/
RUN git clone https://github.com/ArchiveTeam/grab-site.git /app \
    && git checkout "${GRAB_SITE_VERSION}" \
    && git apply --verbose /tmp/grab-site-patches/*.patch \
    && pip install --no-cache-dir . \
    && rm -rf /tmp/grab-site-patches

# /app is put on PATH so the entrypoint can be referenced by its bare name.
COPY --chown=grab-site:grab-site entrypoint-jaa /app/entrypoint-jaa
ENV PATH="/app:$PATH"
ENTRYPOINT ["entrypoint-jaa"]

# Containers start in the data directory.
WORKDIR /data

+ 13
- 0
README.md View File

@@ -0,0 +1,13 @@
A grab-site Docker image by JAA

Intended for use without a gs-server, so the WebSocket connection is disabled entirely.

docker build -t grab-site-jaa:latest .
docker run --rm -d -v $(pwd)/data:/data:rw grab-site-jaa:latest --1 https://example.org/

# Tor
This image includes support for grabbing through Tor:

1. Create a Docker bridge network for connecting to an isolated Tor proxy image: `docker network create --driver bridge --subnet 10.91.50.0/24 tor`
2. Run a Tor proxy in that network with a specified IP, e.g. `docker run -d --restart=always --name tor-socks-proxy --network tor --ip 10.91.50.2 peterdavehello/tor-socks-proxy:latest`
3. Run this image while specifying the relevant environment variables: `docker run --rm -d -v $(pwd)/data:/data:rw -e TORSOCKS_TOR_ADDRESS=10.91.50.2 -e TORSOCKS_TOR_PORT=9150 --network tor grab-site-jaa:latest --1 https://check.torproject.org/`

+ 27
- 0
disable-dnspython.patch View File

@@ -0,0 +1,28 @@
diff --git a/libgrabsite/wpull_tweaks.py b/libgrabsite/wpull_tweaks.py
index 6bef92d..401b3d9 100644
--- a/libgrabsite/wpull_tweaks.py
@@ -4,6 +4,7 @@ import functools
from wpull.database.sqltable import SQLiteURLTable
from wpull.document.html import HTMLReader
+from wpull.network.dns import Resolver
from wpull.processor.rule import ProcessingRule
from libgrabsite import dupespotter
@@ -55,8 +56,15 @@ class DupeSpottingProcessingRule(ProcessingRule):
super().scrape_document(item_session)
+class NoDnspythonResolver(Resolver):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.dns_python_enabled = False
+
+
def activate(app_session):
app_session.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable
+ app_session.factory.class_map['Resolver'] = NoDnspythonResolver
if int(os.environ["DUPESPOTTER_ENABLED"]):
dupes_db_location = os.path.join(os.environ["GRAB_SITE_WORKING_DIR"], "dupes_db")

+ 12
- 0
disable-ws-client.patch View File

@@ -0,0 +1,13 @@
diff --git a/libgrabsite/wpull_hooks.py b/libgrabsite/wpull_hooks.py
index 41a6fbe..b8240d7 100644
--- a/libgrabsite/wpull_hooks.py
@@ -247,7 +247,7 @@ class GrabSitePlugin(WpullPlugin):
ws_port = int(os.environ.get("GRAB_SITE_PORT", 29000))
ws_url = f"ws://{ws_host}:{ws_port}"
- self.loop.create_task(dashboard_client.sender(self, ws_url))
+ #self.loop.create_task(dashboard_client.sender(self, ws_url))
@swallow_exception
def update_max_content_length(self):

+ 7
- 0
entrypoint-jaa View File

@@ -0,0 +1,7 @@
#!/usr/bin/env sh
# Container entrypoint: start grab-site with the given arguments, wrapping it in
# torsocks when TORSOCKS_TOR_ADDRESS is set to a non-empty value.
# `exec` replaces this shell with the launched command.
if [ -z "${TORSOCKS_TOR_ADDRESS}" ]; then
    # No Tor address configured: run grab-site directly.
    exec grab-site "$@"
else
    exec torsocks grab-site "$@"
fi

Loading…
Cancel
Save