@@ -0,0 +1,59 @@ | |||||
# Versions used to select the base image of the final stage. They are declared
# before the first FROM so that they can be referenced in a FROM line.
ARG PYTHON_VERSION=3.7
ARG ALPINE_VERSION=3.14

# Build torsocks from source for the syscall fixes
# Use Alpine 3.13 for compatibility with old Docker daemons (https://wiki.alpinelinux.org/wiki/Draft_Release_Notes_for_Alpine_3.14.0#faccessat2)
FROM alpine:3.13 AS torsocks_build

# Install the toolchain in its own layer, before TORSOCKS_VERSION is declared,
# so that building a different torsocks commit reuses the cached packages.
RUN apk add --no-cache \
        autoconf \
        automake \
        build-base \
        gcc \
        git \
        libc-dev \
        libtool \
        make

# Commit of torsocks to check out and build.
ARG TORSOCKS_VERSION=f9721f38aa78bcd249237c1777fde0e6743351ce

# WORKDIR instead of `cd` inside RUN; the source tree (and therefore the build
# artifacts under src/ and doc/) still lives in /torsocks.
WORKDIR /torsocks
RUN git clone https://gitlab.torproject.org/tpo/core/torsocks.git . \
 && git checkout "${TORSOCKS_VERSION}" \
 && ./autogen.sh \
 && ./configure \
 && make \
 && chmod 755 src/bin/torsocks
# Based on https://github.com/ArchiveTeam/grab-site/pull/195
FROM python:${PYTHON_VERSION}-alpine${ALPINE_VERSION}

WORKDIR /app

# Install git and patch (used below to fetch and patch grab-site) together with
# compilers and development headers (presumably required to build grab-site's
# native Python dependencies), make libxml2's headers reachable as
# /usr/include/libxml, and create the unprivileged grab-site user that owns
# /app and /data.
RUN apk add --no-cache \
        g++ \
        gcc \
        git \
        libffi-dev \
        libxml2-dev \
        libxslt-dev \
        musl-dev \
        openssl-dev \
        patch \
        re2-dev \
 && ln -s /usr/include/libxml2/libxml /usr/include/libxml \
 && addgroup -S grab-site \
 && adduser -S -G grab-site grab-site \
 && chown -R grab-site:grab-site "$(pwd)" \
 && mkdir -p /data \
 && chown -R grab-site:grab-site /data

# Declared only after /data has been created and chowned: the legacy
# (non-BuildKit) builder discards changes made to a volume path after its
# VOLUME instruction, which would silently drop the ownership change above.
VOLUME [ "/data" ]

# torsocks launcher, libraries and default configuration from the build stage.
COPY --from=torsocks_build /torsocks/src/bin/torsocks /usr/local/bin/torsocks
COPY --from=torsocks_build /torsocks/src/lib/.libs/*.a /usr/local/lib/torsocks/
COPY --from=torsocks_build /torsocks/src/lib/.libs/*.so* /usr/local/lib/torsocks/
COPY --from=torsocks_build /torsocks/doc/torsocks.conf /usr/local/etc/tor/torsocks.conf

# Everything from here on runs as the unprivileged user.
USER grab-site:grab-site

COPY --chown=grab-site:grab-site *.patch /tmp/grab-site-patches/

# Git ref of grab-site to install. Declared right before its first use so that
# changing it does not invalidate the package and torsocks layers above.
ARG GRAB_SITE_VERSION=2.2.3

# Clone grab-site into the working directory (/app), apply the local patches
# and install it. The final rm only tidies the filesystem view; the patches
# remain stored in the layer created by the COPY above.
RUN git clone https://github.com/ArchiveTeam/grab-site.git /app \
 && git checkout "${GRAB_SITE_VERSION}" \
 && git apply --verbose /tmp/grab-site-patches/*.patch \
 && pip install --no-cache-dir . \
 && rm -rf /tmp/grab-site-patches

COPY --chown=grab-site:grab-site entrypoint-jaa /app/entrypoint-jaa

# /app is put on PATH so that the entrypoint can be referenced by bare name.
ENV PATH="/app:$PATH"
ENTRYPOINT ["entrypoint-jaa"]

# Containers start in /data.
WORKDIR /data
@@ -0,0 +1,13 @@ | |||||
A grab-site Docker image by JAA | |||||
Intended for use without a gs-server, so the WebSocket connection is disabled entirely. | |||||
Build and run:

```sh
docker build -t grab-site-jaa:latest .
docker run --rm -d -v $(pwd)/data:/data:rw grab-site-jaa:latest --1 https://example.org/
```
# Tor | |||||
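This image includes support for grabbing through Tor via torsocks. Tor mode is enabled whenever the `TORSOCKS_TOR_ADDRESS` environment variable is set to a non-empty value: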
1. Create a Docker bridge network for connecting to an isolated Tor proxy image: `docker network create --driver bridge --subnet 10.91.50.0/24 tor` | |||||
2. Run a Tor proxy in that network with a specified IP, e.g. `docker run -d --restart=always --name tor-socks-proxy --network tor --ip 10.91.50.2 peterdavehello/tor-socks-proxy:latest` | |||||
3. Run this image while specifying the relevant environment variables: `docker run --rm -d -v $(pwd)/data:/data:rw -e TORSOCKS_TOR_ADDRESS=10.91.50.2 -e TORSOCKS_TOR_PORT=9150 --network tor grab-site-jaa:latest --1 https://check.torproject.org/` |
@@ -0,0 +1,28 @@ | |||||
diff --git a/libgrabsite/wpull_tweaks.py b/libgrabsite/wpull_tweaks.py | |||||
index 6bef92d..401b3d9 100644 | |||||
--- a/libgrabsite/wpull_tweaks.py | |||||
@@ -4,6 +4,7 @@ import functools | |||||
from wpull.database.sqltable import SQLiteURLTable | |||||
from wpull.document.html import HTMLReader | |||||
+from wpull.network.dns import Resolver | |||||
from wpull.processor.rule import ProcessingRule | |||||
from libgrabsite import dupespotter | |||||
@@ -55,8 +56,15 @@ class DupeSpottingProcessingRule(ProcessingRule): | |||||
super().scrape_document(item_session) | |||||
+class NoDnspythonResolver(Resolver): | |||||
+ def __init__(self, *args, **kwargs): | |||||
+ super().__init__(*args, **kwargs) | |||||
+ self.dns_python_enabled = False | |||||
+ | |||||
+ | |||||
def activate(app_session): | |||||
app_session.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable | |||||
+ app_session.factory.class_map['Resolver'] = NoDnspythonResolver | |||||
if int(os.environ["DUPESPOTTER_ENABLED"]): | |||||
dupes_db_location = os.path.join(os.environ["GRAB_SITE_WORKING_DIR"], "dupes_db") |
@@ -0,0 +1,13 @@ | |||||
diff --git a/libgrabsite/wpull_hooks.py b/libgrabsite/wpull_hooks.py | |||||
index 41a6fbe..b8240d7 100644 | |||||
--- a/libgrabsite/wpull_hooks.py | |||||
@@ -247,7 +247,7 @@ class GrabSitePlugin(WpullPlugin): | |||||
ws_port = int(os.environ.get("GRAB_SITE_PORT", 29000)) | |||||
ws_url = f"ws://{ws_host}:{ws_port}" | |||||
- self.loop.create_task(dashboard_client.sender(self, ws_url)) | |||||
+ #self.loop.create_task(dashboard_client.sender(self, ws_url)) | |||||
@swallow_exception | |||||
def update_max_content_length(self): |
@@ -0,0 +1,7 @@ | |||||
#!/usr/bin/env sh
# Entrypoint: replace this shell with grab-site, forwarding all arguments.
# When TORSOCKS_TOR_ADDRESS is set to a non-empty value, grab-site is launched
# through torsocks instead of directly.
if [ -n "${TORSOCKS_TOR_ADDRESS}" ]
then
    set -- torsocks grab-site "$@"
else
    set -- grab-site "$@"
fi
exec "$@"