commit 8d0068a573107f11dca400cc57c14212532b2755
Author: JustAnotherArchivist
Date:   Thu May 19 17:56:08 2022 +0000

    Initial commit

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..c38c9cd
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,3 @@
+FROM atdr.meo.ws/archiveteam/mercurial-grab@sha256:c25841efe3679952eef9418a9784b09b7668918f753227890a10d3e3c404b108
+COPY mercurial-dl /grab/
+ENTRYPOINT ["./mercurial-dl"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f576d44
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+A small wrapper around the mercurial-grab project container that dumps individual repositories on demand.
+
+    docker build -t mercurial-dl:latest .
+    docker run --rm -v $(pwd)/data:/grab/data mercurial-dl:latest https://hg.mozilla.org/penelope/
+
+Accepts any number of URLs as arguments. The output WARCs are *not* merged together. See for an example of what the output might look like (although the directory naming here would be slightly different).
diff --git a/mercurial-dl b/mercurial-dl
new file mode 100755
index 0000000..3ce6146
--- /dev/null
+++ b/mercurial-dl
@@ -0,0 +1,74 @@
+#!/bin/bash
+#
+# mercurial-dl - dump individual Mercurial repositories into WARCs with wget-at.
+#
+# Usage: mercurial-dl URL...
+#
+# For every URL argument a directory ./data/NAME/ is created, where NAME is the
+# URL with each "/" replaced by "_". wget-at is then started on
+# "URL?cmd=capabilities" with mercurial.lua as its Lua script, and its log,
+# temporary output document and WARC are written into that directory. Both
+# ./data and mercurial.lua are resolved against the current working directory.
+#
+# Exit status: 0 if every URL was processed without error or if no URL was
+# given, otherwise the status of the first step that failed. A failure on one
+# URL does not prevent the remaining URLs from being processed.
+
+status=0
+
+for url
+do
+  # Flatten the URL into a single path component.
+  name="${url//\//_}"
+  # Review note: the trailing slash yields "//" in the paths built below. It is
+  # kept because item_dir is passed on verbatim and its consumer is not
+  # visible here.
+  itemDir="./data/${name}/"
+
+  if ! mkdir -p "${itemDir}"
+  then
+    # wget-at cannot write its log or WARC without the directory, so it is not
+    # started for this URL.
+    printf 'mercurial-dl: cannot create %s, skipping %s\n' "${itemDir}" "${url}" >&2
+    [[ "${status}" -ne 0 ]] || status=1
+    continue
+  fi
+
+  # item_dir, item_value and warc_file_base are set in the environment of
+  # wget-at only. Presumably mercurial.lua reads them - TODO confirm.
+  item_dir="${itemDir}" item_value="${url}" warc_file_base="${name}" /grab/wget-at \
+    -U 'mercurial/proto-1.0 (Mercurial 5.3.1)' \
+    -nv \
+    --no-cookies \
+    --content-on-error \
+    --lua-script mercurial.lua \
+    -o "${itemDir}/wget.log" \
+    --no-check-certificate \
+    --output-document "${itemDir}/wget.tmp" \
+    --truncate-output \
+    -e robots=off \
+    --rotate-dns \
+    --recursive --level=inf \
+    --no-parent \
+    --page-requisites \
+    --timeout 30 \
+    --tries inf \
+    --span-hosts \
+    --waitretry 30 \
+    --warc-file "${itemDir}/${name}-main" \
+    --warc-header 'operator: Archive Team' \
+    --warc-header 'mercurial-dld-script-version: 20201031.01' \
+    --warc-dedup-url-agnostic \
+    --warc-header "mercurial-repository: ${url}" \
+    --warc-header 'warc-type: main' \
+    "${url}?cmd=capabilities"
+  rc=$?
+  if [[ "${rc}" -ne 0 ]]
+  then
+    # Keep the first failure so that a later successful URL cannot mask it.
+    printf 'mercurial-dl: wget-at exited with status %s for %s\n' "${rc}" "${url}" >&2
+    [[ "${status}" -ne 0 ]] || status="${rc}"
+  fi
+done
+
+exit "${status}"