Browse Source

Add wpull2-extract-ignored-offsite and extract-urls-for-archiveteam-projects

master
JustAnotherArchivist 4 months ago
parent
commit
53535b925a
2 changed files with 68 additions and 0 deletions
  1. +25
    -0
      extract-urls-for-archiveteam-projects
  2. +43
    -0
      wpull2-extract-ignored-offsite

+ 25
- 0
extract-urls-for-archiveteam-projects View File

@@ -0,0 +1,25 @@
#!/bin/bash
# Split a stream of URLs from stdin into per-project files, one per currently
# relevant ArchiveTeam project, matched by fixed case-insensitive substrings.
# Output files are named PREFIX-<project>.
if [[ $# -ne 1 || "$1" == '--help' || "$1" == '-h' ]]; then
  printf 'Usage: extract-urls-for-archiveteam-projects PREFIX\n' >&2
  printf 'Reads URLs from stdin, extracts URLs interesting for the different currently relevant AT projects into files prefixed by PREFIX\n' >&2
  exit 1
fi

prefix="$1"
# The existence check below uses a glob on the prefix, so glob metacharacters
# in it would make that check unreliable; refuse them outright.
if [[ "${prefix}" == *[*?[]* ]]; then
  printf 'Error: prefixes containing * ? [ not supported\n' >&2
  exit 1
fi

# Refuse to clobber output files from a previous run.
if compgen -G "${prefix}*" >/dev/null; then
  printf 'Error: there already exist files starting with %q\n' "${prefix}" >&2
  exit 1
fi

# Fan stdin out to one grep per project.
# -F: fixed-string match, -a: treat input as text, -i: case-insensitive.
tee \
  >(grep -Fai imgur >"${prefix}-imgur") \
  >(grep -Fai -e mediafire -e mfi.re >"${prefix}-mediafire") \
  >(grep -Fai pastebin.com >"${prefix}-pastebin") \
  >(grep -Fai -e blogspot -e blogger >"${prefix}-blogger") \
  >(grep -Fai -e telegram.me -e //t.me/ >"${prefix}-telegram") \
  >/dev/null

# Process substitutions run asynchronously; without waiting, the script could
# exit (and a caller could read the output files) before the greps have
# finished writing. bash >= 4.4 waits for process substitutions on `wait`.
wait

+ 43
- 0
wpull2-extract-ignored-offsite View File

@@ -0,0 +1,43 @@
#!/bin/bash
# Print all ignored ("skipped") offsite URLs from a wpull 2.x SQLite database
# to stdout. Only single-root-URL jobs are supported.
if [[ "$1" == '--help' || "$1" == '-h' ]]; then
  printf 'Usage: %q [FILENAME]\n' "$0" >&2
  printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
  exit
fi

if [[ $# -eq 1 ]]; then
  filename="$1"
else
  filename=wpull.db
fi
if [[ ! -f "${filename}" ]]; then
  printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
  exit 1
fi

# Check that the number of root URLs is 1; since we need to filter by the host, more than one root URL gets complicated.
# This query is unfortunately slow due to a lack of index since it isn't needed for other operations.
readarray -t roots < <(sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0')
if [[ ${#roots[@]} -ne 1 ]]; then
  printf 'Error: jobs with more than one root URL are not supported.\n' >&2
  exit 1
fi
root="${roots[0]}"

# Extract root hostname: strip scheme, path, an optional port, and an
# optional userinfo part, in that order.
roothost="${root#*//}"
roothost="${roothost%%/*}"
if [[ "${roothost}" =~ :[0-9][0-9]*$ ]]; then roothost="${roothost%:*}"; fi
if [[ "${roothost}" == *@* ]]; then roothost="${roothost##*@}"; fi

# Bail if there are weird chars in the hostname; this shouldn't be possible.
# The hostname is interpolated into the GLOB pattern below, so glob
# metacharacters would corrupt the query.
if [[ "${roothost}" == *[*?]* ]]; then
  printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
  exit 1
fi
# GLOB is case-sensitive, but wpull normalises URLs, including lowercasing hostnames, so this is not a problem.

# Go! Select URLs that were skipped without ever being fetched (no status
# code, first try, not an inline resource) and whose host is not the root
# host. NOTE: the double-quoted "skipped" relies on SQLite's fallback of
# treating an unresolvable double-quoted identifier as a string literal.
sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = "skipped" AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB "*[/@]'"${roothost}"'[:/]*"'

Loading…
Cancel
Save