From f3bec23348c645df97db86e965e6928a1c75166f Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Wed, 6 Dec 2023 17:53:50 +0000
Subject: [PATCH] Remove filtering of onsite URLs because it's unreliable

It also erroneously filters out offsite URLs that contain the root
domain, and this isn't fixable without using regex, which isn't always
available in the SQLite CLI before version 3.36.0.
---
 wpull2-extract-ignored         | 20 ++++++++++++++++
 wpull2-extract-ignored-offsite | 43 ----------------------------------
 2 files changed, 20 insertions(+), 43 deletions(-)
 create mode 100755 wpull2-extract-ignored
 delete mode 100755 wpull2-extract-ignored-offsite

diff --git a/wpull2-extract-ignored b/wpull2-extract-ignored
new file mode 100755
index 0000000..870eb6b
--- /dev/null
+++ b/wpull2-extract-ignored
@@ -0,0 +1,20 @@
+#!/bin/bash
+if [[ "$1" == '--help' || "$1" == '-h' ]]; then
+	printf 'Usage: %q [FILENAME]\n' "$0" >&2
+	printf 'Prints all ignored URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
+	exit
+fi
+
+if [[ $# -eq 1 ]]
+then
+	filename="$1"
+else
+	filename=wpull.db
+fi
+if [[ ! -f "${filename}" ]]
+then
+	printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
+	exit 1
+fi
+
+sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = "skipped" AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL'
diff --git a/wpull2-extract-ignored-offsite b/wpull2-extract-ignored-offsite
deleted file mode 100755
index 19c6289..0000000
--- a/wpull2-extract-ignored-offsite
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-if [[ "$1" == '--help' || "$1" == '-h' ]]; then
-	printf 'Usage: %q [FILENAME]\n' "$0" >&2
-	printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
-	exit
-fi
-
-if [[ $# -eq 1 ]]
-then
-	filename="$1"
-else
-	filename=wpull.db
-fi
-if [[ ! -f "${filename}" ]]
-then
-	printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
-	exit 1
-fi
-
-# Check that the number of root URLs is 1; since we need to filter by the host, more than one root URL gets complicated.
-# This query is unfortunately slow due to a lack of index since it isn't needed for other operations.
-readarray -t roots < <(sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0')
-if [[ ${#roots[@]} -ne 1 ]]; then
-	printf 'Error: jobs with more than one root URL are not supported.\n' >&2
-	exit 1
-fi
-root="${roots[0]}"
-
-# Extract root hostname
-roothost="${root#*//}"
-roothost="${roothost%%/*}"
-if [[ "${roothost}" =~ :[0-9][0-9]*$ ]]; then roothost="${roothost%:*}"; fi
-if [[ "${roothost}" == *@* ]]; then roothost="${roothost##*@}"; fi
-
-# Bail if there are weird chars in the hostname; this shouldn't be possible.
-if [[ "${roothost}" == *[*?]* ]]; then
-	printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
-	exit 1
-fi
-# GLOB is case-sensitive, but wpull normalises URLs, including lowercasing hostnames, so this is not a problem.
-
-# Go!
-sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = "skipped" AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB "*[/@]'"${roothost}"'[:/]*"'
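
A note on the bug, for reviewers: a minimal demonstration of the false
positive described in the commit message. The hostnames are made up
(example.com as the root host, translate.example.net as an unrelated
offsite host); the GLOB pattern is the one the deleted script built
around ${roothost}:

    $ sqlite3 :memory: "SELECT 'https://translate.example.net/example.com/page' GLOB '*[/@]example.com[:/]*';"
    1

The pattern only anchors on the single characters surrounding the
hostname, not on the URL's actual authority component, so this offsite
URL matches (result 1), is treated as onsite, and gets wrongly dropped
from the output.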
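
For reference, a sketch of the regex-based variant that would avoid the
false positive, usable only where the SQLite CLI bundles the REGEXP
extension (3.36.0 and later). This is hypothetical, not part of the
patch: roothost_re would hold the root hostname with its dots escaped,
e.g. roothost_re="${roothost//./\\.}", and the pattern assumes wpull's
normalisation always leaves a path after the authority component:

    # Anchor on the authority component so "/example.com/" inside a path no longer matches.
    sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = "skipped" AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT REGEXP "^[a-z][a-z0-9+.-]*://([^/@]*@)?'"${roothost_re}"'(:[0-9]+)?/"'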
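
Usage of the added script is unchanged from the deleted one apart from
the name, e.g. with the default DB filename:

    $ ./wpull2-extract-ignored > ignored-urls.txt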