#!/bin/bash
# Prints all ignored offsite URLs from a wpull 2.x SQLite DB to stdout.
#
# Usage: SCRIPT [FILENAME]
#   FILENAME  path to the wpull database (default: wpull.db)
#
# Only jobs with exactly one root URL are supported, because offsite URLs are
# identified by comparing each URL against the root URL's hostname.
# Exits 1 on any error (missing DB, unreadable DB, unsupported root URL).

if [[ "$1" == '--help' || "$1" == '-h' ]]; then
  printf 'Usage: %q [FILENAME]\n' "$0" >&2
  printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
  exit 0
fi

# Anything other than exactly one argument falls back to the default DB name.
if [[ $# -eq 1 ]]; then
  filename="$1"
else
  filename=wpull.db
fi

if [[ ! -f "${filename}" ]]; then
  printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
  exit 1
fi

# A path starting with '-' would be parsed by sqlite3 as an option; anchor it
# to the current directory so it is always taken as a filename.
db_path="${filename}"
if [[ "${db_path}" == -* ]]; then
  db_path="./${db_path}"
fi

# Check that the number of root URLs is 1; since we need to filter by the host,
# more than one root URL gets complicated.
# This query is unfortunately slow due to a lack of index since it isn't needed
# for other operations.
# Command substitution (rather than process substitution) is used so that a
# failing sqlite3 invocation is detected instead of looking like "no roots".
if ! root_rows="$(sqlite3 "${db_path}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0')"; then
  printf 'Error: could not query root URLs from %q\n' "${filename}" >&2
  exit 1
fi

roots=()
if [[ -n "${root_rows}" ]]; then
  # Guarded because a here-string of an empty value would yield one empty element.
  readarray -t roots <<< "${root_rows}"
fi

if [[ ${#roots[@]} -eq 0 ]]; then
  printf 'Error: no root URL found in %q.\n' "${filename}" >&2
  exit 1
fi
if [[ ${#roots[@]} -ne 1 ]]; then
  printf 'Error: jobs with more than one root URL are not supported.\n' >&2
  exit 1
fi
root="${roots[0]}"

# Extract root hostname: drop the scheme, then the path, then a trailing
# numeric port, then any userinfo ("user:pass@") prefix.
roothost="${root#*//}"
roothost="${roothost%%/*}"
if [[ "${roothost}" =~ :[0-9][0-9]*$ ]]; then
  roothost="${roothost%:*}"
fi
if [[ "${roothost}" == *@* ]]; then
  roothost="${roothost##*@}"
fi

# An empty hostname would turn the filter below into a pattern that matches
# nearly every URL, silently producing no output.
if [[ -z "${roothost}" ]]; then
  printf 'Error: could not extract a hostname from root URL %q.\n' "${root}" >&2
  exit 1
fi

# Bail if there are weird chars in the hostname; this shouldn't be possible.
# '[' is included because SQLite's GLOB treats it as the start of a character
# class, just like '*' and '?' are wildcards.
if [[ "${roothost}" == *[*?]* || "${roothost}" == *'['* ]]; then
  printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
  exit 1
fi
# The hostname is embedded in a single-quoted SQL string literal below, so a
# single quote would break out of the literal.
if [[ "${roothost}" == *"'"* ]]; then
  printf 'Error: root hostname %q contains a quote char.\n' "${roothost}" >&2
  exit 1
fi

# GLOB is case-sensitive, but wpull normalises URLs, including lowercasing
# hostnames, so this is not a problem.
# String literals use single quotes: SQLite only accepts double-quoted strings
# as a legacy fallback, and resolves them as identifiers first.
query="SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = 'skipped' AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB '*[/@]${roothost}[:/]*'"

# Go!
sqlite3 "${db_path}" "${query}"