|
|
@@ -1,43 +0,0 @@ |
|
|
|
#!/bin/bash

# Help request: when the first argument is --help or -h, print the usage text
# to stderr and stop. The bare `exit` passes on the status of the last printf.
case "$1" in
  --help | -h)
    printf 'Usage: %q [FILENAME]\n' "$0" >&2
    printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
    exit
    ;;
esac
|
|
|
|
|
|
|
# Select the database file: the single positional argument if given, otherwise
# wpull.db in the current directory.
# More than one argument is rejected; silently falling back to wpull.db would
# ignore the file the user actually named.
if [[ $# -gt 1 ]]; then
  printf 'Error: expected at most one argument, got %d\n' "$#" >&2
  printf 'Usage: %q [FILENAME]\n' "$0" >&2
  exit 1
fi

if [[ $# -eq 1 ]]; then
  filename="$1"
else
  filename=wpull.db
fi

# Everything below queries this file with sqlite3, so it has to exist as a regular file.
if [[ ! -f "${filename}" ]]; then
  printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
  exit 1
fi
|
|
|
|
|
|
|
# Check that the number of root URLs is 1; since we need to filter by the host, more than one root URL gets complicated.
# This query is unfortunately slow due to a lack of index since it isn't needed for other operations.
# The output is captured first so that a failing sqlite3 call is reported as
# such instead of being mistaken for a root-count problem.
if ! roots_raw="$(sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0')"; then
  printf 'Error: could not read the root URLs from %q\n' "${filename}" >&2
  exit 1
fi

# One array element per output line; a here-string on empty output would
# yield a single empty element, so that case is handled separately.
roots=()
if [[ -n "${roots_raw}" ]]; then
  readarray -t roots <<< "${roots_raw}"
fi

if [[ ${#roots[@]} -eq 0 ]]; then
  printf 'Error: no root URL found in %q.\n' "${filename}" >&2
  exit 1
elif [[ ${#roots[@]} -gt 1 ]]; then
  printf 'Error: jobs with more than one root URL are not supported.\n' >&2
  exit 1
fi

root="${roots[0]}"
|
|
|
|
|
|
|
# Extract root hostname
# Take the part between the first "//" and the next "/", then peel off a
# trailing ":<digits>" port and anything up to and including the last "@".
authority="${root#*//}"
authority="${authority%%/*}"
if [[ "${authority}" =~ ^(.*):[0-9]+$ ]]; then
  # The greedy capture ends just before the last colon, i.e. the port is dropped.
  authority="${BASH_REMATCH[1]}"
fi
# A no-op when there is no "@" in the string.
roothost="${authority##*@}"
|
|
|
|
|
|
|
# Bail if there are weird chars in the hostname; this shouldn't be possible.
# The hostname is later embedded verbatim in an SQLite GLOB pattern, where '*',
# '?' and '[' are metacharacters. '[' is checked as well because it opens a
# character class (and appears in bracketed IPv6 literals), which would make
# the pattern match something other than the literal hostname.
if [[ "${roothost}" == *[*?]* || "${roothost}" == *'['* ]]; then
  printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
  exit 1
fi
# GLOB is case-sensitive, but wpull normalises URLs, including lowercasing hostnames, so this is not a problem.
|
|
|
|
|
|
|
# Go!
# Print every URL that was skipped without ever being fetched (no status code,
# one try, not an inline resource) and whose text does not contain the root
# hostname preceded by "/" or "@" and followed by ":" or "/".
# SQL string literals are single-quoted: double-quoted strings are a legacy
# SQLite quirk that recent sqlite3 shells reject by default ("no such column").
# Any single quote in the hostname is doubled so it cannot end the literal early.
sq="'"
roothost_sql="${roothost//${sq}/${sq}${sq}}"
sqlite3 "${filename}" "SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = 'skipped' AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB '*[/@]${roothost_sql}[:/]*'"