Author | SHA1 | Message | Date |
---|---|---|---|
JustAnotherArchivist | f3bec23348 | Remove filtering of onsite URLs because it's unreliable. It also erroneously filters out offsite URLs that contain the root domain, and this isn't fixable without using regex, which isn't always available in the SQLite CLI before version 3.36.0. | 5 months ago |
JustAnotherArchivist | 53535b925a | Add wpull2-extract-ignored-offsite and extract-urls-for-archiveteam-projects | 5 months ago |
JustAnotherArchivist | 0432bd00c2 | Avoid float roundtrip for integer values | 6 months ago |
@@ -0,0 +1,25 @@
#!/bin/bash
# extract-urls-for-archiveteam-projects: fan a stream of URLs out into one file
# per currently relevant ArchiveTeam project, each file named PREFIX-<project>.
if [[ $# -ne 1 || "$1" == '--help' || "$1" == '-h' ]]; then
	printf 'Usage: extract-urls-for-archiveteam-projects PREFIX\n' >&2
	printf 'Reads URLs from stdin, extracts interesting for the different currently relevant AT projects into files prefixed by PREFIX\n' >&2
	exit 1
fi
prefix="$1"
# Glob metacharacters in the prefix would break the compgen existence check below.
if [[ "${prefix}" == *[*?[]* ]]; then
	printf 'Error: prefixes containing * ? [ not supported\n' >&2
	exit 1
fi
# Refuse to clobber any pre-existing output files for this prefix.
if compgen -G "${prefix}*" >/dev/null; then
	printf 'Error: there already exist files starting with %q\n' "${prefix}" >&2
	exit 1
fi
# Duplicate stdin into one grep per project via process substitution.
# grep flags: -F fixed strings, -a treat input as text, -i case-insensitive.
tee \
	>(grep -Fai imgur >"${prefix}-imgur") \
	>(grep -Fai -e mediafire -e mfi.re >"${prefix}-mediafire") \
	>(grep -Fai pastebin.com >"${prefix}-pastebin") \
	>(grep -Fai -e blogspot -e blogger >"${prefix}-blogger") \
	>(grep -Fai -e telegram.me -e //t.me/ >"${prefix}-telegram") \
	>/dev/null
# Wait for the process substitutions to drain so the output files are complete
# before the script exits (bash >= 4.4 waits for procsubs in `wait`).
wait
@@ -56,12 +56,13 @@ for line in sys.stdin:
 		print(f'Skipping line with unknown unit: {origLine}', file = sys.stderr)
 		continue
 	try:
-		number = float(number)
+		if number.strip('0123456789') == '':
+			number = int(number)
+		else:
+			number = float(number)
 	except ValueError as e:
 		print(f'Skipping line with unparseable number: {origLine}', file = sys.stderr)
 		continue
-	if number.is_integer():
-		number = int(number)
 	sum += int(number * units[unit])
 # Special case because log(0) is kinda bad...
@@ -0,0 +1,20 @@
#!/bin/bash
# wpull2-extract-ignored-offsite: print all ignored URLs from a wpull 2.x SQLite DB.
if [[ "$1" == '--help' || "$1" == '-h' ]]; then
	printf 'Usage: %q [FILENAME]\n' "$0" >&2
	printf 'Prints all ignored URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
	exit
fi
if [[ $# -eq 1 ]]
then
	filename="$1"
else
	filename=wpull.db
fi
# The rendered source was garbled to "$(unknown)" here; the intended variable is
# clearly ${filename}, which is assigned above and otherwise unused.
if [[ ! -f "${filename}" ]]
then
	printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
	exit 1
fi
# Ignored URLs: status 'skipped', never fetched (no status code, first try),
# and not a page requisite (inline_level IS NULL).
# SQL string literals use single quotes; the original's double-quoted "skipped"
# only worked via SQLite's double-quoted-string identifier-fallback misfeature.
sqlite3 "${filename}" "SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = 'skipped' AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL"