3 Commits

Author SHA1 Message Date
  JustAnotherArchivist f3bec23348 Remove filtering of onsite URLs because it's unreliable 4 months ago
  JustAnotherArchivist 53535b925a Add wpull2-extract-ignored-offsite and extract-urls-for-archiveteam-projects 4 months ago
  JustAnotherArchivist 0432bd00c2 Avoid float roundtrip for integer values 5 months ago
3 changed files with 49 additions and 3 deletions
Split View
  1. +25
    -0
      extract-urls-for-archiveteam-projects
  2. +4
    -3
      sum-sizes
  3. +20
    -0
      wpull2-extract-ignored

+ 25
- 0
extract-urls-for-archiveteam-projects View File

@@ -0,0 +1,25 @@
#!/bin/bash
# extract-urls-for-archiveteam-projects PREFIX
#
# Reads URLs (one per line) from stdin and fans them out to one output file
# per project: every line containing one of a project's keywords is written to
# "PREFIX-<project>". Matching uses fixed strings, case-insensitively, and
# always treats the input as text (grep -Fai). Nothing is written to stdout.
#
# Exit status:
#   1  wrong number of arguments, --help/-h, a PREFIX containing * ? or [,
#      or something matching PREFIX* already exists
#   otherwise the exit status of tee

if [[ $# -ne 1 || "$1" == '--help' || "$1" == '-h' ]]; then
  printf 'Usage: extract-urls-for-archiveteam-projects PREFIX\n' >&2
  printf 'Reads URLs from stdin, extracts interesting for the different currently relevant AT projects into files prefixed by PREFIX\n' >&2
  exit 1
fi

prefix="$1"

# PREFIX is used as part of a glob pattern below, so glob metacharacters in it
# would make the "already exists" check match the wrong names.
if [[ "${prefix}" == *[*?[]* ]]; then
  printf 'Error: prefixes containing * ? [ not supported\n' >&2
  exit 1
fi

# Refuse to clobber or mix with earlier output: bail out if the glob PREFIX*
# matches anything.
if compgen -G "${prefix}*" >/dev/null; then
  printf 'Error: there already exist files starting with %q\n' "${prefix}" >&2
  exit 1
fi

# Bash does not wait for process substitutions, so without extra care the
# script could exit while the greps are still writing their output files.
# Every grep inherits fd 3, which is the write end of the pipe to cat; cat only
# sees EOF (and the pipeline only finishes) once tee and all greps have exited.
# Nothing is ever written to fd 3, so cat produces no output.
{
  tee \
    >(grep -Fai imgur >"${prefix}-imgur") \
    >(grep -Fai -e mediafire -e mfi.re >"${prefix}-mediafire") \
    >(grep -Fai pastebin.com >"${prefix}-pastebin") \
    >(grep -Fai -e blogspot -e blogger >"${prefix}-blogger") \
    >(grep -Fai -e telegram.me -e //t.me/ >"${prefix}-telegram") \
    >/dev/null
} 3>&1 | cat

# Report tee's status, as the script did before the pipeline was introduced.
exit "${PIPESTATUS[0]}"

+ 4
- 3
sum-sizes View File

@@ -56,12 +56,13 @@ for line in sys.stdin:
print(f'Skipping line with unknown unit: {origLine}', file = sys.stderr)
continue
try:
number = float(number)
if number.strip('0123456789') == '':
number = int(number)
else:
number = float(number)
except ValueError as e:
print(f'Skipping line with unparseable number: {origLine}', file = sys.stderr)
continue
if number.is_integer():
number = int(number)
sum += int(number * units[unit])

# Special case because log(0) is kinda bad...


+ 20
- 0
wpull2-extract-ignored View File

@@ -0,0 +1,20 @@
#!/bin/bash
# wpull2-extract-ignored [FILENAME]
#
# Prints the URLs of a wpull 2.x SQLite database that were ignored, one per
# line, to stdout. FILENAME defaults to "wpull.db" in the current directory
# whenever the script is not called with exactly one argument.
#
# A URL is selected when its queued_urls row has status 'skipped', no
# status_code, try_count 1 and no inline_level.
#
# Exit status: 1 if FILENAME is not a regular file, otherwise that of sqlite3
# (or of the usage printf for --help/-h).

if [[ "$1" == '--help' || "$1" == '-h' ]]; then
  printf 'Usage: %q [FILENAME]\n' "$0" >&2
  printf 'Prints all ignored URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
  exit
fi

if [[ $# -eq 1 ]]; then
  filename="$1"
else
  filename=wpull.db
fi

if [[ ! -f "${filename}" ]]; then
  printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
  exit 1
fi

# A leading dash would make sqlite3 parse the filename as an option; an
# explicit ./ refers to the same file and cannot be mistaken for one.
if [[ "${filename}" == -* ]]; then
  filename="./${filename}"
fi

# The string literal must be single-quoted: in SQL, "skipped" is an
# identifier, and SQLite only falls back to treating it as a string when its
# legacy double-quoted-string setting is enabled, which newer sqlite3 CLI
# builds turn off by default.
query="SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = 'skipped' AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL"

sqlite3 "${filename}" "${query}"

Loading…
Cancel
Save