The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

44 lines
1.7 KiB

  #!/bin/bash
  # Print all ignored offsite URLs recorded in a wpull 2.x SQLite database.
  #
  # Usage:     SCRIPT [FILENAME]
  # Arguments: FILENAME - path to the wpull database (default: wpull.db)
  # Outputs:   one URL per line on stdout; usage and errors on stderr
  # Exit code: 0 for --help/-h; 1 if the file is missing, the root URLs cannot
  #            be read, or the job does not have exactly one root URL;
  #            otherwise the exit status of the final sqlite3 invocation.

  if [[ "$1" == '--help' || "$1" == '-h' ]]; then
    printf 'Usage: %q [FILENAME]\n' "$0" >&2
    printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
    exit
  fi

  if [[ $# -eq 1 ]]; then
    filename="$1"
  else
    filename=wpull.db
  fi

  if [[ ! -f "${filename}" ]]; then
    printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
    exit 1
  fi

  # A relative name starting with '-' would be parsed by sqlite3 as an option.
  if [[ "${filename}" == -* ]]; then
    filename="./${filename}"
  fi

  # Check that the number of root URLs is 1; since we need to filter by the
  # host, more than one root URL gets complicated.
  # This query is unfortunately slow due to a lack of index since it isn't
  # needed for other operations.
  root_query='SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0'
  # Capture the output via command substitution so that a failing sqlite3
  # (missing binary, corrupt or foreign DB) is reported as such instead of
  # being mistaken for a wrong number of root URLs.
  if ! root_output="$(sqlite3 "${filename}" "${root_query}")"; then
    printf 'Error: could not read the root URLs from %q\n' "${filename}" >&2
    exit 1
  fi

  roots=()
  # A here-string always yields at least one (possibly empty) line, so only
  # split when there is actual output.
  if [[ -n "${root_output}" ]]; then
    readarray -t roots <<< "${root_output}"
  fi

  if [[ ${#roots[@]} -eq 0 ]]; then
    printf 'Error: no root URL found in %q.\n' "${filename}" >&2
    exit 1
  fi
  if [[ ${#roots[@]} -ne 1 ]]; then
    printf 'Error: jobs with more than one root URL are not supported.\n' >&2
    exit 1
  fi
  root="${roots[0]}"

  # Extract root hostname: drop the scheme, then everything from the first
  # slash, then a trailing numeric port, then any userinfo before the last '@'.
  roothost="${root#*//}"
  roothost="${roothost%%/*}"
  if [[ "${roothost}" =~ :[0-9][0-9]*$ ]]; then roothost="${roothost%:*}"; fi
  if [[ "${roothost}" == *@* ]]; then roothost="${roothost##*@}"; fi

  # Bail if there are weird chars in the hostname; this shouldn't be possible.
  if [[ "${roothost}" == *[*?]* ]]; then
    printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
    exit 1
  fi

  # '[' opens a character class in GLOB, and IPv6 literal hosts such as [::1]
  # contain it; '[[]' matches a literal '['. A ']' outside a class is literal.
  glob_host=${roothost//\[/[[]}
  # Double any single quote so the host cannot terminate the SQL string literal.
  sql_host=${glob_host//\'/\'\'}

  # GLOB is case-sensitive, but wpull normalises URLs, including lowercasing
  # hostnames, so this is not a problem.
  # SQL string literals are single-quoted: double-quoted ones are parsed as
  # identifiers first and are rejected by recent sqlite3 command-line shells.
  # Go!
  sqlite3 "${filename}" "SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = 'skipped' AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB '*[/@]${sql_host}[:/]*'"