The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

44 lignes
1.7 KiB

  #!/bin/bash
  # Prints all ignored offsite URLs from a wpull 2.x SQLite DB to stdout.
  #
  # Usage: SCRIPT [FILENAME]
  #   FILENAME  path to the wpull database (default: wpull.db)
  #
  # Exit status is non-zero when the DB file is missing, cannot be queried,
  # does not have exactly one root URL, or the root hostname cannot be used
  # safely in a GLOB pattern.

  # Print the hostname of URL $1: strips the scheme, the path, a trailing
  # numeric port and any userinfo ("user:pass@").
  root_host_of() {
    local host="${1#*//}"
    host="${host%%/*}"
    if [[ "${host}" =~ :[0-9][0-9]*$ ]]; then host="${host%:*}"; fi
    if [[ "${host}" == *@* ]]; then host="${host##*@}"; fi
    printf '%s\n' "${host}"
  }

  # Succeed if $1 contains a character that is special in an SQLite GLOB
  # pattern. '[' is included because it opens a character class (this is what
  # an IPv6 literal host such as "[::1]" would trigger).
  has_glob_chars() {
    [[ "$1" == *'*'* || "$1" == *'?'* || "$1" == *'['* ]]
  }

  # Print $1 as a single-quoted SQL string literal, doubling embedded quotes.
  sql_quote() {
    local q="'"
    printf '%s\n' "${q}${1//${q}/${q}${q}}${q}"
  }

  # Entry point; arguments are the script's own command-line arguments.
  main() {
    local filename out root roothost pattern
    local -a roots=()

    if [[ "$1" == '--help' || "$1" == '-h' ]]; then
      printf 'Usage: %q [FILENAME]\n' "$0" >&2
      printf 'Prints all ignored offsite URLs from the wpull 2.x SQLite DB at FILENAME (default: wpull.db) to stdout\n' >&2
      return
    fi

    if [[ $# -eq 1 ]]; then
      filename="$1"
    else
      filename=wpull.db
    fi

    if [[ ! -f "${filename}" ]]; then
      printf 'Error: %q does not exist or is not a regular file\n' "${filename}" >&2
      return 1
    fi

    # Check that the number of root URLs is 1; since we need to filter by the host, more than one root URL gets complicated.
    # This query is unfortunately slow due to a lack of index since it isn't needed for other operations.
    # Command substitution (rather than process substitution) is used so that a failing sqlite3 is detected.
    if ! out="$(sqlite3 "${filename}" 'SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE level = 0')"; then
      printf 'Error: could not query root URLs from %q\n' "${filename}" >&2
      return 1
    fi
    # A here-string always supplies one line, so only split non-empty output.
    if [[ -n "${out}" ]]; then
      readarray -t roots <<<"${out}"
    fi
    if [[ ${#roots[@]} -eq 0 ]]; then
      printf 'Error: no root URL found.\n' >&2
      return 1
    fi
    if [[ ${#roots[@]} -ne 1 ]]; then
      printf 'Error: jobs with more than one root URL are not supported.\n' >&2
      return 1
    fi
    root="${roots[0]}"

    # Extract root hostname
    roothost="$(root_host_of "${root}")"

    # Bail if there are weird chars in the hostname; this shouldn't be possible.
    if has_glob_chars "${roothost}"; then
      printf 'Error: root hostname %q contains glob chars.\n' "${roothost}" >&2
      return 1
    fi

    # GLOB is case-sensitive, but wpull normalises URLs, including lowercasing hostnames, so this is not a problem.
    # String literals are single-quoted: double-quoted ones are an SQLite
    # legacy quirk that newer sqlite3 shells reject.
    pattern="*[/@]${roothost}[:/]*"

    # Go!
    sqlite3 "${filename}" "SELECT url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = 'skipped' AND status_code IS NULL AND try_count = 1 AND inline_level IS NULL AND url NOT GLOB $(sql_quote "${pattern}")"
  }

  # Last command: the script's exit status is main's return status.
  main "$@"