The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

65 line
5.9 KiB

  1. #!/bin/bash
  2. if [[ "$1" == '--test' ]]
  3. then
  4. ## Self-test
  5. # Notes:
  6. # - Only the response lines are included here; the requests would be skipped anyway.
  7. # - I didn't bother adjusting the status code message since it's not used for parsing
  8. # - Only one example of an ERROR is included because the error message doesn't matter anyway
  9. diff -q <("$0" <<-EOF
  10. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/success-200’: 200 OK. Length: 1234 [text/html; charset=utf-8].
  11. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/success-204’: 204 OK. Length: 1234 [text/html; charset=utf-8].
  12. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/success-304’: 304 OK. Length: 1234 [text/html; charset=utf-8].
  13. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/ok-401’: 401 OK. Length: 1234 [text/html; charset=utf-8].
  14. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/ok-403’: 403 OK. Length: 1234 [text/html; charset=utf-8].
  15. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/ok-404’: 404 OK. Length: 1234 [text/html; charset=utf-8].
  16. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/ok-405’: 405 OK. Length: 1234 [text/html; charset=utf-8].
  17. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/ok-410’: 410 OK. Length: 1234 [text/html; charset=utf-8].
  18. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/ok-301’: 301 OK. Length: 1234 [text/html; charset=utf-8].
  19. 2020-09-10 23:54:25,000 - wpull.processor.base - ERROR - Fetching ‘https://example.org/ok-dns-nxdomain’ encountered an error: DNS resolution failed: [Errno -2] Name or service not known
  20. 2020-09-10 23:54:25,000 - wpull.processor.base - ERROR - Fetching ‘https://example.org/ok-dns-norecord’ encountered an error: DNS resolution failed: [Errno -5] No address associated with hostname
  21. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-429’: 429 OK. Length: 1234 [text/html; charset=utf-8].
  22. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-418’: 418 OK. Length: 1234 [text/html; charset=utf-8].
  23. 2020-09-10 23:54:25,000 - wpull.processor.base - ERROR - Fetching ‘https://example.org/error-dns’ encountered an error: DNS resolution error: All nameservers failed to answer the query ...
  24. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-429-successful-retry’: 429 OK. Length: 1234 [text/html; charset=utf-8].
  25. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-429-successful-retry’: 200 OK. Length: 1234 [text/html; charset=utf-8].
  26. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-429-successful-second-retry’: 429 OK. Length: 1234 [text/html; charset=utf-8].
  27. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-429-successful-second-retry’: 429 OK. Length: 1234 [text/html; charset=utf-8].
  28. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-429-successful-second-retry’: 200 OK. Length: 1234 [text/html; charset=utf-8].
  29. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-429-successful-retry-with-redirect’: 429 OK. Length: 1234 [text/html; charset=utf-8].
  30. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-429-successful-retry-with-redirect’: 302 OK. Length: 1234 [text/html; charset=utf-8].
  31. 2020-09-10 23:54:25,000 - wpull.processor.base - ERROR - Fetching ‘https://example.org/error-dns-successful-retry’ encountered an error: DNS resolution failed: [Errno -2] Name or service not known
  32. 2020-09-10 23:54:25,000 - wpull.processor.web - INFO - Fetched ‘https://example.org/error-dns-successful-retry’: 200 OK. Length: 1234 [text/html; charset=utf-8].
  33. EOF
  34. ) <(cat <<-EOF
  35. https://example.org/error-429
  36. https://example.org/error-418
  37. https://example.org/error-dns
  38. EOF
  39. ) >/dev/null
  40. if [[ $? -eq 0 ]]
  41. then
  42. echo 'Success!'
  43. exit 0
  44. else
  45. echo 'Fail!'
  46. exit 1
  47. fi
  48. fi
  49. if [[ -t 0 || "$1" == '--help' ]]
  50. then
  51. echo 'Usage: pipe a wpull log (or meta WARC, decompressed) to this script' >&2
  52. echo 'Produces a list of URLs that were attempted but not retrieved successfully. They are output in the order of the first failure.' >&2
  53. exit 1
  54. fi
  55. # Logic: extract all lines of interest, process them such that they only contain a + or - indicating success or error plus the URL, filter the errors with the successes in awk.
  56. # The output order is as each URL appears for the first time in the log. Since awk doesn't preserve the insertion order on iteration, keep the line number and sort the output on that.
  57. grep -F -e ' - ERROR - Fetching ‘' -e ' - INFO - Fetched ‘' | sed 's,^.*‘\(.*\)’: \(200\|204\|30[0-8]\|401\|403\|404\|405\|410\) .*$,+ \1,; s,^.*‘\(.*\)’ encountered an error: DNS resolution failed: \[Errno -\(2\] Name or service not known\|5\] No address associated with hostname\)$,+ \1,; s,^.*‘\(.*\)’.*$,- \1,' | awk '/^\+ / { successes[$2] = 1; } /^- / && ! ($2 in successes) { errors[$2] = NR; } END { for (url in errors) { if (! (url in successes)) { print errors[url] " " url; } } }' | sort -n | cut -d' ' -f2-
  58. # Faster version without preserving order: grep -F -e ' - ERROR - Fetching ‘' -e ' - INFO - Fetched ‘' | sed 's,^.*‘\(.*\)’: \(200\|204\|304\|401\|403\|404\|405\|410\) .*$,+ \1,; s,^.*‘\(.*\)’.*$,- \1,' | awk '/^\+ / { successes[$2] = 1; } /^- / && ! ($2 in successes) { errors[$2] = 1; } END { for (url in errors) { if (! (url in successes)) { print url; } } }'