The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

74 lines
2.4 KiB

  1. #!/bin/bash
  2. set -e
  3. if [[ $# -ne 1 || "$1" == '-h' || "$1" == '--help' ]]
  4. then
  5. echo 'Usage: transfer.notkiska.pw-check-ia TKPPATH'
  6. echo 'TKPPATH is the path of a file on transfer.notkiska.pw, e.g. "123F3V/twitter-#qanon"'
  7. echo 'Checks that the file is archived correctly on IA by downloading both copies and comparing the SHA-1.'
  8. echo 'If the TKPCHECK_CACHE_DIR environment variable is set, it is used as a cache for the CDXs to avoid redownloading them from IA on every check.'
  9. exit 1
  10. fi
  11. if [[ "${TKPCHECK_CACHE_DIR}" ]]
  12. then
  13. if [[ -e "${TKPCHECK_CACHE_DIR}" && ! -d "${TKPCHECK_CACHE_DIR}" ]]
  14. then
  15. echo "Error: ${TKPCHECK_CACHE_DIR} is not a directory." >&2
  16. exit 1
  17. fi
  18. fi
  19. file="$1"
  20. fileid="${file%/*}"
  21. filename="${file#*/}"
  22. echo "Downloading from transfer.notkiska.pw" >&2
  23. tkphash="$(curl "https://transfer.notkiska.pw/${file}" | sha1sum | tee /dev/fd/2)"
  24. echo "Retrieving WARC list from viewer" >&2
  25. mapfile -t warcs < <(curl -s "https://archive.fart.website/archivebot/viewer/api/v1/search.json?q=urls-transfer.notkiska.pw-transfer.notkiska.pw-" | python3 -c 'import json,sys; [print(x["job_id"]) for x in json.loads(sys.stdin.read())["results"]]' | sed 's,^,https://archive.fart.website/archivebot/viewer/job/,' | xargs curl -s | grep -Po 'href="\Khttps://archive.org/download/[^/"]+/[^/"]+-\d\d\d\d\d\.warc\.gz(?=")')
  26. cdxLines=()
  27. for warc in "${warcs[@]}"
  28. do
  29. cdx="${warc::-3}.os.cdx.gz"
  30. mapfile -t -O ${#cdxLines[@]} cdxLines < <(
  31. {
  32. if [[ "${TKPCHECK_CACHE_DIR}" ]]
  33. then
  34. cdxfn="${cdx:29}"
  35. mkdir -p "${TKPCHECK_CACHE_DIR}/${cdxfn%/*}"
  36. if [[ ! -e "${TKPCHECK_CACHE_DIR}/${cdxfn}" ]]
  37. then
  38. echo "Fetching ${cdx} into local cache" >&2
  39. curl -L "${cdx}" >"${TKPCHECK_CACHE_DIR}/${cdxfn}"
  40. fi
  41. cat "${TKPCHECK_CACHE_DIR}/${cdxfn}"
  42. else
  43. echo "Fetching ${cdx}" >&2
  44. curl -L "${cdx}"
  45. fi
  46. } | zgrep -F "/${fileid}/" 2>/dev/null | tee /dev/fd/2
  47. )
  48. done
  49. if [[ ${#cdxLines[@]} -ne 1 ]]
  50. then
  51. echo "Not exactly one matching CDX line found, cannot continue" >&2
  52. exit 1
  53. fi
  54. read -r length offset iapath < <(awk '{print $9 " " $10 " " $11}' <<<"${cdxLines[0]}")
  55. echo "Fetching ${offset}-$((${offset}+${length})) from IA ${iapath}" >&2
  56. iahash="$(curl -L --range "${offset}-$((${offset}+${length}))" "https://archive.org/download/${iapath}" | zcat 2>/dev/null | awk '/^\r$/ {empty+=1; next} (empty >= 2)' | sha1sum | tee /dev/fd/2)"
  57. if [[ "${tkphash}" == "${iahash}" ]]
  58. then
  59. echo OK >&2
  60. exit 0
  61. else
  62. echo "Hash mismatch!" >&2
  63. exit 1
  64. fi