The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

74 lignes
2.4 KiB

  #!/bin/bash
  # transfer.notkiska.pw-check-ia
  #
  # Checks that a file uploaded to transfer.notkiska.pw is archived correctly on
  # the Internet Archive (IA): the file is downloaded from transfer.notkiska.pw,
  # the matching WARC record is located through the per-WARC CDX files on IA,
  # the record payload is downloaded from IA, and the SHA-1 hashes are compared.
  #
  # Arguments:
  #   $1 - TKPPATH, the path of the file on transfer.notkiska.pw ("ID/NAME").
  # Environment:
  #   TKPCHECK_CACHE_DIR - optional directory used to cache downloaded CDX files.
  # Exit status:
  #   0 if both hashes match; 1 on usage, download, lookup or mismatch errors.
  # External tools used: curl, python3, sed, xargs, grep (with -P), zgrep, zcat,
  # awk, sha1sum, tee.

  set -e
  # Without pipefail a failed curl at the head of a pipeline goes unnoticed and
  # the hash of an empty or partial stream is compared instead.
  set -o pipefail

  if [[ $# -ne 1 || "$1" == '-h' || "$1" == '--help' ]]; then
    echo 'Usage: transfer.notkiska.pw-check-ia TKPPATH'
    echo 'TKPPATH is the path of a file on transfer.notkiska.pw, e.g. "123F3V/twitter-#qanon"'
    echo 'Checks that the file is archived correctly on IA by downloading both copies and comparing the SHA-1.'
    echo 'If the TKPCHECK_CACHE_DIR environment variable is set, it is used as a cache for the CDXs to avoid redownloading them from IA on every check.'
    exit 1
  fi

  # A cache path that exists but is not a directory can never work; a missing
  # directory is fine because it is created on demand by fetch_cdx.
  if [[ "${TKPCHECK_CACHE_DIR}" ]]; then
    if [[ -e "${TKPCHECK_CACHE_DIR}" && ! -d "${TKPCHECK_CACHE_DIR}" ]]; then
      echo "Error: ${TKPCHECK_CACHE_DIR} is not a directory." >&2
      exit 1
    fi
  fi

  #######################################
  # Writes the (still gzip-compressed) CDX file at the given URL to stdout.
  # With TKPCHECK_CACHE_DIR set, the file is served from the cache and is
  # downloaded into it first when missing.
  # Globals:   TKPCHECK_CACHE_DIR (read)
  # Arguments: $1 - CDX URL below https://archive.org/download/
  # Outputs:   CDX bytes on stdout, progress and errors on stderr
  # Returns:   0 on success, non-zero if the download failed
  #######################################
  fetch_cdx() {
    local cdx="$1"
    if [[ -z "${TKPCHECK_CACHE_DIR}" ]]; then
      echo "Fetching ${cdx}" >&2
      curl -fL "${cdx}"
      return
    fi

    # Cache layout mirrors the IA path: <cache dir>/<item>/<file>.
    local cdxfn="${cdx#https://archive.org/download/}"
    local cached="${TKPCHECK_CACHE_DIR}/${cdxfn}"
    mkdir -p "${cached%/*}" || return 1
    if [[ ! -e "${cached}" ]]; then
      echo "Fetching ${cdx} into local cache" >&2
      # Download under a temporary name and rename only on success, so that a
      # failed or interrupted transfer never leaves a broken file in the cache.
      local partial="${cached}.tmp.$$"
      if ! curl -fL "${cdx}" >"${partial}"; then
        rm -f -- "${partial}"
        echo "Error: failed to fetch ${cdx}" >&2
        return 1
      fi
      mv -f -- "${partial}" "${cached}" || return 1
    fi
    cat "${cached}"
  }

  file="$1"
  # Only the ID part (before the slash) is used to find the CDX entry.
  fileid="${file%/*}"

  echo "Downloading from transfer.notkiska.pw" >&2
  # tee copies the hash line to stderr so the user sees it while it is captured.
  if ! tkphash="$(curl -f "https://transfer.notkiska.pw/${file}" | sha1sum | tee /dev/fd/2)"; then
    echo "Error: failed to download ${file} from transfer.notkiska.pw" >&2
    exit 1
  fi

  echo "Retrieving WARC list from viewer" >&2
  # Search the viewer for the relevant jobs, fetch each job page, and extract
  # the archive.org download links of all WARCs listed there.
  mapfile -t warcs < <(
    curl -s "https://archive.fart.website/archivebot/viewer/api/v1/search.json?q=urls-transfer.notkiska.pw-transfer.notkiska.pw-" \
      | python3 -c 'import json,sys; [print(x["job_id"]) for x in json.loads(sys.stdin.read())["results"]]' \
      | sed 's,^,https://archive.fart.website/archivebot/viewer/job/,' \
      | xargs curl -s \
      | grep -Po 'href="\Khttps://archive.org/download/[^/"]+/[^/"]+-\d\d\d\d\d\.warc\.gz(?=")'
  )
  if [[ ${#warcs[@]} -eq 0 ]]; then
    echo "Error: no WARCs found in the viewer, cannot continue" >&2
    exit 1
  fi

  # Collect every CDX line mentioning the file ID across all WARCs.
  cdxLines=()
  for warc in "${warcs[@]}"; do
    # foo.warc.gz -> foo.warc.os.cdx.gz
    cdx="${warc%.gz}.os.cdx.gz"
    # -O appends to the lines already collected from previous WARCs. zgrep's
    # stderr is silenced because an unavailable CDX yields non-gzip input.
    mapfile -t -O "${#cdxLines[@]}" cdxLines < <(
      fetch_cdx "${cdx}" | zgrep -F "/${fileid}/" 2>/dev/null | tee /dev/fd/2
    )
  done

  if [[ ${#cdxLines[@]} -ne 1 ]]; then
    echo "Not exactly one matching CDX line found, cannot continue" >&2
    exit 1
  fi

  # Fields 9, 10 and 11 of the CDX line are used as record length, record
  # offset and IA path of the WARC, respectively.
  read -r -a cdxFields <<<"${cdxLines[0]}"
  length="${cdxFields[8]}"
  offset="${cdxFields[9]}"
  iapath="${cdxFields[10]}"
  if [[ ! "${length}" =~ ^[0-9]+$ || ! "${offset}" =~ ^[0-9]+$ || "${length}" -eq 0 || -z "${iapath}" ]]; then
    echo "Error: could not parse length, offset and path from the CDX line" >&2
    exit 1
  fi

  # HTTP byte ranges are inclusive on both ends, so a record of ${length} bytes
  # starting at ${offset} ends at offset + length - 1.
  rangeEnd=$((offset + length - 1))
  echo "Fetching ${offset}-${rangeEnd} from IA ${iapath}" >&2
  # The awk filter drops everything up to and including the second empty (CR
  # only) line, i.e. the WARC and HTTP header blocks, and also drops later
  # CR-only lines such as the record terminator.
  # NOTE(review): a payload that itself contains CR-only lines would be altered
  # by this filter and reported as a mismatch.
  # zcat problems are tolerated as before; they surface as a hash mismatch.
  if ! iahash="$(
    curl -fL --range "${offset}-${rangeEnd}" "https://archive.org/download/${iapath}" \
      | { zcat 2>/dev/null || true; } \
      | awk '/^\r$/ {empty+=1; next} (empty >= 2)' \
      | sha1sum \
      | tee /dev/fd/2
  )"; then
    echo "Error: failed to download the record from IA" >&2
    exit 1
  fi

  if [[ "${tkphash}" == "${iahash}" ]]; then
    echo OK >&2
    exit 0
  else
    echo "Hash mismatch!" >&2
    exit 1
  fi