The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

445 lines
19 KiB

  1. #!/usr/bin/env bash
  2. printf '\e[1;33mWARNING: This script is experimental and is not intended for productive use yet. Use at your own risk.\e[0m\n\n' >&2
  3. # Requires Bash 4.3+
  4. if [[ "$1" == '-h' || "$1" == '--help' ]]; then
  5. printf 'Usage: %q\n' "$0"
  6. printf '\n'
  7. printf 'Uploads files from each subdirectory of the PWD into an IA item using the dirname as the identifier; files whose name starts with the identifier are skipped.\n'
  8. printf 'Configuration happens through a subdir/.dir-to-ia.config file, a Bash script that gets sourced to get the config values.\n'
  9. printf "Use '%q --example-config' to get an example configuration with explanations.\n" "$0"
  10. printf 'The upload log for each item gets written to the subdir/.dir-to-ia.log file.\n'
  11. printf 'It is assumed that no other changes are made to the items while dir-to-ia is running.\n'
  12. printf 'Multiple instances can safely run in the same directory to upload multiple items in parallel. Uploads within an item are always sequential.\n'
  13. printf '\n'
  14. printf 'This program is dedicated to fireonlive.\n'
  15. exit 1
  16. fi
  17. # Default config values
  18. sha256=no
  19. rm=no
  20. rmwait=yes
  21. sha1check=no
  22. clobber=no
  23. derive=no
  24. autosizehint=no
  25. iauploadstreamopts=()
  26. if [[ "$1" == '--example-config' ]]; then
  27. cat <<-EOF
  28. # Calculate SHA-256 hash of each file after uploading
  29. sha256=${sha256}
  30. # Remove local file after successful upload; when disabled, only files without a filename match in the IA item get uploaded.
  31. rm=${rm}
  32. # Delay removal until IA processed the upload
  33. rmwait=${rmwait}
  34. # Run an SHA-1 check against the local file after the upload using the hash calculated by IA; if rm and sha1check are enabled, rmwait must also be enabled.
  35. sha1check=${sha1check}
  36. # Clobber existing files in IA item (no = existing copy is moved to history/files/ by IA); only meaningful with rm=yes.
  37. clobber=${clobber}
  38. # Queue derive after upload (applied after every file!)
  39. derive=${derive}
  40. # Automatic size hint for item creation using the current directory size. When more data will be added, do not use this and set a value in iauploadstreamopts instead if a size hint is desired.
  41. autosizehint=${autosizehint}
  42. # Custom options for ia-upload-stream, as an array of args; this can be used to choose the part size and concurrency, for example, using (--part-size 1G --concurrency 4).
  43. iauploadstreamopts=()
  44. # Item metadata (array with 'key:value' elements); the only mandatory variable with no default
  45. metadata=('collection:opensource' "date:$(printf '%(%Y-%m-%d)T')")
  46. EOF
  47. exit
  48. fi
  49. function tsprintf {
  50. # Bash 5.0+, ms is empty below that, will break in the year 2286
  51. local t="${EPOCHREALTIME:--1}" ts ms
  52. ms="${t:10:4}"
  53. printf -v ts '%(%Y-%m-%d %H:%M:%S)T%s' "${t%%.*}" "${ms}";
  54. if [[ $# -eq 1 ]]; then
  55. printf "%s %s\n" "${ts}" "$@";
  56. else
  57. # First argument is the format, rest are arguments to printf; pass through `while read` loop to have a better chance of line buffering.
  58. {
  59. printf "%s " "${ts}";
  60. printf "$1" "${@:2}";
  61. printf '\n';
  62. } | while IFS= read -r line; do printf '%s\n' "${line}"; done
  63. fi
  64. }
  65. function maybets {
  66. # Add a timestamp to each line that doesn't start with one.
  67. local line
  68. while IFS= read -r line; do
  69. if [[ "${line}" != [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\ [0-9][0-9]:[0-9][0-9]:[0-9][0-9]* ]]; then
  70. tsprintf "${line}"
  71. else
  72. printf '%s\n' "${line}"
  73. fi
  74. done
  75. }
  76. if [[ "${DEBUG}" ]]; then
  77. function dbgprintf { tsprintf "[DEBUG] $1" "${@:2}" >&2; }
  78. else
  79. function dbgprintf { :; }
  80. fi
  81. scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
  82. for cmd in ia-upload-stream ia-tasks; do
  83. if [[ ! -x "${scriptpath}/${cmd}" ]]; then
  84. printf 'Error: %q not found, make sure it is in the same directory as %q\n' "${cmd}" "$0" >&2
  85. exit 1
  86. fi
  87. done
  88. while :; do
  89. # Prefilter the subdirs for ones that have actual files in them. This avoids cycling through all dirs when rm=yes.
  90. # Include dirs with .dirty file so the warning about those is always emitted.
  91. readarray -d $'\0' -t candidateDirs < <(find -type f \( \! -name '.dir-to-ia.*' -o -name .dir-to-ia.dirty \) -print0 | sed -z 's,^\./,,; s,/.*$,,' | uniq -z)
  92. for dir in "${candidateDirs[@]}"; do
  93. dbgprintf 'Considering directory %q' "${dir}"
  94. identifier="${dir}"
  95. if [[ ! "${dir}" =~ ^[a-zA-Z0-9] ]]; then
  96. continue
  97. fi
  98. if [[ ! -e "${dir}/.dir-to-ia.config" ]]; then
  99. continue
  100. fi
  101. if [[ -e "${dir}/.dir-to-ia.dirty" ]]; then
  102. tsprintf "Warning: %q is dirty, skipping" "${dir}" >&2
  103. continue
  104. fi
  105. # Everything from here on is executed in a subshell so that the config sourcing can't affect other items.
  106. # It would be possible to do that without a subshell, but this is easier.
  107. {
  108. flock --exclusive --nonblock 3 || continue
  109. (
  110. dbgprintf 'Processing %q' "${dir}"
  111. # Source and check configuration
  112. . "${dir}/.dir-to-ia.config" || { tsprintf 'Sourcing %q failed' "${dir}/.dir-to-ia.config" >&2; exit 1; }
  113. configbroken=
  114. for v in sha256 rm rmwait sha1check clobber derive; do
  115. if [[ "${!v}" != 'yes' && "${!v}" != 'no' ]]; then
  116. tsprintf "Error in %q: %s must be 'yes' or 'no'" "${dir}/.dir-to-ia.config" "${v}" >&2
  117. configbroken=yes
  118. fi
  119. done
  120. if read -r _ attrs _ < <(declare -p iauploadstreamopts 2>/dev/null); [[ "${attrs}" != *a* ]]; then
  121. tsprintf 'Error in %q: iauploadstreamopts must be an array' "${dir}/.dir-to-ia.config" >&2
  122. configbroken=yes
  123. fi
  124. if read -r _ attrs _ < <(declare -p metadata 2>/dev/null); [[ "${attrs}" != *a* ]]; then
  125. tsprintf 'Error in %q: metadata missing or not an array' "${dir}/.dir-to-ia.config" >&2
  126. configbroken=yes
  127. else
  128. for f in "${metadata[@]}"; do
  129. if [[ "${f}" != *:* ]]; then
  130. tsprintf 'Error in %q: metadata field missing colon: %s' "${dir}/.dir-to-ia.config" "${f}" >&2
  131. configbroken=yes
  132. fi
  133. done
  134. fi
  135. if [[ "${configbroken}" ]]; then
  136. exit 1
  137. fi
  138. if [[ "${rm}" == 'no' && "${clobber}" == 'yes' ]]; then
  139. tsprintf 'Error in %q: rm=no and clobber=yes is not permitted' "${dir}/.dir-to-ia.config" >&2
  140. exit 1
  141. fi
  142. if [[ "${rm}" == 'yes' && "${sha1check}" == 'yes' && "${rmwait}" == 'no' ]]; then
  143. tsprintf 'Error in %q: rm=yes + sha1check=yes is incompatible with rmwait=no' "${dir}/.dir-to-ia.config" >&2
  144. exit 1
  145. fi
  146. dbgprintf 'Configuration:'
  147. dbgprintf " sha256=${sha256}"
  148. dbgprintf " rm=${rm}"
  149. dbgprintf " rmwait=${rmwait}"
  150. dbgprintf " sha1check=${sha1check}"
  151. dbgprintf " clobber=${clobber}"
  152. dbgprintf " derive=${derive}"
  153. dbgprintf " autosizehint=${autosizehint}"
  154. dbgprintf " iauploadstreamopts=(${iauploadstreamopts[0]+"$(printf %q "${iauploadstreamopts[0]}")"}$(if [[ ${#iauploadstreamopts[@]} -gt 1 ]]; then printf ' %q' "${iauploadstreamopts[@]:1}"; fi))"
  155. dbgprintf " metadata=($(printf %q "${metadata[0]}")$(if [[ ${#metadata[@]} -gt 1 ]]; then printf ' %q' "${metadata[@]:1}"; fi))"
  156. dbgprintf 'Version: commit %q' "$(git -C "${scriptpath}" log --max-count=1 --format=%H)"
  157. # Check for previous batch that might require waiting for IA and/or deletion
  158. # Exits 0 for likely IA-side errors so it gets retried on the next pass.
  159. if [[ -e "${dir}/.dir-to-ia.batch-files" ]]; then
  160. if [[ ( "${rm}" == 'yes' && "${rmwait}" == 'yes' ) || "${sha1check}" == 'yes' ]]; then
  161. tsprintf 'Checking whether IA is done with upload processing...' >&2
  162. summary="$("${scriptpath}/ia-tasks" --summary --jsonl "${identifier}")"
  163. st="$?"
  164. if ((st)); then
  165. tsprintf "Warning: task processing wait: ia-tasks exited with status ${st}" >&2
  166. exit 0
  167. fi
  168. tasksAndErrors="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o.values()), o["error"])' <<<"${summary}")"
  169. st="$?"
  170. if ((st)); then
  171. tsprintf "Warning: task processing wait: python3 for ia-tasks output %q exited with status %d" "${summary}" "${st}" >&2
  172. exit 0
  173. fi
  174. if [[ "${tasksAndErrors}" != '0 0' ]]; then
  175. tsprintf "Task processing wait: IA tasks not finished (%d tasks remaining, %d errors)" "${tasksAndErrors% *}" "${tasksAndErrors#* }" >&2
  176. exit 0
  177. fi
  178. tsprintf 'IA upload processing finished' >&2
  179. fi
  180. readarray -d $'\0' -t uploaded <"${dir}/.dir-to-ia.batch-files"
  181. dbgprintf "Local files:$(printf " %q" "${uploaded[@]}")"
  182. if [[ "${sha1check}" == 'yes' ]]; then
  183. tsprintf 'Checking SHA-1 hashes...' >&2
  184. if [[ ! -e "${dir}/.dir-to-ia.batch-sha1sums" ]]; then
  185. tsprintf 'Error: sha1check enabled but no sha1sums file for the previous batch of uploads (did you enable sha1check while a batch was pending?)' >&2
  186. exit 1
  187. fi
  188. readarray -t localhashes <"${dir}/.dir-to-ia.batch-sha1sums"
  189. dbgprintf "Local hashes:$(printf " %q" "${localhashes[@]}")"
  190. if ((${#localhashes[@]} != ${#uploaded[@]})); then
  191. tsprintf 'Error: %q and %q do not have the same number of entries (%d vs %d)' "${dir}/.dir-to-ia.batch-files" "${dir}/.dir-to-ia.batch-sha1sums" "${#uploaded[@]}" "${#localhashes[@]}" >&2
  192. exit 1
  193. fi
  194. dbgprintf 'Retrieving SHA-1 hashes from IA...'
  195. # Only two error scenarios: retrieval fails (curl exit != 0), or the file list doesn't contain all files in the array (Python exit 33)
  196. # To not have to handle weird characters in filenames, don't try to produce a file with the expected format for `sha1sum -c`.
  197. shopt -s lastpipe
  198. curl --silent --location --max-time 10 --fail --user-agent 'little-things +https://gitea.arpa.li/JustAnotherArchivist/little-things' "https://archive.org/metadata/${identifier}" \
  199. | python3 -c 'import json, sys; o = json.load(sys.stdin); files = set(sys.argv[1:]); {(print(f["sha1"], end = "\0"), print(f["name"], end = "\0"), files.remove(f["name"])) for f in o.get("files", []) if f["name"] in files and "sha1" in f}; sys.exit(33) if files else ...' "${uploaded[@]}" \
  200. | readarray -d $'\0' -t iahashes
  201. sts=("${PIPESTATUS[@]}")
  202. dbgprintf "IA hashes:$(printf " %q" "${iahashes[@]}")"
  203. if ((sts[0])); then
  204. tsprintf "IA hashes curl exited with status ${sts[0]}" >&2
  205. exit 0
  206. fi
  207. if ((sts[1] == 33)); then
  208. tsprintf 'IA hashes metadata does not contain all files uploaded in this batch' >&2
  209. exit 1
  210. elif ((sts[1])); then
  211. tsprintf "IA hashes python3 exited with status ${sts[1]}" >&2
  212. exit 0
  213. fi
  214. if ((${#iahashes[@]} % 2 != 0)); then
  215. tsprintf 'Got an odd number of hashes from IA' >&2
  216. exit 1
  217. fi
  218. if ((${#iahashes[@]} != 2 * ${#localhashes[@]})); then
  219. tsprintf "Number of hashes from IA (%d) doesn't match number of local hashes (%d)" "$((${#iahashes[@]} / 2))" "${#localhashes[@]}" >&2
  220. exit 1
  221. fi
  222. # Local filenames in uploaded, corresponding local hashes + space space dash in localhashes, and iahashes is (hash filename [hash filename...]).
  223. dbgprintf 'Comparing...'
  224. hashfail=
  225. for ((i = 0; i < ${#uploaded[@]}; ++i)); do
  226. fn="${uploaded[i]}"
  227. dbgprintf 'Checking hash for %q' "${dir}/${fn}"
  228. localhash="${localhashes[i]% -}"
  229. dbgprintf ' local: %s' "${localhash}"
  230. iahash=
  231. for ((j = 0; j < "${#iahashes[@]}"; j += 2)); do
  232. if [[ "${iahashes[j + 1]}" == "${fn}" ]]; then
  233. iahash="${iahashes[j]}"
  234. break
  235. fi
  236. done
  237. if [[ -z "${iahash}" ]]; then
  238. # This should be impossible due to the checks above.
  239. tsprintf 'Error: could not find IA hash for %q' "${fn}" >&2
  240. exit 1
  241. fi
  242. dbgprintf ' IA: %s' "${iahash}"
  243. if [[ "${iahash}" != "${localhash}" ]]; then
  244. tsprintf 'SHA-1 mismatch for %q: local %s vs IA %s' "${dir}/${fn}" "${localhash}" "${iahash}" >&2
  245. hashfail=yes
  246. else
  247. tsprintf 'OK: %q' "${dir}/${fn}" >&2
  248. fi
  249. done
  250. if [[ "${hashfail}" ]]; then
  251. exit 1
  252. fi
  253. fi
  254. if [[ "${rm}" == 'yes' ]]; then
  255. tsprintf 'Removing:' >&2
  256. for fn in "${uploaded[@]}"; do
  257. tsprintf ' %q' "${dir}/${fn}" >&2
  258. rm "${dir}/${fn}" || { tsprintf 'rm for %q exited with status %d' "${dir}/${fn}" "$?" >&2; exit 1; }
  259. done
  260. fi
  261. rm "${dir}/.dir-to-ia.batch-files" || { tsprintf "rm for %q batch files or sha1sums exited with status %d" "${dir}" "$?" >&2; exit 1; }
  262. if [[ -e "${dir}/.dir-to-ia.batch-sha1sums" ]]; then
  263. rm "${dir}/.dir-to-ia.batch-sha1sums" || { tsprintf "rm for %q batch files or sha1sums exited with status %d" "${dir}" "$?" >&2; exit 1; }
  264. fi
  265. fi
  266. # Loop over local files and upload them
  267. # The 'quota' controls how many files get uploaded at once.
  268. # It gets decremented for each uploaded file, and when it reaches zero, the item's task queue is checked to ensure it isn't building up.
  269. gotFileList=
  270. nQuota=50
  271. while IFS= read -r -d $'\0' fn; do
  272. dbgprintf 'Considering file %q' "${fn}"
  273. if [[ "${fn}" == .dir-to-ia.* ]]; then
  274. dbgprintf '%q is a dir-to-ia file, skipping' "${fn}"
  275. continue
  276. fi
  277. if [[ ! -f "${dir}/${fn}" ]]; then
  278. # Should never happen since the `find` command already uses `-type f`
  279. dbgprintf '%q is not a regular file, skipping' "${fn}"
  280. continue
  281. fi
  282. if [[ "${fn}" == "${identifier}"* ]]; then
  283. dbgprintf '%q starts with the identifier, skipping' "${fn}"
  284. continue
  285. fi
  286. if [[ "${rm}" == 'no' ]]; then
  287. if [[ ! "${gotFileList}" ]]; then
  288. # This can only proceed if there are no pending tasks; if there are, break out of the loop and revisit later.
  289. dbgprintf 'Checking whether IA has pending tasks on the item as the item file list/metadata (needed for rm=no) is unreliable otherwise...'
  290. summary="$("${scriptpath}/ia-tasks" --summary --jsonl"${identifier}")"
  291. st="$?"
  292. if ((st)); then
  293. tsprintf "Warning: metadata reliability check: ia-tasks exited with status ${st}" >&2
  294. break
  295. fi
  296. tasksAndErrors="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o.values()), o["error"])' <<<"${summary}")"
  297. st="$?"
  298. if ((st)); then
  299. tsprintf "Warning: metadata reliability check: python3 for ia-tasks output %q exited with status %d" "${summary}" "${st}" >&2
  300. break
  301. fi
  302. if [[ "${tasksAndErrors}" != '0 0' ]]; then
  303. dbgprintf "Metadata reliability check: IA tasks not finished (%d tasks remaining, %d errors)" "${tasksAndErrors% *}" "${tasksAndErrors#* }" >&2
  304. break
  305. fi
  306. dbgprintf 'No pending tasks, metadata should be reliable' >&2
  307. dbgprintf 'Retrieving existing files on IA...'
  308. # Scenarios: retrieval fails (curl exit != 0), item doesn't exist (= empty JSON, Python exit 31), no file list (e.g. darked, Python exit 32)
  309. shopt -s lastpipe
  310. curl --silent --location --max-time 10 --fail --user-agent 'little-things +https://gitea.arpa.li/JustAnotherArchivist/little-things' "https://archive.org/metadata/${identifier}" \
  311. | python3 -c 'import json, sys; o = json.load(sys.stdin); sys.exit(31) if o == {} else ...; sys.exit(32) if "files" not in o else ...; {print(f["name"], end = "\0") for f in o["files"] if not f["name"].startswith(sys.argv[1])}' "${identifier}" \
  312. | readarray -d $'\0' -t iafiles
  313. sts=("${PIPESTATUS[@]}")
  314. dbgprintf "Existing files on IA:$(printf " %q" "${iafiles[@]}")"
  315. if ((sts[0])); then
  316. tsprintf "IA metadata curl exited with status ${sts[0]}" >&2
  317. fi
  318. if ((sts[1] == 31)); then
  319. # Item doesn't exist, but that's fine. Reset the status so it doesn't trigger the exit below.
  320. sts[1]=0
  321. elif ((sts[1] == 32)); then
  322. tsprintf "IA returned no file list in metadata" >&2
  323. elif ((sts[1])); then
  324. tsprintf "IA metadata python3 exited with status ${sts[1]}" >&2
  325. fi
  326. if ((sts[0] || sts[1])); then
  327. break
  328. fi
  329. gotFileList=yes
  330. fi
  331. found=
  332. for remoteFn in "${iafiles[@]}"; do
  333. if [[ "${fn}" == "${remoteFn}" ]]; then
  334. found=yes
  335. break
  336. fi
  337. done
  338. if [[ "${found}" ]]; then
  339. dbgprintf '%q found in IA item, skipping' "${fn}"
  340. continue
  341. fi
  342. fi
  343. tsprintf 'Uploading %q to %q...' "${fn}" "${identifier}" >&2
  344. cmd=("${scriptpath}/ia-upload-stream")
  345. if [[ "${clobber}" == 'yes' ]]; then cmd+=('--clobber'); fi
  346. if [[ "${derive}" == 'no' ]]; then cmd+=('--no-derive'); fi
  347. if [[ "${autosizehint}" == 'yes' ]]; then
  348. read -r size _ < <(du -s -k "${dir}")
  349. cmd+=('--size-hint' "${size}K")
  350. fi
  351. cmd+=(--input-file "${dir}/${fn}")
  352. cmd+=("${iauploadstreamopts[@]}")
  353. cmd+=("${identifier}")
  354. cmd+=("${fn}")
  355. cmd+=("${metadata[@]}")
  356. dbgprintf "Upload command:$(printf " %q" "${cmd[@]}")"
  357. "${cmd[@]}" || { tsprintf "ia-upload-stream exited with status $?" >&2; exit 1; }
  358. if [[ "${sha256}" == 'yes' ]]; then
  359. tsprintf 'Calculating SHA-256...' >&2
  360. (cd "${dir}" && sha256sum "${fn}" > >(tee -a '.dir-to-ia.sha256sum')) || { tsprintf 'sha256sum for %q exited with status %d' "${fn}" "$?" >&2; exit 1; }
  361. fi
  362. if [[ "${sha1check}" == 'yes' ]]; then
  363. tsprintf 'Calculating SHA-1...' >&2
  364. sha1sum <"${dir}/${fn}" > >(tee -a "${dir}/.dir-to-ia.batch-sha1sums") || { tsprintf 'sha1sum for %q exited with status %d' "${fn}" "$?" >&2; exit 1; }
  365. fi
  366. if [[ "${rm}" == 'yes' || "${sha1check}" == 'yes' ]]; then
  367. printf '%s\0' "${fn}" >>"${dir}/.dir-to-ia.batch-files"
  368. fi
  369. ((--nQuota))
  370. if ((nQuota == 0)); then
  371. dbgprintf "Checking IA's task queue to determine how many more files we can upload in this iteration..."
  372. summary="$("${scriptpath}/ia-tasks" --summary --jsonl "${identifier}")"
  373. st="$?"
  374. if ((st)); then
  375. tsprintf "Warning: task queue check: ia-tasks exited with status ${st}" >&2
  376. exit 0
  377. fi
  378. tasksAndErrors="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o.values()), o["error"])' <<<"${summary}")"
  379. st="$?"
  380. if ((st)); then
  381. tsprintf "Warning: task queue check: python3 for ia-tasks output %q exited with status %d" "${summary}" "${st}" >&2
  382. exit 0
  383. fi
  384. tasks="${tasksAndErrors% *}"
  385. errors="${tasksAndErrors#* }"
  386. ((nQuota = 50 - tasks))
  387. if ((nQuota < 0 || errors > 0)); then
  388. nQuota=0
  389. fi
  390. dbgprintf "IA item has %d tasks remaining and %d errors, new quota: %d" "${tasks}" "${errors}" "${nQuota}"
  391. if ((nQuota == 0)); then
  392. break
  393. fi
  394. fi
  395. done < <(cd "${dir}" && find . -type f -print0 | sed -z 's,^\./,,')
  396. # Waiting for IA and/or deletions are handled in the next iteration
  397. )
  398. st="$?"
  399. if ((st)); then
  400. tsprintf 'Subshell exited with status %d, marking as dirty' "${st}" >&2
  401. printf '%s\n' \
  402. 'An error occurred on the last upload, so this file marks this directory as dirty to not cause worse problems.' \
  403. 'Delete this file once the underlying issue is resolved to make dir-to-ia process this directory again.' \
  404. >"${dir}/.dir-to-ia.dirty"
  405. fi
  406. dbgprintf 'Done with %q' "${dir}"
  407. } 3>"${dir}/.dir-to-ia.lock" &> >(stdbuf -oL tr '\r' '\n' | maybets | tee -a "${dir}/.dir-to-ia.log" >&2)
  408. done
  409. sleep 60
  410. done