You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

313 lines
12 KiB

  #!/bin/bash
  # IRC bot front end for codearchiver: reads commands from an http2irc GET stream,
  # runs codearchiver jobs and uploads the resulting files with ia-upload-stream.
  #
  # Required environment: HTTP2IRC_GET_URL, HTTP2IRC_POST_URL, IA_S3_ACCESS, IA_S3_SECRET
  # Optional environment: CODEARCHIVER_BOT_TIMEOUT (passed to `timeout`, default 0),
  #                       CODEARCHIVER_BOT_NPROC (parallel job limit, default 1),
  #                       CODEARCHIVER_BOT_TEST (non-empty: upload to the test collection)
  export TZ=UTC

  # Refuse to start unless every required environment variable is set (an empty value counts as set).
  envvars=(HTTP2IRC_GET_URL HTTP2IRC_POST_URL IA_S3_ACCESS IA_S3_SECRET)
  for envvar in "${envvars[@]}"; do
    if [[ ! -v "${envvar}" ]]; then
      { printf 'Error: one or more of the required environment variables (%s' "${envvars[0]}"; printf ', %s' "${envvars[@]:1}"; printf ') missing\n'; } >&2
      exit 1
    fi
  done

  # Optional env variables
  declare -i timeout="${CODEARCHIVER_BOT_TIMEOUT:-0}"
  declare -i nproclimit="${CODEARCHIVER_BOT_NPROC:-1}"
  declare -i nproc=0

  # Fail fast if any external tool used further down is missing.
  # grep, sed, tail, timeout and xxd are invoked by the logging helper and the work loop as well,
  # so they are checked here instead of failing later in the middle of a job.
  for dep in awk codearchiver curl grep ia-upload-stream python3 sed sha256sum tail tee timeout xxd zstd; do
    if ! command -v "${dep}" &>/dev/null; then
      printf 'Error: %s not found\n' "${dep}" >&2
      exit 1
    fi
  done
  # Write one timestamped log line to stderr.
  # Arguments: $1 - message text
  # Outputs:   '<timestamp> <message>' on stderr, nothing on stdout
  # The timestamp is ${EPOCHREALTIME}, which only exists in Bash 5.0+. On older Bash the
  # variable is empty, so fall back to date's seconds.nanoseconds (GNU date '%N') to keep
  # log lines timestamped. The fallback is only evaluated when the variable is empty.
  function log {
    printf '%s %s\n' "${EPOCHREALTIME:-$(date '+%s.%N')}" "$1" >&2
  }
  # Relay stdin to the log, one log entry per input line.
  # Arguments: $1 - prefix prepended to every logged line
  # Input:     arbitrary text on stdin (typically another command's stdout or stderr)
  # Outputs:   one `log` call per line; nothing is written to stdout by this function itself
  function log_loop {
    # Working variables are local so that a call in the current shell (e.g. with a
    # redirection instead of a pipe) cannot clobber the caller's variables.
    local prefix="$1"
    local lastchar line
    # If there is output and it does not end with a LF, add one. Then replace CRLF with LF and replace remaining CR with LF.
    # tee copies stdin to fd 42 (this group's stdout, i.e. the pipe into sed) and to tail;
    # the last byte is hex-encoded because command substitution would strip a trailing LF.
    {
      lastchar="$(tee /dev/fd/42 | tail -c 1 | xxd -p)"
      if [[ "${lastchar}" && "${lastchar}" != '0a' ]]; then
        printf '\n'
      fi
    } 42>&1 |
      sed -u 's,\r$,,; s,\r,\n,g' |
      while IFS= read -r line; do
        log "${prefix}${line}"
      done
  }
  # Post a message to IRC through the http2irc POST endpoint.
  # Globals:   HTTP2IRC_POST_URL (read)
  # Arguments: $1 - message text, sent as the request body
  # Outputs:   the message, curl's verbose output and the HTTP response all go to the log
  function send {
    local text="$1"
    local -a curl_args=(--silent --verbose --max-time 10 --data "${text}" "${HTTP2IRC_POST_URL}")
    log "Sending message: ${text}"
    curl "${curl_args[@]}" 2> >(log_loop 'curl http2irc POST: ') |
      log_loop 'http2irc POST response: '
  }
  # Send a message addressed to a specific IRC user.
  # Arguments: $1 - nick to address
  #            $2 - message text
  # The message is delivered via `send` as '<nick>: <message>'.
  function respond {
    local recipient="$1" text="$2"
    send "${recipient}: ${text}"
  }
  # Block for as long as the '.tainted' marker exists in the current directory.
  # Arguments: $1 - description of the action being held back; used in the log message
  # Returns immediately if the marker does not exist; otherwise logs once and polls every second.
  function taint_block {
    # Local, so the caller's own ${message} variable is not overwritten.
    local message="Storage is tainted, not ${1}"
    if [[ -e '.tainted' ]]; then
      log "${message}"
      while [[ -e '.tainted' ]]; do
        sleep 1
      done
    fi
  }
  # Main pipeline. Stages, connected by pipes:
  #   1. endless http2irc GET stream (restarted whenever curl exits)
  #   2. tee: log every raw input line
  #   3. Python filter: JSONL -> lines of 'modes SP nick SP message' for PRIVMSGs
  #   4. command loop: validates bot commands, emits 'jobid SP nick SP URL'
  #   5. work loop: runs codearchiver in background subshells, emits filenames to upload
  #   6. upload loop: uploads batches of files and replaces them with symlinks afterwards

  # Python source for stage 3, one element per line so the indentation is explicit.
  # Note: the "Could not parse" diagnostic is printed to stdout, i.e. into the command loop,
  # which skips it because it is neither '!help' nor starts with '!a '.
  http2irc_parser_lines=(
    'import json, sys'
    'def json_parse_or_none(s):'
    '    try:'
    '        return json.loads(s)'
    '    except json.JSONDecodeError as e:'
    '        print(f"Could not parse {s[:100]}…: {type(e).__name__}: {e!s}")'
    'for o in map(json_parse_or_none, sys.stdin):'
    '    if o and o.get("command") == "PRIVMSG":'
    '        print(o["user"]["modes"] or "_", o["user"]["nick"], o["message"])'
  )
  printf -v http2irc_parser '%s\n' "${http2irc_parser_lines[@]}"

  { # Group the pipeline without requiring a backslash every time
    while :; do
      # Read from http2irc
      log 'Starting http2irc GET stream...'
      curl --silent --verbose --no-buffer "${HTTP2IRC_GET_URL}" 2> >(log_loop 'curl http2irc GET: ')
      printf '\n' # Ensure that there's a trailing LF for `read`
    done |

    # Log all raw input
    tee >(log_loop 'Received http2irc line: ') |

    # Transform the JSONL data into a more suitable format for the following: lines of 'modes SP nick SP message'
    python3 -u -c "${http2irc_parser}" |

    # For valid bot commands with adequate permissions, assign a job ID and respond. Suppress everything else. Print lines of 'jobid SP nick SP URL' for the processing below.
    while read -r modes nick message; do
      if [[ "${message}" == '!help' ]]; then
        respond "${nick}" '`!a URL`: archives a single repository'
        respond "${nick}" '`!a < URL`: archives a list of repositories (no success/failure report, no warnings/errors report, check logs manually!)'
        continue
      fi
      if [[ "${message}" != '!a '* ]]; then
        continue
      fi
      if [[ "${modes}" != *[@+]* ]]; then
        respond "${nick}" 'Only voiced or opped users may use this command.'
        continue
      fi
      if [[ "${message}" =~ ^'!a '([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
        # Individual job
        jobs=("${message:3}")
        src="${message:3}"
      elif [[ "${message}" =~ ^'!a < 'https://transfer\.archivete\.am/[a-zA-Z0-9]+/.+$ ]]; then
        # List job
        jobs=()
        url="${message:5}"
        bad=
        log "Retrieving list job list: ${url}"
        while read -r line; do
          if [[ "${line}" =~ ^([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
            jobs+=("${line}")
          elif [[ "${line}" == '' ]]; then
            # Ignore empty lines
            continue
          else
            respond "${nick}" "Malformed line in ${url}: ${line}"
            bad=1
            break
          fi
        done < <({ curl --silent --verbose --fail --max-time 10 "${message:5}" 2> >(log_loop 'curl list job: '); printf '\n'; } | tee >(log_loop 'List input line: '))
        if [[ "${bad}" ]]; then
          continue
        fi
        # A failed download (curl --fail prints nothing) or an empty list yields no jobs;
        # say so instead of announcing a job that will never run.
        if [[ ${#jobs[@]} -eq 0 ]]; then
          respond "${nick}" "No jobs found in ${url}"
          continue
        fi
        src="${url}"
      else
        respond "${nick}" "I don't understand your command. Please forgive me."
        continue
      fi

      read -r jobid </proc/sys/kernel/random/uuid
      respond "${nick}" "Queueing job ${jobid} for ${src}"
      # Jobs from a multi-URL list get '_<index>' appended to the shared job ID.
      appendcounter=; if [[ ${#jobs[@]} -gt 1 ]]; then appendcounter=yes; fi
      for ((i=0; i<${#jobs[@]}; ++i)); do
        job="${jobs[${i}]}"
        singlejobid="${jobid}"; if [[ "${appendcounter}" ]]; then singlejobid+="_${i}"; fi
        printf '%s %s %s\n' "${singlejobid}" "${nick}" "${job}"
      done
      if [[ "${appendcounter}" ]]; then printf '%s %s end\n' "${jobid}" "${nick}"; fi # Special value for sending a message when all list URLs have been processed
    done |

    # The actual work loop
    while IFS= read -r line; do
      # Split 'jobid SP nick SP URL'
      singlejobid="${line%% *}"
      line="${line#* }"
      nick="${line%% *}"
      url="${line#* }"

      # Handle marker for end of list job: tell the user it's done and move on.
      if [[ "${url}" == 'end' ]]; then
        # No status code reflection here because the start of the list job might not even have been in this batch.
        respond "${nick}" "Job ${singlejobid} finished."
        continue
      fi

      # Block until there's a free slot
      while [[ "${nproc}" -ge "${nproclimit}" ]]; do
        # Wait for one subshell to exit
        wait -n
        nproc+=-1
      done

      taint_block 'continuing with work loop'

      # Find nonexistent filename for log file with lock
      # mkdir is pretty much always atomic, creating files might not be depending on the underlying file system (e.g. networked ones like NFS).
      while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (work) err: '); do
        sleep 1
      done
      trap 'rmdir ".loglock"' EXIT # Unlock if something below fails
      logbasename="$(printf '%(%Y%m%dT%H%M%SZ)T')_${singlejobid}"
      if [[ -e "${logbasename}_codearchiver.log" || -e "${logbasename}_codearchiver.log.zst" ]]; then
        for ((j=0; ; ++j)); do
          # A candidate is free only if neither the plain nor the compressed log exists.
          if [[ ! -e "${logbasename}_coll${j}_codearchiver.log" && ! -e "${logbasename}_coll${j}_codearchiver.log.zst" ]]; then
            break
          fi
        done
        logbasename="${logbasename}_coll${j}"
      fi
      logname="${logbasename}_codearchiver.log"
      artefactsname="${logbasename}_codearchiver_artefacts.txt"
      # Create the log file already in case spawning the tee process for it below is too slow
      touch "${logname}"
      trap - EXIT # Reset trap
      rmdir '.loglock' # Unlock

      # Run codearchiver in a background shell, duplicating WARNINGs and higher in the bot output
      # Produces lines of filenames to upload on stdout
      log "Running ${url} (${singlejobid}), logging into ${logname}"
      (
        taint_block "launching job ${singlejobid}"
        timeout --signal=INT "${timeout}" \
          codearchiver --verbose --write-artefacts-fd-3 "${url}" \
          3> >(tee "${artefactsname}" | log_loop "Artefact from codearchiver ${singlejobid}: ") \
          > >(log_loop "codearchiver ${singlejobid} out: ") \
          2> >(tee "${logname}" | grep -Fv -e ' INFO ' | log_loop "codearchiver ${singlejobid} err: ")
        status="$?"
        log "codearchiver ${url} finished with status code ${status}"

        #TODO Integrate this into the pipe from codearchiver above to avoid rereading the entire log file
        declare -i badcount=$(awk '! ($3 ~ /^INFO$/) { cnt += 1; } END { printf "%d\n", cnt; }' "${logname}")

        # Compress log file with zstd -19
        log "Compressing log file ${logname}"
        zstd -19 --rm "${logname}" 2> >(log_loop 'zstd err: ')
        if [[ -e "${logname}.zst" && ! -e "${logname}" ]]; then
          # Compression successful
          logname="${logname}.zst"
        fi

        # Verify that there are no artefacts if codearchiver exited non-zero
        # Since codearchiver handles errors internally normally, this should not usually happen, but it could occur e.g. if running out of disk space and leaving partial files in the storage.
        # With parallelism, this could in theory lead to artefacts of a successful run depending on artefacts from a failed run, which we wouldn't want.
        # So, if there are artefacts of a failed process, touch the .tainted file to stop the uploader and new processes starting and send a warning to IRC.
        # Emit the log filename for upload always (even on tainted storage), artefacts list and artefacts only on zero exit.
        readarray -t artefacts <"${artefactsname}"
        if [[ "${status}" -ne 0 && "${#artefacts[@]}" -ne 0 ]]; then
          touch '.tainted'
          send "Job ${singlejobid} exited non-zero but left artefacts behind!"
          msg="$(printf 'Artefact files by non-zero exit process: '; printf ' %q' "${artefacts[@]}")"
          log "${msg}"
        elif [[ "${status}" -eq 0 ]]; then
          for file in "${artefacts[@]}"; do
            printf '%s\n' "${file}"
          done
          printf '%s\n' "${artefactsname}"
        fi
        printf '%s\n' "${logname}"

        # For individual jobs, tell the user about warnings and success/failure
        if [[ "${singlejobid}" != *_* ]]; then
          if [[ "${status}" -eq 0 ]]; then
            respond "${nick}" "Job ${singlejobid} succeeded."
          else
            respond "${nick}" "Job ${singlejobid} failed."
          fi
          if [[ ${badcount} -gt 0 ]]; then
            respond "${nick}" "Job ${singlejobid} produced ${badcount} warnings or errors."
          fi
        fi
      ) &
      nproc+=1
    done |

    # Upload
    while :; do
      # Process in batches for efficiency of parallel IA upload processing
      declare -a filenames=()
      while read -r -t 1 filename; do
        filenames+=("${filename}")
      done
      if [[ ${#filenames[@]} -eq 0 ]]; then
        continue
      fi

      log 'Starting upload batch'

      # Record SHA-256 hashes for new files
      sha256sum "${filenames[@]}" > >(log_loop 'sha256sum: ')

      taint_block 'uploading anything'

      # Upload
      date="$(printf '%(%Y-%m-%d)T')"
      identifier="codearchiver_${date//-/}"
      if [[ -z "${CODEARCHIVER_BOT_TEST}" ]]; then
        collection='archiveteam_codearchiver'
      else
        identifier="test_${identifier}"
        collection='test_collection'
      fi
      uploadsfine=y
      for f in "${filenames[@]}"; do
        taint_block "starting upload for $(printf '%q' "${f}")"
        log "Uploading $(printf '%q' "${f}") to ${identifier}"
        ia-upload-stream --no-derive "${identifier}" "${f}" \
          "collection:${collection}" \
          'mediatype:software' \
          "date:${date}" \
          'noarchivetorrent:true' \
          <"${f}" 2> >(log_loop 'ia-upload-stream: ')
        status="$?"
        if [[ "${status}" -ne 0 ]]; then
          log "Upload failed: exit status ${status}"
          # Only the first failure of a batch is reported to IRC.
          if [[ "${uploadsfine}" ]]; then
            send "Upload failed: exit status ${status}"
          fi
          uploadsfine=
        fi
      done

      if [[ -z "${uploadsfine}" ]]; then
        log 'At least one upload in the batch failed, not removing anything'
        continue
      fi

      # Wait until all tasks for the item are done
      while :; do
        tasks="$(python3 -c 'import json, sys; o = json.load(sys.stdin); totalTasks = sum(o["value"]["summary"].values()); virusChecks = sum(1 for task in o["value"]["catalog"] if task.get("cmd") == "book_op.php" and task.get("args", {}).get("op10") == "VirusCheck"); print(f"Expected exactly 0 or 1 VirusCheck tasks, got {virusChecks}", file = sys.stderr) if virusChecks not in (0, 1) else None; print("Ignoring VirusCheck book_op task", file = sys.stderr) if virusChecks > 0 else None; print(totalTasks - min(virusChecks, 1))' < <({ curl --silent --verbose --fail --max-time 10 --header "Authorization: LOW ${IA_S3_ACCESS}:${IA_S3_SECRET}" "https://archive.org/services/tasks.php?identifier=${identifier}&catalog=1&history=0" 2> >(log_loop 'curl IA tasks err: '); } | tee >(log_loop 'curl IA tasks out: ')))"
        if [[ "${tasks}" == '0' ]]; then
          break
        fi
        sleep 60
      done

      taint_block 'removing any files after upload'

      # Replace non-metadata files with a symlink to .uploaded dummy file
      # No locking with codearchiver processes is necessary because those will only read metadata (which is left alone) or write files.
      # However, a lock with the log filename finding is required.
      while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (upload) err: '); do
        sleep 1.1 # Slightly longer than above to minimise repeated collisions
      done
      trap 'rmdir ".loglock"' EXIT
      touch '.uploaded'
      for f in "${filenames[@]}"; do
        if [[ "${f}" != *_codearchiver_metadata.txt ]]; then
          log "Replacing $(printf '%q' "${f}") with symlink to .uploaded"
          { rm --verbose -- "${f}" && ln --symbolic --verbose '.uploaded' "${f}"; } |& log_loop 'rm/ln: '
        fi
      done
      trap - EXIT
      rmdir '.loglock'
    done
  }