You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

264 lines
9.8 KiB

  1. #!/bin/bash
  # All timestamps produced by `date` below are meant to be UTC.
  export TZ=UTC

  # Refuse to start unless every required environment variable is set.
  envvars=(HTTP2IRC_GET_URL HTTP2IRC_POST_URL IA_S3_ACCESS IA_S3_SECRET)
  for envvar in "${envvars[@]}"; do
    [[ -v "${envvar}" ]] && continue
    {
      printf 'Error: one or more of the required environment variables (%s' "${envvars[0]}"
      printf ', %s' "${envvars[@]:1}"
      printf ') missing\n'
    } >&2
    exit 1
  done

  # Refuse to start unless every external tool is available on PATH.
  for dep in awk codearchiver curl ia-upload-stream python3 sha256sum tee zstd; do
    command -v "${dep}" &>/dev/null && continue
    printf 'Error: %s not found\n' "${dep}" >&2
    exit 1
  done
  # Write one log entry to stderr: the current ${EPOCHREALTIME} value, a space, then the message.
  # Arguments: $1 - message text
  log() {
    local text="$1"
    printf '%s %s\n' "${EPOCHREALTIME}" "${text}" 1>&2
  }
  # Pass every line read from stdin to `log`, prefixed with $1.
  # Arguments: $1 - prefix prepended to each logged line
  # Input:     arbitrary text on stdin; CRLF is treated as LF and any other CR as a line break.
  # Notes:     empty input produces no log entries.
  function log_loop {
    local prefix="$1"
    local lastbyte line
    {
      # tee forwards the stream to fd 3 (the pipe into sed) while tail/od look only at the final byte at EOF.
      lastbyte="$(tee /dev/fd/3 | tail -c 1 | od -An -tx1)"
      lastbyte="${lastbyte//[[:space:]]/}"
      # Terminate an unfinished last line so that `read` below does not drop it.
      # An empty stream has no last byte and must not gain a bogus empty line.
      if [[ -n "${lastbyte}" && "${lastbyte}" != '0a' ]]; then
        printf '\n'
      fi
    } 3>&1 |
      # Replace CRLF with LF and replace remaining CR with LF.
      sed -u 's,\r$,,; s,\r,\n,g' |
      while IFS= read -r line; do
        log "${prefix}${line}"
      done
  }
  # Post a message to the http2irc endpoint at ${HTTP2IRC_POST_URL}.
  # Arguments: $1 - message text, sent verbatim as the request body
  # Outputs:   curl's verbose output and the HTTP response body are written to the log via log_loop.
  function send {
    local message="$1"
    log "Sending message: ${message}"
    # --data-raw rather than --data: the message can contain text supplied by IRC users,
    # and --data would interpret a leading '@' as the name of a file to upload.
    curl --silent --verbose --max-time 10 --data-raw "${message}" "${HTTP2IRC_POST_URL}" \
      2> >(log_loop 'curl http2irc POST: ') |
      log_loop 'http2irc POST response: '
  }
  # Address a message to a specific user by sending 'nick: message' through `send`.
  # Arguments: $1 - nick of the user being addressed
  #            $2 - message text
  respond() {
    local target="$1" text="$2"
    send "${target}: ${text}"
  }
  36. { # Group the pipeline without requiring a backslash every time
  37. while :; do
  38. # Read from http2irc
  39. log 'Starting http2irc GET stream...'
  40. curl --silent --verbose --no-buffer "${HTTP2IRC_GET_URL}" 2> >(log_loop 'curl http2irc GET: ')
  41. printf '\n' # Ensure that there's a trailing LF for `read`
  42. done |
  43. # Log all raw input
  44. tee >(log_loop 'Received http2irc line: ') |
  45. # Transform the JSONL data into a more suitable format for the following: lines of 'modes SP nick SP message'
  46. python3 -u -c 'import json, sys'$'\n''def json_parse_or_none(s):'$'\n'' try: return json.loads(s)'$'\n'' except json.JSONDecodeError as e:'$'\n'' print(f"Could not parse {s[:100]}…: {type(e).__name__}: {e!s}")'$'\n''{print(o["user"]["modes"] or "_", o["user"]["nick"], o["message"]) for o in map(json_parse_or_none, sys.stdin) if o and o.get("command") == "PRIVMSG"}' |
  47. # For valid bot commands with adequate permissions, assign a job ID and respond. Suppress everything else. Print lines of 'jobid SP nick SP URL' for the processing below.
  48. while read -r modes nick message; do
  49. if [[ "${message}" == '!help' ]]; then
  50. respond "${nick}" '`!a URL`: archives a single repository'
  51. respond "${nick}" '`!a < URL`: archives a list of repositories (no success/failure report, no warnings/errors report, check logs manually!)'
  52. continue
  53. fi
  54. if [[ "${message}" != '!a '* ]]; then
  55. continue
  56. fi
  57. if [[ "${modes}" != *[@+]* ]]; then
  58. respond "${nick}" 'Only voiced or opped users may use this command.'
  59. continue
  60. fi
  61. if [[ "${message}" =~ ^'!a '([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
  62. # Individual job
  63. jobs=("${message:3}")
  64. src="${message:3}"
  65. elif [[ "${message}" =~ ^'!a < 'https://transfer\.archivete\.am/[a-zA-Z0-9]+/.+$ ]]; then
  66. # List job
  67. jobs=()
  68. url="${message:5}"
  69. bad=
  70. log "Retrieving list job list: ${url}"
  71. while read -r line; do
  72. if [[ "${line}" =~ ^'!a '([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
  73. jobs+=("${line}")
  74. elif [[ "${line}" == '' ]]; then
  75. # Ignore empty lines
  76. continue
  77. else
  78. respond "${nick}" "Malformed line in ${url}: ${line}"
  79. bad=1
  80. break
  81. fi
  82. done < <({ curl --silent --verbose --fail --max-time 10 "${message:5}" 2> >(log_loop 'curl list job: '); printf '\n'; } | tee >(log_loop 'List input line: '))
  83. if [[ "${bad}" ]]; then
  84. continue
  85. fi
  86. src="${url}"
  87. else
  88. respond "${nick}" "I don't understand your command. Please forgive me."
  89. continue
  90. fi
  91. read -r jobid </proc/sys/kernel/random/uuid
  92. respond "${nick}" "Queueing job ${jobid} for ${src}"
  93. appendcounter=; if [[ ${#jobs[@]} -gt 1 ]]; then appendcounter=yes; fi
  94. for ((i=0; i<${#jobs[@]}; ++i)); do
  95. job="${jobs[${i}]}"
  96. singlejobid="${jobid}"; if [[ "${appendcounter}" ]]; then singlejobid+="_${i}"; fi
  97. printf '%s %s %s\n' "${singlejobid}" "${nick}" "${job}"
  98. done
  99. if [[ "${appendcounter}" ]]; then printf '%s %s end\n' "${jobid}" "${nick}"; fi # Special value for sending a message when all list URLs have been processed
  100. done |
  101. # The actual work loop
  102. while :; do
  103. # Process in batches for efficiency of parallel IA processing after uploads
  104. declare -a batch=()
  105. while read -r -t 1 line; do
  106. batch+=("${line}")
  107. done
  108. if [[ ${#batch[@]} -eq 0 ]]; then
  109. continue
  110. fi
  111. statuscodes=() # Exit status for each `batch` element's codearchiver process (-1 for 'end' markers)
  112. newfiles=()
  113. newfilejobindices=() # One entry for each element of `newfiles`, containing the corresponding index in `batch` to which the file belongs
  114. for ((i=0; i<${#batch[@]}; ++i)); do
  115. line="${batch[${i}]}"
  116. singlejobid="${line%% *}"
  117. line="${line#* }"
  118. nick="${line%% *}"
  119. url="${line#* }"
  120. # Handle marker for end of list job: tell the user it's done and move on.
  121. if [[ "${url}" == 'end' ]]; then
  122. # No status code reflection here because the start of the list job might not even have been in this batch.
  123. respond "${nick}" "Job ${jobid} finished."
  124. statuscodes+=(-1)
  125. continue
  126. fi
  127. # Marker for finding new files
  128. touch '.filemarker'
  129. # Find nonexistent filename for log file (*not* concurrency-safe!)
  130. logbasename="$(date +%Y%m%dT%H%M%SZ)_${singlejobid}"
  131. if [[ -e "${logbasename}_codearchiver.log" ]]; then
  132. for ((i=0; ; ++i)); do
  133. if [[ ! -e "${logbasename}_coll${i}_codearchiver.log" ]]; then
  134. break
  135. fi
  136. done
  137. logbasename="${logbasename}_coll${i}"
  138. fi
  139. logname="${logbasename}_codearchiver.log"
  140. # Run codearchiver, duplicating WARNINGs and higher in the bot output
  141. log "Running ${url}, logging into ${logname}"
  142. codearchiver -vv "${url}" 2> >(tee "${logname}" | grep -Fv -e ' INFO ' -e ' DEBUG ' | log_loop "From codearchiver ${singlejobid}: ")
  143. status="$?"
  144. log "codearchiver ${url} finished with status code ${status}"
  145. statuscodes+=("${status}")
  146. #TODO Integrate this into the pipe from codearchiver above to avoid rereading the entire log file
  147. declare -i badcount=$(awk '! ($3 ~ /^(DEBUG|INFO)$/) { cnt += 1; } END { printf "%d\n", cnt; }' "${logname}")
  148. # Compress log file with zstd -19
  149. log "Compressing log file ${logname}"
  150. zstd -19 --rm "${logname}" 2> >(log_loop 'zstd err: ')
  151. if [[ -e "${logname}.zst" && ! -e "${logname}" ]]; then
  152. # Compression successful
  153. logname="${logname}.zst"
  154. fi
  155. # Collect new files, write artefacts file, move everything but the log file to ./failed/ if codearchiver exited non-zero
  156. readarray -d $'\0' -t tmpnewfiles < <(find -maxdepth 1 -type f -newer '.filemarker' -print0 2> >(log_loop 'find new files err: ') | sed 's,^\./,,')
  157. readarray -d $'\0' -t artefacts < <(for file in "${tmpnewfiles[@]}"; do if [[ "${file}" != "${logname}" ]]; then printf '%s\0' "${file}"; fi; done)
  158. artefactsname="${logbasename}_codearchiver_artefacts.txt"
  159. for file in "${artefacts[@]}"; do
  160. printf '%s\n' "${file}"
  161. done >"${artefactsname}"
  162. if [[ "${status}" -ne 0 ]]; then
  163. msg="$(printf 'Moving artefact files'; printf ' %q' "${artefacts[@]}" "${artefactsname}"; printf ' from non-zero exit for job %s to ./failed/\n' "${singlejobid}";)"
  164. log "${msg}"
  165. mkdir --parents ./failed/
  166. mv --verbose -- "${artefacts[@]}" "${artefactsname}" ./failed/ 2> >(log_loop 'mv err: ') | log_loop 'mv out: '
  167. newfiles+=("${logname}")
  168. newfilejobindices+=("${i}")
  169. else
  170. for file in "${tmpnewfiles[@]}"; do
  171. log "New file from job ${singlejobid}: ${file}"
  172. newfiles+=("${file}")
  173. newfilejobindices+=("${i}")
  174. done
  175. newfiles+=("${artefactsname}")
  176. newfilejobindices+=("${i}")
  177. fi
  178. # For individual jobs, tell the user about warnings and success/failure
  179. if [[ "${singlejobid}" != *_* ]]; then
  180. if [[ "${status}" -eq 0 ]]; then
  181. respond "${nick}" "Job ${singlejobid} succeeded."
  182. else
  183. respond "${nick}" "Job ${singlejobid} failed."
  184. fi
  185. if [[ ${badcount} -gt 0 ]]; then
  186. respond "${nick}" "Job ${singlejobid} produced ${badcount} warnings or errors."
  187. fi
  188. fi
  189. done
  190. # Record SHA-256 hashes for new files
  191. log "SHA-256 hashes:"
  192. sha256sum "${newfiles[@]}" > >(log_loop 'sha256sum: ')
  193. # Upload
  194. date="$(date '+%Y-%m-%d')"
  195. identifier="codearchiver_${date//-/}"
  196. if [[ -z "${CODEARCHIVER_BOT_TEST}" ]]; then
  197. collection='archiveteam_codearchiver'
  198. else
  199. identifier="test_${identifier}"
  200. collection='test_collection'
  201. fi
  202. uploadsfine=y
  203. for f in "${newfiles[@]}"; do
  204. ia-upload-stream --no-derive "${identifier}" "${f}" \
  205. "collection:${collection}" \
  206. 'mediatype:software' \
  207. "date:${date}" \
  208. <"${f}" 2> >(log_loop 'ia-upload-stream: ')
  209. status="$?"
  210. if [[ "${status}" -ne 0 ]]; then
  211. log "Upload failed: exit status ${status}"
  212. if [[ "${uploadsfine}" ]]; then
  213. send "Upload failed: exit status ${status}"
  214. fi
  215. uploadsfine=
  216. fi
  217. done
  218. if [[ "${uploadsfine}" ]]; then
  219. # Wait until all tasks for the item are done
  220. while :; do
  221. tasks="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o["value"]["summary"].values()))' < <({ curl --silent --verbose --fail --max-time 10 --header "Authorization: LOW ${IA_S3_ACCESS}:${IA_S3_SECRET}" "https://archive.org/services/tasks.php?identifier=${identifier}&summary=1&history=0" 2> >(log_loop 'curl IA tasks err: '); } | tee >(log_loop 'curl IA tasks out: ')))"
  222. if [[ "${tasks}" == '0' ]]; then
  223. break
  224. fi
  225. sleep 60
  226. done
  227. # Replace non-metadata files with a symlink to .uploaded dummy file
  228. touch '.uploaded'
  229. for f in "${newfiles[@]}"; do
  230. if [[ "${f}" != *_codearchiver_metadata.txt ]]; then
  231. log "Replacing ${f} with symlink to .uploaded"
  232. { rm --verbose -- "${f}" && ln --symbolic --verbose '.uploaded' "${f}"; } |& log_loop 'rm/ln: '
  233. fi
  234. done
  235. fi
  236. done
  237. }