Nelze vybrat více než 25 témat. Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.
 
 

275 řádky
10 KiB

  1. #!/bin/bash
  # Startup checks: run everything in UTC, then verify the required
  # environment variables, the optional timeout setting, and the external
  # tools before any work is started.
  export TZ=UTC

  # Required environment variables. The check is for "set", not "non-empty".
  envvars=(HTTP2IRC_GET_URL HTTP2IRC_POST_URL IA_S3_ACCESS IA_S3_SECRET)
  for envvar in "${envvars[@]}"; do
    if [[ ! -v "${envvar}" ]]; then
      {
        printf 'Error: one or more of the required environment variables (%s' "${envvars[0]}"
        printf ', %s' "${envvars[@]:1}"
        printf ') missing\n'
      } >&2
      exit 1
    fi
  done

  # Optional env variables
  # CODEARCHIVER_BOT_TIMEOUT is handed to timeout(1) for every codearchiver run
  # (GNU timeout treats a duration of 0 as "no timeout"). Validate it here:
  # `declare -i` would silently evaluate a word like "abc" to 0 and would fail
  # on "10m", leaving the variable empty so that every job's timeout call fails.
  if [[ ! "${CODEARCHIVER_BOT_TIMEOUT:-0}" =~ ^(0|[1-9][0-9]*)$ ]]; then
    printf 'Error: CODEARCHIVER_BOT_TIMEOUT must be a non-negative integer number of seconds\n' >&2
    exit 1
  fi
  declare -i timeout="${CODEARCHIVER_BOT_TIMEOUT:-0}"

  # External tools invoked by the script.
  for dep in awk codearchiver curl grep ia-upload-stream python3 sed sha256sum tee timeout zstd; do
    if ! command -v "${dep}" &>/dev/null; then
      printf 'Error: %s not found\n' "${dep}" >&2
      exit 1
    fi
  done
  # Write one timestamped message to stderr.
  # Arguments: $1 - message text
  # Outputs:   "<epoch seconds with fraction> <message>" on stderr
  function log {
    # EPOCHREALTIME only exists in bash >= 5.0. Fall back to GNU date so that
    # older shells do not log lines with an empty timestamp. The command
    # substitution is only evaluated when EPOCHREALTIME is unset or empty.
    printf '%s %s\n' "${EPOCHREALTIME:-$(date '+%s.%6N')}" "$1" >&2
  }
  # Log every line read from stdin through `log`, prepending a fixed prefix.
  # Arguments: $1 - prefix put in front of each logged line
  # Input:     arbitrary text on stdin; a missing final LF is tolerated
  # Outputs:   one `log` call per input line; nothing at all for empty input
  function log_loop {
    local prefix="$1"
    local line
    # Strip the CR of CRLF line endings and turn any remaining CR into a line break.
    sed -u 's,\r$,,; s,\r,\n,g' |
      # `read` returns non-zero on a final line without LF but still fills
      # `line`; the extra test makes sure that last fragment is logged too.
      while IFS= read -r line || [[ -n "${line}" ]]; do
        log "${prefix}${line}"
      done
  }
  # Post a message to the http2irc POST endpoint.
  # Globals:   HTTP2IRC_POST_URL (read)
  # Arguments: $1 - message text
  # Outputs:   the message, curl's verbose output and the response body are
  #            all written to the log via log/log_loop
  function send {
    local message="$1"
    log "Sending message: ${message}"
    # --data-raw instead of --data: with --data, a message starting with '@'
    # would make curl post the contents of a local file instead of the text.
    # Messages contain text that originates from IRC users.
    curl --silent --verbose --max-time 10 --data-raw "${message}" "${HTTP2IRC_POST_URL}" \
      2> >(log_loop 'curl http2irc POST: ') |
      log_loop 'http2irc POST response: '
  }
  # Send a message addressed to one user, formatted as "<nick>: <message>".
  # Arguments: $1 - nick of the addressee
  #            $2 - message text
  function respond {
    local recipient="$1" text="$2"
    send "${recipient}: ${text}"
  }
  # Main pipeline. Stages, connected by pipes:
  #   1. http2irc GET stream, restarted whenever curl exits
  #   2. raw input logging
  #   3. JSONL -> 'modes SP nick SP message' lines for PRIVMSG events
  #   4. command parsing and permission check -> 'jobid SP nick SP URL' lines
  #   5. codearchiver runs -> names of files to upload, one per line
  #   6. batched upload to the Internet Archive and local cleanup
  { # Group the pipeline without requiring a backslash every time
    while :; do
      # Read from http2irc
      log 'Starting http2irc GET stream...'
      curl --silent --verbose --no-buffer "${HTTP2IRC_GET_URL}" 2> >(log_loop 'curl http2irc GET: ')
      printf '\n' # Ensure that there's a trailing LF for `read`
    done |
    # Log all raw input
    tee >(log_loop 'Received http2irc line: ') |
    # Transform the JSONL data into a more suitable format for the following: lines of 'modes SP nick SP message'
    # The body of the `except` clause must be indented deeper than `except` itself, otherwise Python refuses to start.
    python3 -u -c 'import json, sys'$'\n''def json_parse_or_none(s):'$'\n'' try: return json.loads(s)'$'\n'' except json.JSONDecodeError as e:'$'\n''  print(f"Could not parse {s[:100]}…: {type(e).__name__}: {e!s}")'$'\n''{print(o["user"]["modes"] or "_", o["user"]["nick"], o["message"]) for o in map(json_parse_or_none, sys.stdin) if o and o.get("command") == "PRIVMSG"}' |
    # For valid bot commands with adequate permissions, assign a job ID and respond. Suppress everything else. Print lines of 'jobid SP nick SP URL' for the processing below.
    while read -r modes nick message; do
      if [[ "${message}" == '!help' ]]; then
        respond "${nick}" '`!a URL`: archives a single repository'
        respond "${nick}" '`!a < URL`: archives a list of repositories (no success/failure report, no warnings/errors report, check logs manually!)'
        continue
      fi
      if [[ "${message}" != '!a '* ]]; then
        continue
      fi
      if [[ "${modes}" != *[@+]* ]]; then
        respond "${nick}" 'Only voiced or opped users may use this command.'
        continue
      fi
      if [[ "${message}" =~ ^'!a '([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
        # Individual job
        jobs=("${message:3}")
        src="${message:3}"
      elif [[ "${message}" =~ ^'!a < 'https://transfer\.archivete\.am/[a-zA-Z0-9]+/.+$ ]]; then
        # List job
        jobs=()
        url="${message:5}"
        bad=
        log "Retrieving list job list: ${url}"
        while read -r line; do
          # Each line must itself be a URL (no '!a ' prefix): it is queued verbatim as the job URL below.
          if [[ "${line}" =~ ^([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
            jobs+=("${line}")
          elif [[ "${line}" == '' ]]; then
            # Ignore empty lines
            continue
          else
            respond "${nick}" "Malformed line in ${url}: ${line}"
            bad=1
            break
          fi
        done < <({ curl --silent --verbose --fail --max-time 10 "${url}" 2> >(log_loop 'curl list job: '); printf '\n'; } | tee >(log_loop 'List input line: '))
        if [[ "${bad}" ]]; then
          continue
        fi
        src="${url}"
      else
        respond "${nick}" "I don't understand your command. Please forgive me."
        continue
      fi
      read -r jobid </proc/sys/kernel/random/uuid
      respond "${nick}" "Queueing job ${jobid} for ${src}"
      # Jobs of a multi-URL list get '_<index>' appended to the shared job ID.
      appendcounter=; if [[ ${#jobs[@]} -gt 1 ]]; then appendcounter=yes; fi
      for ((i=0; i<${#jobs[@]}; ++i)); do
        job="${jobs[${i}]}"
        singlejobid="${jobid}"; if [[ "${appendcounter}" ]]; then singlejobid+="_${i}"; fi
        printf '%s %s %s\n' "${singlejobid}" "${nick}" "${job}"
      done
      if [[ "${appendcounter}" ]]; then printf '%s %s end\n' "${jobid}" "${nick}"; fi # Special value for sending a message when all list URLs have been processed
    done |
    # The actual work loop
    while IFS= read -r line; do
      # Split 'jobid SP nick SP URL'
      singlejobid="${line%% *}"
      line="${line#* }"
      nick="${line%% *}"
      url="${line#* }"
      # Handle marker for end of list job: tell the user it's done and move on.
      if [[ "${url}" == 'end' ]]; then
        # No status code reflection here because the start of the list job might not even have been in this batch.
        respond "${nick}" "Job ${singlejobid} finished."
        continue
      fi
      # Find nonexistent filename for log file with lock
      # mkdir is pretty much always atomic, creating files might not be depending on the underlying file system (e.g. networked ones like NFS).
      while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (work) err: '); do
        sleep 1
      done
      trap 'rmdir ".loglock"' EXIT # Unlock if something below fails
      logbasename="$(date +%Y%m%dT%H%M%SZ)_${singlejobid}"
      if [[ -e "${logbasename}_codearchiver.log" || -e "${logbasename}_codearchiver.log.zst" ]]; then
        for ((j=0; ; ++j)); do
          # A candidate is only free if neither the plain nor the compressed log exists.
          if [[ ! -e "${logbasename}_coll${j}_codearchiver.log" && ! -e "${logbasename}_coll${j}_codearchiver.log.zst" ]]; then
            break
          fi
        done
        logbasename="${logbasename}_coll${j}"
      fi
      logname="${logbasename}_codearchiver.log"
      artefactsname="${logbasename}_codearchiver_artefacts.txt"
      # Create the log file already in case spawning the tee process for it below is too slow
      touch "${logname}"
      trap - EXIT # Reset trap
      rmdir '.loglock' # Unlock
      # Run codearchiver in a background shell, duplicating WARNINGs and higher in the bot output
      # Produces lines of filenames to upload on stdout
      log "Running ${url} (${singlejobid}), logging into ${logname}"
      (
        timeout --signal=INT "${timeout}" \
          codearchiver --verbose --write-artefacts-fd-3 "${url}" \
          > >(log_loop "codearchiver ${singlejobid} out: ") \
          2> >(tee "${logname}" | grep -Fv -e ' INFO ' | log_loop "codearchiver ${singlejobid} err: ") \
          3> >(tee "${artefactsname}" | log_loop "Artefact from codearchiver ${singlejobid}: ")
        status="$?"
        log "codearchiver ${url} finished with status code ${status}"
        #TODO Integrate this into the pipe from codearchiver above to avoid rereading the entire log file
        # Count log lines whose third field is not exactly INFO
        declare -i badcount=$(awk '! ($3 ~ /^INFO$/) { cnt += 1; } END { printf "%d\n", cnt; }' "${logname}")
        # Compress log file with zstd -19
        log "Compressing log file ${logname}"
        zstd -19 --rm "${logname}" 2> >(log_loop 'zstd err: ')
        if [[ -e "${logname}.zst" && ! -e "${logname}" ]]; then
          # Compression successful
          logname="${logname}.zst"
        fi
        # Move everything but the log file to ./failed/ if codearchiver exited non-zero
        readarray -t artefacts <"${artefactsname}"
        if [[ "${status}" -ne 0 ]]; then
          msg="$(printf 'Moving artefact files'; printf ' %q' "${artefacts[@]}" "${artefactsname}"; printf ' from non-zero exit for job %s to ./failed/\n' "${singlejobid}";)"
          log "${msg}"
          mkdir --parents ./failed/
          mv --verbose -- "${artefacts[@]}" "${artefactsname}" ./failed/ 2> >(log_loop 'mv err: ') | log_loop 'mv out: '
        else
          # Hand the artefacts and the artefact list to the upload stage
          for file in "${artefacts[@]}"; do
            printf '%s\n' "${file}"
          done
          printf '%s\n' "${artefactsname}"
        fi
        # The log file is uploaded regardless of the exit status
        printf '%s\n' "${logname}"
        # For individual jobs, tell the user about warnings and success/failure
        if [[ "${singlejobid}" != *_* ]]; then
          if [[ "${status}" -eq 0 ]]; then
            respond "${nick}" "Job ${singlejobid} succeeded."
          else
            respond "${nick}" "Job ${singlejobid} failed."
          fi
          if [[ ${badcount} -gt 0 ]]; then
            respond "${nick}" "Job ${singlejobid} produced ${badcount} warnings or errors."
          fi
        fi
      ) &
      wait
    done |
    # Upload
    while :; do
      # Process in batches for efficiency of parallel IA upload processing
      # A batch ends once no new filename has arrived for one second.
      declare -a filenames=()
      while read -r -t 1 filename; do
        filenames+=("${filename}")
      done
      if [[ ${#filenames[@]} -eq 0 ]]; then
        continue
      fi
      # Record SHA-256 hashes for new files
      sha256sum "${filenames[@]}" > >(log_loop 'sha256sum: ')
      # Upload
      date="$(date '+%Y-%m-%d')"
      identifier="codearchiver_${date//-/}"
      if [[ -z "${CODEARCHIVER_BOT_TEST}" ]]; then
        collection='archiveteam_codearchiver'
      else
        identifier="test_${identifier}"
        collection='test_collection'
      fi
      uploadsfine=y
      for f in "${filenames[@]}"; do
        ia-upload-stream --no-derive "${identifier}" "${f}" \
          "collection:${collection}" \
          'mediatype:software' \
          "date:${date}" \
          <"${f}" 2> >(log_loop 'ia-upload-stream: ')
        status="$?"
        if [[ "${status}" -ne 0 ]]; then
          log "Upload failed: exit status ${status}"
          # Only the first failure of a batch is announced on IRC
          if [[ "${uploadsfine}" ]]; then
            send "Upload failed: exit status ${status}"
          fi
          uploadsfine=
        fi
      done
      # After a failed upload, leave all files of the batch in place
      if [[ -z "${uploadsfine}" ]]; then
        continue
      fi
      # Wait until all tasks for the item are done
      while :; do
        tasks="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o["value"]["summary"].values()))' < <({ curl --silent --verbose --fail --max-time 10 --header "Authorization: LOW ${IA_S3_ACCESS}:${IA_S3_SECRET}" "https://archive.org/services/tasks.php?identifier=${identifier}&summary=1&history=0" 2> >(log_loop 'curl IA tasks err: '); } | tee >(log_loop 'curl IA tasks out: ')))"
        if [[ "${tasks}" == '0' ]]; then
          break
        fi
        sleep 60
      done
      # Replace non-metadata files with a symlink to .uploaded dummy file
      # No locking with codearchiver processes is necessary because those will only read metadata (which is left alone) or write files.
      # However, a lock with the log filename finding is required.
      while ! mkdir '.loglock' 2> >(log_loop 'mkdir loglock (upload) err: '); do
        sleep 1.1 # Slightly longer than above to minimise repeated collisions
      done
      trap 'rmdir ".loglock"' EXIT
      touch '.uploaded'
      for f in "${filenames[@]}"; do
        if [[ "${f}" != *_codearchiver_metadata.txt ]]; then
          log "Replacing ${f} with symlink to .uploaded"
          { rm --verbose -- "${f}" && ln --symbolic --verbose '.uploaded' "${f}"; } |& log_loop 'rm/ln: '
        fi
      done
      trap - EXIT
      rmdir '.loglock'
    done
  }