#!/bin/bash
export TZ=UTC

# Names of the environment variables that must be set before the bot starts.
envvars=(
	HTTP2IRC_GET_URL
	HTTP2IRC_POST_URL
	IA_S3_ACCESS
	IA_S3_SECRET
)
for envvar in "${envvars[@]}"; do
	# Only unset variables are rejected; a variable that is set to the empty string passes.
	[[ -v "${envvar}" ]] && continue
	# Report the complete list of required names (not just the missing one) on stderr and abort.
	{
		printf 'Error: one or more of the required environment variables (%s' "${envvars[0]}"
		printf ', %s' "${envvars[@]:1}"
		printf ') missing\n'
	} >&2
	exit 1
done

# Verify that every external program the script invokes is available before doing any work.
# Besides the archiving tools themselves, this covers the helpers used for logging and file
# handling: notably `xxd`, which log_loop needs to detect a missing trailing LF and which is not
# part of coreutils.
for dep in awk codearchiver curl date find grep ia-upload-stream ln mkdir mv python3 rm sed sha256sum sleep tail tee touch xxd zstd; do
	if ! command -v "${dep}" &>/dev/null; then
		printf 'Error: %s not found\n' "${dep}" >&2
		exit 1
	fi
done

# Write one log line to stderr: the current time as given by bash's EPOCHREALTIME variable,
# a space, and the message.
# Arguments: $1 - message text
function log {
	local -r text="$1"
	printf '%s %s\n' "${EPOCHREALTIME}" "${text}" 1>&2
}

# Read stdin to EOF and pass every line to `log`, prepended with a fixed prefix.
# Arguments: $1 - prefix put in front of every logged line
# Input:     arbitrary text on stdin; CRLF line endings and bare CRs are handled (see below)
# Output:    whatever `log` emits, one call per input line
function log_loop {
	# Keep both helper variables function-local so that calling log_loop directly (outside a
	# pipeline or process substitution) does not leak them into the caller's scope.
	local prefix="$1"
	local lastchar
	# If the output does not end with a LF, add one. Then replace CRLF with LF and replace remaining CR with LF.
	# fd 3 is a duplicate of the group's stdout (the pipe to sed): tee copies the input there while
	# the command substitution only captures the hex value of the final byte via tail and xxd.
	{ lastchar="$(tee /dev/fd/3 | tail -c 1 | xxd -p)"; if [[ "${lastchar}" != '0a' ]]; then printf '\n'; fi } 3>&1 |
	  sed -u 's,\r$,,; s,\r,\n,g' |
	  while IFS= read -r line; do log "${prefix}${line}"; done
}

# Post a message to the http2irc POST endpoint and log the exchange.
# Globals:   HTTP2IRC_POST_URL (read)
# Arguments: $1 - message text to send
# Output:    curl's verbose diagnostics and the response body are both routed through log_loop
function send {
	local message="$1"
	log "Sending message: ${message}"
	# --data-raw posts the text exactly as given. Plain --data would interpret a leading '@' as
	# "read the request body from this file", which must never happen with text that partly
	# originates from IRC users or downloaded lists.
	curl --silent --verbose --max-time 10 --data-raw "${message}" "${HTTP2IRC_POST_URL}" 2> >(log_loop 'curl http2irc POST: ') | log_loop 'http2irc POST response: '
}

# Address a message to a specific user by sending 'NICK: MESSAGE' through `send`.
# Arguments: $1 - nick to address
#            $2 - message text
function respond {
	local -r target="$1" text="$2"
	send "${target}: ${text}"
}

{ # Group the pipeline without requiring a backslash every time
	# Pipeline overview. Every stage runs in its own subshell, so variables are NOT shared between stages;
	# the only communication is the text flowing through the pipes.
	#   1. Stream JSONL from the http2irc GET endpoint, reconnecting forever.
	#   2. Log every raw input line.
	#   3. Reduce PRIVMSG events to lines of 'modes SP nick SP message'.
	#   4. Validate bot commands and emit 'jobid SP nick SP URL' (plus 'jobid SP nick SP end' after a list job).
	#   5. Run codearchiver on batches of jobs, upload the results, and replace uploaded files with symlinks.
	while :; do
		# Read from http2irc
		log 'Starting http2irc GET stream...'
		curl --silent --verbose --no-buffer "${HTTP2IRC_GET_URL}" 2> >(log_loop 'curl http2irc GET: ')
		printf '\n'  # Ensure that there's a trailing LF for `read`
	done |

	# Log all raw input
	tee >(log_loop 'Received http2irc line: ') |

	# Transform the JSONL data into a more suitable format for the following: lines of 'modes SP nick SP message'
	python3 -u -c 'import json, sys'$'\n''def json_parse_or_none(s):'$'\n'' try: return json.loads(s)'$'\n'' except json.JSONDecodeError as e:'$'\n''  print(f"Could not parse {s[:100]}…: {type(e).__name__}: {e!s}")'$'\n''{print(o["user"]["modes"] or "_", o["user"]["nick"], o["message"]) for o in map(json_parse_or_none, sys.stdin) if o and o.get("command") == "PRIVMSG"}' |

	# For valid bot commands with adequate permissions, assign a job ID and respond. Suppress everything else. Print lines of 'jobid SP nick SP URL' for the processing below.
	while read -r modes nick message; do
		if [[ "${message}" == '!help' ]]; then
			respond "${nick}" '`!a URL`: archives a single repository'
			respond "${nick}" '`!a < URL`: archives a list of repositories (no success/failure report, no warnings/errors report, check logs manually!)'
			continue
		fi
		if [[ "${message}" != '!a '* ]]; then
			continue
		fi
		# `modes` is "_" when the user has no modes (see the transformation above); require '@' or '+'.
		if [[ "${modes}" != *[@+]* ]]; then
			respond "${nick}" 'Only voiced or opped users may use this command.'
			continue
		fi
		if [[ "${message}" =~ ^'!a '([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
			# Individual job
			jobs=("${message:3}")
			src="${message:3}"
		elif [[ "${message}" =~ ^'!a < 'https://transfer\.archivete\.am/[a-zA-Z0-9]+/.+$ ]]; then
			# List job
			jobs=()
			url="${message:5}"
			bad=
			log "Retrieving list job list: ${url}"
			while read -r line; do
				# Match bare URLs: accepted lines are handed to codearchiver verbatim further down,
				# so a '!a ' prefix must be neither required nor kept here.
				if [[ "${line}" =~ ^([a-z-]+\+)?[a-z]+://[^\ ]+$ ]]; then
					jobs+=("${line}")
				elif [[ "${line}" == '' ]]; then
					# Ignore empty lines
					continue
				else
					respond "${nick}" "Malformed line in ${url}: ${line}"
					bad=1
					break
				fi
			done < <({ curl --silent --verbose --fail --max-time 10 "${message:5}" 2> >(log_loop 'curl list job: '); printf '\n'; } | tee >(log_loop 'List input line: '))
			if [[ "${bad}" ]]; then
				continue
			fi
			src="${url}"
		else
			respond "${nick}" "I don't understand your command. Please forgive me."
			continue
		fi
		# The kernel hands out a fresh random UUID on every read. UUIDs contain no underscore, which the
		# work loop relies on to tell individual jobs from list entries ('JOBID_INDEX').
		read -r jobid </proc/sys/kernel/random/uuid
		respond "${nick}" "Queueing job ${jobid} for ${src}"
		appendcounter=; if [[ ${#jobs[@]} -gt 1 ]]; then appendcounter=yes; fi
		for ((i=0; i<${#jobs[@]}; ++i)); do
			job="${jobs[${i}]}"
			singlejobid="${jobid}"; if [[ "${appendcounter}" ]]; then singlejobid+="_${i}"; fi
			printf '%s %s %s\n' "${singlejobid}" "${nick}" "${job}"
		done
		if [[ "${appendcounter}" ]]; then printf '%s %s end\n' "${jobid}" "${nick}"; fi  # Special value for sending a message when all list URLs have been processed
	done |

	# The actual work loop
	while :; do
		# Process in batches for efficiency of parallel IA processing after uploads
		# A batch is everything that arrives until the input has been quiet for one second.
		declare -a batch=()
		while read -r -t 1 line; do
			batch+=("${line}")
		done

		if [[ ${#batch[@]} -eq 0 ]]; then
			continue
		fi

		statuscodes=()  # Exit status for each `batch` element's codearchiver process (-1 for 'end' markers)
		newfiles=()
		newfilejobindices=()  # One entry for each element of `newfiles`, containing the corresponding index in `batch` to which the file belongs

		for ((i=0; i<${#batch[@]}; ++i)); do
			# Split 'jobid SP nick SP URL' into its three fields.
			line="${batch[${i}]}"
			singlejobid="${line%% *}"
			line="${line#* }"
			nick="${line%% *}"
			url="${line#* }"

			# Handle marker for end of list job: tell the user it's done and move on.
			if [[ "${url}" == 'end' ]]; then
				# No status code reflection here because the start of the list job might not even have been in this batch.
				# The marker line carries the list job's ID in its first field; `jobid` from the command
				# parsing stage does not exist in this subshell.
				respond "${nick}" "Job ${singlejobid} finished."
				statuscodes+=(-1)
				continue
			fi

			# Marker for finding new files
			touch '.filemarker'

			# Find nonexistent filename for log file (*not* concurrency-safe!)
			logbasename="$(date +%Y%m%dT%H%M%SZ)_${singlejobid}"
			if [[ -e "${logbasename}_codearchiver.log" ]]; then
				# Use a dedicated counter: reusing `i` here would clobber the batch index of the enclosing loop.
				for ((coll=0; ; ++coll)); do
					if [[ ! -e "${logbasename}_coll${coll}_codearchiver.log" ]]; then
						break
					fi
				done
				logbasename="${logbasename}_coll${coll}"
			fi
			logname="${logbasename}_codearchiver.log"

			# Run codearchiver, duplicating WARNINGs and higher in the bot output
			log "Running ${url}, logging into ${logname}"
			codearchiver -vv "${url}" 2> >(tee "${logname}" | grep -Fv -e '  INFO  ' -e '  DEBUG  ' | log_loop "From codearchiver ${singlejobid}: ")
			status="$?"
			log "codearchiver ${url} finished with status code ${status}"
			statuscodes+=("${status}")
			#TODO Integrate this into the pipe from codearchiver above to avoid rereading the entire log file
			# Count the log lines whose third field is neither DEBUG nor INFO.
			declare -i badcount=$(awk '! ($3 ~ /^(DEBUG|INFO)$/) { cnt += 1; }  END { printf "%d\n", cnt; }' "${logname}")

			# Compress log file with zstd -19
			log "Compressing log file ${logname}"
			zstd -19 --rm "${logname}" 2> >(log_loop 'zstd err: ')
			if [[ -e "${logname}.zst" && ! -e "${logname}" ]]; then
				# Compression successful
				logname="${logname}.zst"
			fi

			# Collect new files, write artefacts file, move everything but the log file to ./failed/ if codearchiver exited non-zero
			# sed needs -z to treat the NUL-separated names as individual records; without it the whole
			# stream is one "line" and the leading './' would only be stripped from the first file name.
			readarray -d $'\0' -t tmpnewfiles < <(find -maxdepth 1 -type f -newer '.filemarker' -print0 2> >(log_loop 'find new files err: ') | sed -z 's,^\./,,')
			readarray -d $'\0' -t artefacts < <(for file in "${tmpnewfiles[@]}"; do if [[ "${file}" != "${logname}" ]]; then printf '%s\0' "${file}"; fi; done)
			artefactsname="${logbasename}_codearchiver_artefacts.txt"
			for file in "${artefacts[@]}"; do
				printf '%s\n' "${file}"
			done >"${artefactsname}"
			if [[ "${status}" -ne 0 ]]; then
				msg="$(printf 'Moving artefact files'; printf ' %q' "${artefacts[@]}" "${artefactsname}"; printf ' from non-zero exit for job %s to ./failed/\n' "${singlejobid}";)"
				log "${msg}"
				mkdir --parents ./failed/
				mv --verbose -- "${artefacts[@]}" "${artefactsname}" ./failed/ 2> >(log_loop 'mv err: ') | log_loop 'mv out: '
				# Only the log file of a failed job is uploaded.
				newfiles+=("${logname}")
				newfilejobindices+=("${i}")
			else
				for file in "${tmpnewfiles[@]}"; do
					log "New file from job ${singlejobid}: ${file}"
					newfiles+=("${file}")
					newfilejobindices+=("${i}")
				done
				newfiles+=("${artefactsname}")
				newfilejobindices+=("${i}")
			fi

			# For individual jobs, tell the user about warnings and success/failure
			if [[ "${singlejobid}" != *_* ]]; then
				if [[ "${status}" -eq 0 ]]; then
					respond "${nick}" "Job ${singlejobid} succeeded."
				else
					respond "${nick}" "Job ${singlejobid} failed."
				fi

				if [[ ${badcount} -gt 0 ]]; then
					respond "${nick}" "Job ${singlejobid} produced ${badcount} warnings or errors."
				fi
			fi
		done

		# A batch consisting solely of 'end' markers produces no files. There is nothing to hash or upload
		# then, and sha256sum without file arguments would read this loop's stdin, i.e. block on the job
		# pipe and swallow queued jobs.
		if [[ ${#newfiles[@]} -eq 0 ]]; then
			continue
		fi

		# Record SHA-256 hashes for new files
		log "SHA-256 hashes:"
		sha256sum "${newfiles[@]}" > >(log_loop 'sha256sum: ')

		# Upload
		# All files of a batch go into one item per day; CODEARCHIVER_BOT_TEST switches to a test item and collection.
		date="$(date '+%Y-%m-%d')"
		identifier="codearchiver_${date//-/}"
		if [[ -z "${CODEARCHIVER_BOT_TEST}" ]]; then
			collection='archiveteam_codearchiver'
		else
			identifier="test_${identifier}"
			collection='test_collection'
		fi
		uploadsfine=y
		for f in "${newfiles[@]}"; do
			ia-upload-stream --no-derive "${identifier}" "${f}" \
				"collection:${collection}" \
				'mediatype:software' \
				"date:${date}" \
				<"${f}" 2> >(log_loop 'ia-upload-stream: ')
			status="$?"
			if [[ "${status}" -ne 0 ]]; then
				log "Upload failed: exit status ${status}"
				# Every failure is logged, but only the first one of a batch is announced on IRC.
				if [[ "${uploadsfine}" ]]; then
					send "Upload failed: exit status ${status}"
				fi
				uploadsfine=
			fi
		done

		if [[ "${uploadsfine}" ]]; then
			# Wait until all tasks for the item are done
			# The sum over the values of the 'summary' object in the response is polled once a minute until it is 0.
			# A failed request yields an empty `tasks` and is therefore simply retried.
			while :; do
				tasks="$(python3 -c 'import json, sys; o = json.load(sys.stdin); print(sum(o["value"]["summary"].values()))' < <({ curl --silent --verbose --fail --max-time 10 --header "Authorization: LOW ${IA_S3_ACCESS}:${IA_S3_SECRET}" "https://archive.org/services/tasks.php?identifier=${identifier}&summary=1&history=0" 2> >(log_loop 'curl IA tasks err: '); } | tee >(log_loop 'curl IA tasks out: ')))"
				if [[ "${tasks}" == '0' ]]; then
					break
				fi
				sleep 60
			done

			# Replace non-metadata files with a symlink to .uploaded dummy file
			touch '.uploaded'
			for f in "${newfiles[@]}"; do
				if [[ "${f}" != *_codearchiver_metadata.txt ]]; then
					log "Replacing ${f} with symlink to .uploaded"
					{ rm --verbose -- "${f}" && ln --symbolic --verbose '.uploaded' "${f}"; } |& log_loop 'rm/ln: '
				fi
			done
		fi
	done
}