#!/bin/bash
# Takes a wiki page in new-style viewer format on stdin.
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
# Everything else is run through website-extract-social-media.
# This is done recursively until no new links are discovered.

verbose=
while [[ $# -gt 0 ]]
do
	if [[ "$1" == '--verbose' || "$1" == '-v' ]]
	then
		verbose='--verbose'
	else
		echo "Unknown option: $1" >&2
		exit 1
	fi
	shift
done

# Echo the arguments only when --verbose is active.
function verbose_echo {
	if [[ "${verbose}" ]]
	then
		echo "$@"
	fi
}

# Run a command, prefixing every line it writes to stderr with the command's basename.
function stderr_annotate {
	name="${1##*/}"
	"$@" 2> >(while read -r line; do echo "[${name}] ${line}"; done >&2)
}

# Absolute path of the directory containing this script; the two extractor helpers are expected to live next to it.
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"

declare -A sectionUrls
while read -r line
do
	echo "${line}"
	if [[ "${line}" == '=='* ]]
	then
		# Section header: log it and reset the per-section URL set.
		verbose_echo "${line}" >&2
		unset sectionUrls
		declare -A sectionUrls
	fi
	if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
	then
		# Strip the leading '* ' and any ' | comment' suffix to get the bare URL.
		url="${line:2}"
		if [[ "${url}" == *' | '* ]]
		then
			url="${url%% | *}"
		fi
		sectionUrls["${url}"]=1
		# Breadth-first traversal over the URLs discovered from this entry.
		toProcess=("${url}")
		while [[ ${#toProcess[@]} -gt 0 ]]
		do
			curUrl="${toProcess[0]}"
			toProcess=("${toProcess[@]:1}")
			# ${verbose} is deliberately unquoted so an empty value expands to no argument at all.
			# stdin is redirected from an empty process substitution so the helper
			# can't consume the wiki page the outer loop is reading.
			if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
			then
				mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:))
			else
				mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:))
			fi
			for outUrl in "${outUrls[@]}"
			do
				if [[ "${sectionUrls[${outUrl}]}" ]]
				then
					# The discovered URL was processed already; skip it entirely.
					continue
				else
					# Not-yet-known URL: add it to the list of URLs to process, mark it as seen, and print it.
					toProcess+=("${outUrl}")
					sectionUrls["${outUrl}"]=1
					echo "* ${outUrl}"
				fi
			done
		done
	fi
done
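
# Example invocation (the script file name and page file names below are
# hypothetical, chosen only for illustration; the script itself only
# references the two helper scripts named above):
#
#   ./wiki-recursive-extract --verbose < page.txt > page-with-links.txt
#
# Every input line is passed through unchanged; each newly discovered link is
# emitted as an additional '* <url>' line directly after the list entry it was
# found from, so the output is again a page in the same format.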