From 5285c406d987c7866d53750dc1ba7dbb633bcf53 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Sun, 20 Oct 2019 17:16:56 +0000
Subject: [PATCH] Add script for recursive website and social media discovery

---
 wiki-recursive-extract | 56 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100755 wiki-recursive-extract

diff --git a/wiki-recursive-extract b/wiki-recursive-extract
new file mode 100755
index 0000000..24a7dc5
--- /dev/null
+++ b/wiki-recursive-extract
@@ -0,0 +1,56 @@
+#!/bin/bash
+# Takes a wiki page in new-style viewer format on stdin.
+# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
+# Everything else is run through website-extract-social-media.
+# This is done recursively until no new links are discovered anymore.
+
+scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
+declare -A sectionUrls
+while read -r line
+do
+	echo "${line}"
+	if [[ "${line}" == '=='* ]]
+	then
+		unset sectionUrls
+		declare -A sectionUrls
+	fi
+	if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
+	then
+		url="${line:2}"
+		if [[ "${url}" == *' | '* ]]
+		then
+			url="${url%% | *}"
+		fi
+
+		sectionUrls["${url}"]=1
+		toProcess=("${url}")
+		while [[ ${#toProcess[@]} -gt 0 ]]
+		do
+			curUrl="${toProcess[0]}"
+			toProcess=("${toProcess[@]:1}")
+
+			if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
+			then
+				echo "Calling social-media-extract-profile-link on ${curUrl}" >&2
+				mapfile -t outUrls < <("${scriptpath}/social-media-extract-profile-link" "${curUrl}" < <(:))
+			else
+				echo "Calling website-extract-social-media on ${curUrl}" >&2
+				mapfile -t outUrls < <("${scriptpath}/website-extract-social-media" "${curUrl}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }')
+			fi
+
+			for outUrl in "${outUrls[@]}"
+			do
+				if [[ "${sectionUrls[${outUrl}]}" ]]
+				then
+					# The discovered URL was processed already, skip it entirely
+					continue
+				else
+					# Not-yet-known URL, add to the list of URLs to process, mark as seen, and print
+					toProcess+=("${outUrl}")
+					sectionUrls["${outUrl}"]=1
+					echo "* ${outUrl}"
+				fi
+			done
+		done
+	fi
+done