|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
- #!/bin/bash
- # Takes a wiki page in new-style viewer format on stdin.
- # Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
- # Everything else is run through website-extract-social-media.
- # This is done recursively until no new links are discovered anymore.
- # The output is further fed through url-normalise before and during processing to avoid equivalent but slightly different duplicates, and the output is deduplicated within each section at the end.
-
# Parse command-line options. The only recognised flags are --verbose / -v,
# which are remembered in ${verbose} (as '--verbose') so they can be forwarded
# verbatim to the helper scripts invoked below.
verbose=
while (( $# > 0 )); do
  case "$1" in
    --verbose|-v)
      verbose='--verbose'
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
  shift
done
-
# Echo the given arguments to stdout, but only when verbose mode is enabled
# (i.e. ${verbose} is non-empty). A no-op otherwise.
verbose_echo() {
  if [[ -n "${verbose}" ]]; then
    echo "$@"
  fi
}
-
# Run a command, prefixing every line of its stderr with "[name] " so that
# interleaved diagnostics from the various helper scripts can be told apart.
#
# Arguments:
#   $1 - label to use as the prefix; if empty, the basename of the command
#        (the next argument) is used instead.
#   $@ - (after shift) the command and its arguments.
# Outputs: the command's stdout is passed through untouched; annotated
#          stderr is written to this script's stderr.
# Returns: the command's exit status.
stderr_annotate() {
  # 'local' keeps name/line from leaking into (or clobbering) caller scope.
  local name="$1"
  local line
  shift
  if [[ -z "${name}" ]]; then
    name="${1##*/}"
  fi
  # IFS= preserves leading/trailing whitespace in stderr lines; printf '%s'
  # keeps arbitrary data (e.g. backslashes, a literal '-n') intact.
  "$@" 2> >(while IFS= read -r line; do printf '[%s] %s\n' "${name}" "${line}"; done >&2)
}
-
# Directory containing this script, resolved to a physical path so the
# sibling helper scripts can be found regardless of the caller's cwd.
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"

# Set of URLs already seen within the current '==' section.
# NOTE(review): the consumer 'while' below runs in a pipeline subshell, so
# this outer declaration never carries state back out; it looks redundant
# next to the per-section 'declare -A' inside the loop — confirm before
# removing.
declare -A sectionUrls

# Normalise the incoming wiki page first so duplicate detection operates on
# canonical URLs, then walk it line by line. ${verbose} is intentionally
# unquoted: when empty it must expand to no argument at all.
stderr_annotate 'url-normalise/before' "${scriptpath}/url-normalise" ${verbose} | while read -r line
do
    # Every input line is passed through; discovered URLs are appended after
    # the list entry that produced them.
    echo "${line}"

    # A '==' heading starts a new section: reset the per-section URL set.
    if [[ "${line}" == '=='* ]]
    then
        verbose_echo "${line}" >&2
        unset sectionUrls
        declare -A sectionUrls
    fi

    # Only bullet lines of the form '* http(s)://…' are expanded.
    if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
    then
        # Strip the leading '* '; drop any ' | annotation' suffix so only the
        # bare URL is used for matching and deduplication.
        url="${line:2}"
        if [[ "${url}" == *' | '* ]]
        then
            url="${url%% | *}"
        fi

        if [[ "${sectionUrls[${url}]}" ]]
        then
            # Processed already, skip
            continue
        fi
        sectionUrls["${url}"]=1

        # Breadth-first expansion: keep extracting links from each queued URL
        # until no new ones are discovered for this section.
        toProcess=("${url}")
        while [[ ${#toProcess[@]} -gt 0 ]]
        do
            # Pop the head of the queue.
            curUrl="${toProcess[0]}"
            toProcess=("${toProcess[@]:1}")

            # Social-media URLs (incl. YouTube) go through the profile-link
            # extractor; anything else through the generic website extractor.
            # Either way the output is re-normalised so it compares equal to
            # URLs already recorded in sectionUrls. '< <(:)' hands the child
            # an empty stdin so it cannot consume the wiki-page stream that
            # feeds the outer loop.
            if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
            then
                mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-social' "${scriptpath}/url-normalise" ${verbose})
            else
                mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-web' "${scriptpath}/url-normalise" ${verbose})
            fi

            for outUrl in "${outUrls[@]}"
            do
                if [[ "${sectionUrls[${outUrl}]}" ]]
                then
                    # The discovered URL was processed already, skip it entirely
                    continue
                else
                    # Not-yet-known URL, add to the list of URLs to process, mark as seen, and print
                    toProcess+=("${outUrl}")
                    sectionUrls["${outUrl}"]=1
                    echo "* ${outUrl}"
                fi
            done
        done
    fi
done | mawk -W interactive '! /^\*/ { print; } /^\*/ && !seen[$0]++ { print; } /^==/ { delete seen; }'
# Final dedup stage: mawk (-W interactive = line-buffered, so output streams)
# passes non-bullet lines through, prints each '* …' line only on its first
# occurrence within a section, and clears its seen-set at every '==' heading.
|