Add verbosity options, and annotate stderr on wiki-recursive-extract

4 yıl önce · 0f13a1fadd
--- a/+ 24
+++ b/+ 24
@@ -1,7 +1,15 @@
 #!/bin/bash
 # Given social media links on stdin or as args, this extracts the link in the profile description, if any.

 # Prints its arguments exactly like `echo`, but only when ${verbose} is
 # non-empty; in quiet mode it does nothing and succeeds.
 function verbose_echo {
 	[[ -z "${verbose}" ]] || echo "$@"
 }

 # Downloads the URL given as $1 and writes the response body to stdout.
 # In verbose mode the URL is announced on stderr before the request is made.
 function fetch {
 	local url="$1"
 	verbose_echo "Fetching ${url}" >&2
 	# -s hides the progress meter, -S still reports transfer errors on stderr
 	# (with -s alone a failed download is indistinguishable from an empty page),
 	# -L follows redirects.
 	curl -sSL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "${url}"
 }

@@ -33,6 +41,22 @@ function fetch_n_extract {
 	fi
 }

 # Scans the leading options of the argument list it is given:
 #   --verbose / -v   enable verbose mode (verbose=1)
 #   --               explicit end of options; everything after it is an operand
 # Scanning also stops at the first argument that is not a known option.
 # Sets the globals `verbose` and `optionCount` (number of arguments consumed).
 function parse_leading_options {
 	verbose=
 	optionCount=0
 	while [[ $# -gt 0 ]]
 	do
 		case "$1" in
 			--verbose|-v)
 				verbose=1
 				;;
 			--)
 				# Consume the separator and stop, so that a later '-v' is kept
 				# as an operand instead of being swallowed as a flag.
 				optionCount=$((optionCount + 1))
 				break
 				;;
 			*)
 				# Assume end of options
 				break
 				;;
 		esac
 		optionCount=$((optionCount + 1))
 		shift
 	done
 }
 parse_leading_options "$@"
 # Drop the consumed options so only the operands remain in "$@".
 shift "${optionCount}"

 {
 	for arg in "$@"
 	do
--- a/+ 8
+++ b/+ 8
@@ -1,6 +1,9 @@
 #!/bin/bash
 # Echoes its arguments only in verbose mode (non-empty ${verbose});
 # otherwise it is a successful no-op.
 function verbose_echo {
 	if [[ -z "${verbose}" ]]
 	then
 		return 0
 	fi
 	echo "$@"
 }

 function fetch_n_extract {
 	local url="$1"
 	verbose_echo "Fetching ${url}" >&2
 	{
 		curl -sSL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "${url}" | \
 		  grep -Fi -e 'facebook' -e 'flickr' -e 'instagram' -e 'twitter' -e 't.me' -e 'youtube' -e 'youtu.be' -e 'vk.com' | \
@@ -52,12 +55,17 @@ function fetch_n_extract {

 # Parse options
 printInputUrl=
 verbose=
 while [[ $# -gt 0 ]]
 do
 	if [[ "$1" == '--print-input-urls' || "$1" == '--print-input-url' ]]
 	then
 		printInputUrl=true
 		shift
 	# Verbose flag: accept the long form and the conventional short form '-v'
 	# (a bare 'v' is an ordinary argument, not an option).
 	elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
 	then
 		verbose=1
 		shift
 	elif [[ "$1" == '--' ]]
 	then
 		# End of options
--- a/+ 28
+++ b/+ 28
@@ -4,6 +4,31 @@
 # Everything else is run through website-extract-social-media.
 # This is done recursively until no new links are discovered anymore.

 # Command-line handling: the only accepted argument is --verbose / -v.
 # `verbose` ends up holding the literal flag '--verbose' (or stays empty);
 # any other argument is reported on stderr and aborts with status 1.
 verbose=
 until [[ $# -eq 0 ]]
 do
 	case "$1" in
 		--verbose|-v)
 			verbose='--verbose'
 			;;
 		*)
 			echo "Unknown option: $1" >&2
 			exit 1
 			;;
 	esac
 	shift
 done

 # Verbose-only logging helper: forwards its arguments to `echo` when
 # ${verbose} is non-empty and silently succeeds otherwise.
 function verbose_echo {
 	[[ -n "${verbose}" ]] || return 0
 	echo "$@"
 }

 # Runs the command line given as arguments and prefixes every line the
 # command writes to stderr with "[<basename of $1>] ". Stdout is passed
 # through untouched and the command's exit status is returned.
 function stderr_annotate {
 	# Keep the label local so the caller's variables are not clobbered.
 	local name="${1##*/}"
 	# IFS= preserves leading/trailing whitespace of each line; the || clause
 	# also emits a final line that is not newline-terminated.
 	"$@" 2> >(while IFS= read -r line || [[ -n "${line}" ]]; do printf '[%s] %s\n' "${name}" "${line}"; done >&2)
 }

 # Absolute, symlink-resolved directory containing this script.
 # cd's own output is discarded (with CDPATH set, cd may print the directory)
 # and pwd only runs when cd succeeded, so a failed cd yields an empty value
 # instead of silently reporting the caller's working directory.
 scriptpath="$(cd -- "$(dirname -- "$0")" >/dev/null && pwd -P)"
 # Associative array; it is re-created whenever a line starting with '==' is read.
 declare -A sectionUrls
 while read -r line
@@ -11,6 +36,7 @@ do
 	echo "${line}"
 	if [[ "${line}" == '=='* ]]
 	then
 		verbose_echo "${line}" >&2
 		unset sectionUrls
 		declare -A sectionUrls
 	fi
@@ -31,11 +57,9 @@ do

 			if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
 			then
 				echo "Calling social-media-extract-profile-link on ${curUrl}" >&2
 				mapfile -t outUrls < <("${scriptpath}/social-media-extract-profile-link" "${curUrl}" < <(:))
 				mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:))
 			else
 				echo "Calling website-extract-social-media on ${curUrl}" >&2
 				mapfile -t outUrls < <("${scriptpath}/website-extract-social-media" "${curUrl}" < <(:) | sed 's,^\(https\?://\(www\.\)\?\(.*\)\)$,\3 \1,' | sort | awk '{ print $2 }')
 				mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:))
 			fi

 			for outUrl in "${outUrls[@]}"