diff --git a/social-media-normalise b/social-media-normalise
deleted file mode 100755
index 5cce5d4..0000000
--- a/social-media-normalise
+++ /dev/null
@@ -1,74 +0,0 @@
-#!/bin/bash
-# Read a list of URLs from stdin, replace suitable social media URLs with correctly capitalised version
-errorUrls=()
-while read -r url
-do
- if [[ "${url}" == '* '* ]]
- then
- prefix="${url::2}"
- url="${url:2}"
- else
- prefix=""
- fi
-
- if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
- then
- if [[ "${url}" == *profile.php* ]]
- then
- url="${url%%&*}"
- else
- url="${url%%\?*}"
- fi
- page="$(curl -sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
- user="$(grep -Po '
]*(?<=\s)data-key\s*=\s*"tab_home".*?
' <<< "${page}" | grep -Po ']*(?<=\s)href="/\K[^/]+')"
- if [[ "${user}" ]]
- then
- echo "${prefix}https://www.facebook.com/${user}/"
- continue
- else
- if grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
- then
- # Profile page which is only visible when logged in
- # Extract canonical URL
- user="$(grep -Po '&2
- echo "Failed to process URLs:" >&2
- for errorUrl in "${errorUrls[@]}"
- do
- echo "${errorUrl}" >&2
- done
-fi
diff --git a/url-normalise b/url-normalise
new file mode 100755
index 0000000..294a2d8
--- /dev/null
+++ b/url-normalise
@@ -0,0 +1,133 @@
+#!/bin/bash
+# Reads a list of URLs from stdin (optionally in new-viewer style wiki format) and normalises every URL as follows:
+# - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
+# - For YouTube user or channel URLs, the canonical base URL is extracted.
+# - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
+
+# Command-line option parsing.
+# otherCurlRedirectOpt: curl flag used when retrieving "other" URLs; '-L' by default, emptied by --other-no-redirects.
+# verbose: empty by default, set to 1 by --verbose or -v.
+# Any other argument prints an error to stderr and exits with status 1.
+otherCurlRedirectOpt='-L'
+verbose=
+while [[ $# -gt 0 ]]
+do
+ if [[ "$1" == '--other-no-redirects' ]]
+ then
+ otherCurlRedirectOpt=
+ elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
+ then
+ verbose=1
+ else
+ echo "Unknown option: $1" >&2
+ exit 1
+ fi
+ shift
+done
+
+# Echo all arguments, but only when the global 'verbose' is non-empty; otherwise do nothing.
+# The output goes to stdout; the call sites in this script redirect it to stderr.
+function verbose_echo {
+ if [[ "${verbose}" ]]
+ then
+ echo "$@"
+ fi
+}
+
+# User-Agent string passed to curl via -A for the requests below.
+userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
+
+# Process stdin line by line. Each URL is matched against the Facebook, Twitter, Instagram and YouTube patterns in turn; anything else is handled as an "other" URL.
+while read -r line
+do
+ # Lines that are not http(s) URLs (optionally preceded by '* ') are echoed unchanged.
+ if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
+ then
+ echo "${line}"
+ continue
+ fi
+
+ # Split off the '* ' list-item prefix, if present, so it can be re-attached to the output.
+ if [[ "${line}" == '* '* ]]
+ then
+ prefix="${line::2}"
+ url="${line:2}"
+ else
+ prefix=""
+ url="${line}"
+ fi
+
+ # Facebook: page, pages/, pg/ and profile.php URLs on facebook.com, optionally with a www, m or xx-xx subdomain.
+ # NOTE(review): the dots in '.)?facebook.com' are unescaped, so each matches any character; the Twitter, Instagram and YouTube patterns escape theirs. Confirm whether '\.' was intended.
+ if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
+ then
+ verbose_echo "Normalising Facebook URL: ${url}" >&2
+ # Strip extra parameters: profile.php URLs keep everything before the first '&'; all other URLs drop the whole query string.
+ if [[ "${url}" == *profile.php* ]]
+ then
+ url="${url%%&*}"
+ else
+ url="${url%%\?*}"
+ fi
+ # The page is always requested from www.facebook.com, whichever host the input URL used.
+ page="$(curl -sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
+ user="$(grep -Po ']*(?<=\s)data-key\s*=\s*"tab_home".*?
' <<< "${page}" | grep -Po ']*(?<=\s)href="/\K[^/]+')"
+ if [[ "${user}" ]]
+ then
+ echo "${prefix}https://www.facebook.com/${user}/"
+ continue
+ elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
+ then
+ # Profile page which is only visible when logged in
+ # Extract canonical URL
+ user="$(grep -Po '&2
+ echo "${prefix}${url}"
+ elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
+ then
+ verbose_echo "Normalising Twitter URL: ${url}" >&2
+ url="${url%%\?*}"
+ url="${url%/}"
+ unnormalisedUser="${url##*/}"
+ user="$(curl -sL -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '&2
+ echo "${prefix}${url}"
+ fi
+ elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
+ then
+ # Instagram: no request is made; the last path segment is lowercased and emitted on www.instagram.com.
+ verbose_echo "Normalising Instagram URL: ${url}" >&2
+ user="${url%/}"
+ user="${user##*/}"
+ echo "${prefix}https://www.instagram.com/${user,,}/"
+ elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
+ then
+ verbose_echo "Normalising YouTube URL: ${url}" >&2
+ # Append disable_polymer=1 to the request URL, joined with '&' or '?' depending on whether a query string is already present.
+ if [[ "${url}" == *'?'* ]]
+ then
+ rurl="${url}&disable_polymer=1"
+ else
+ rurl="${url}?disable_polymer=1"
+ fi
+ page="$(curl -4sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
+ canonical="$(grep -Po '&2
+ echo "${prefix}${url}"
+ fi
+ fi
+ else
+ verbose_echo "Normalising other URL: ${url}" >&2
+ canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
+ if [[ "${canonical}" ]]
+ then
+ echo "${prefix}${canonical}"
+ else
+ echo "Failed to normalise other URL: ${url}" >&2
+ echo "${prefix}${url}"
+ fi
+ fi
+done
diff --git a/youtube-normalise b/youtube-normalise
deleted file mode 100755
index 50f19cf..0000000
--- a/youtube-normalise
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-while read -r url
-do
- if [[ "${url}" == '* '* ]]
- then
- prefix='* '
- url="${url:2}"
- else
- prefix=''
- fi
-
- if [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
- then
- if [[ "${url}" == *'?'* ]]
- then
- rurl="${url}&disable_polymer=1"
- else
- rurl="${url}?disable_polymer=1"
- fi
- page="$(curl -4sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
- canonical="$(grep -Po '