diff --git a/social-media-normalise b/social-media-normalise deleted file mode 100755 index 5cce5d4..0000000 --- a/social-media-normalise +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -# Read a list of URLs from stdin, replace suitable social media URLs with correctly capitalised version -errorUrls=() -while read -r url -do - if [[ "${url}" == '* '* ]] - then - prefix="${url::2}" - url="${url:2}" - else - prefix="" - fi - - if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]] - then - if [[ "${url}" == *profile.php* ]] - then - url="${url%%&*}" - else - url="${url%%\?*}" - fi - page="$(curl -sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")" - user="$(grep -Po ']*(?<=\s)data-key\s*=\s*"tab_home".*?' <<< "${page}" | grep -Po ']*(?<=\s)href="/\K[^/]+')" - if [[ "${user}" ]] - then - echo "${prefix}https://www.facebook.com/${user}/" - continue - else - if grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}" - then - # Profile page which is only visible when logged in - # Extract canonical URL - user="$(grep -Po '&2 - echo "Failed to process URLs:" >&2 - for errorUrl in "${errorUrls[@]}" - do - echo "${errorUrl}" >&2 - done -fi diff --git a/url-normalise b/url-normalise new file mode 100755 index 0000000..294a2d8 --- /dev/null +++ b/url-normalise @@ -0,0 +1,133 @@ +#!/bin/bash +# Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows: +# - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed. +# - For YouTube user or channel URLs, the canonical base URL is extracted. +# - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.) + +otherCurlRedirectOpt='-L' +verbose= +while [[ $# -gt 0 ]] +do + if [[ "$1" == '--other-no-redirects' ]] + then + otherCurlRedirectOpt= + elif [[ "$1" == '--verbose' || "$1" == '-v' ]] + then + verbose=1 + else + echo "Unknown option: $1" >&2 + exit 1 + fi + shift +done + +function verbose_echo { + if [[ "${verbose}" ]] + then + echo "$@" + fi +} + +userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' + +while read -r line +do + if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]] + then + echo "${line}" + continue + fi + + if [[ "${line}" == '* '* ]] + then + prefix="${line::2}" + url="${line:2}" + else + prefix="" + url="${line}" + fi + + if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]] + then + verbose_echo "Normalising Facebook URL: ${url}" >&2 + if [[ "${url}" == *profile.php* ]] + then + url="${url%%&*}" + else + url="${url%%\?*}" + fi + page="$(curl -sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")" + user="$(grep -Po ']*(?<=\s)data-key\s*=\s*"tab_home".*?' <<< "${page}" | grep -Po ']*(?<=\s)href="/\K[^/]+')" + if [[ "${user}" ]] + then + echo "${prefix}https://www.facebook.com/${user}/" + continue + elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}" + then + # Profile page which is only visible when logged in + # Extract canonical URL + user="$(grep -Po '&2 + echo "${prefix}${url}" + elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]] + then + verbose_echo "Normalising Twitter URL: ${url}" >&2 + url="${url%%\?*}" + url="${url%/}" + unnormalisedUser="${url##*/}" + user="$(curl -sL -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '&2 + echo "${prefix}${url}" + fi + elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]] + then + verbose_echo "Normalising Instagram URL: ${url}" >&2 + user="${url%/}" + user="${user##*/}" + echo "${prefix}https://www.instagram.com/${user,,}/" + elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]] + then + verbose_echo "Normalising YouTube URL: ${url}" >&2 + if [[ "${url}" == *'?'* ]] + then + rurl="${url}&disable_polymer=1" + else + rurl="${url}?disable_polymer=1" + fi + page="$(curl -4sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")" + canonical="$(grep -Po '&2 + echo "${prefix}${url}" + fi + fi + else + verbose_echo "Normalising other URL: ${url}" >&2 + canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")" + if [[ "${canonical}" ]] + then + echo "${prefix}${canonical}" + else + echo "Failed to normalise other URL: ${url}" >&2 + echo "${prefix}${url}" + fi + fi +done diff --git a/youtube-normalise b/youtube-normalise deleted file mode 100755 index 50f19cf..0000000 --- a/youtube-normalise +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -while read -r url -do - if [[ "${url}" == '* '* ]] - then - prefix='* ' - url="${url:2}" - else - prefix='' - fi - - if [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]] - then - if [[ "${url}" == *'?'* ]] - then - rurl="${url}&disable_polymer=1" - else - rurl="${url}?disable_polymer=1" - fi - page="$(curl -4sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")" - canonical="$(grep -Po '