ソースを参照

One URL normalisation script to rule them all

Consolidate social media profile, YouTube, and (new) generic web page URL normalisation into one script
master
JustAnotherArchivist 4年前
コミット
dc4efcfbfb
3個のファイルの変更133行の追加111行の削除
  1. +0
    -74
      social-media-normalise
  2. +133
    -0
      url-normalise
  3. +0
    -37
      youtube-normalise

+ 0
- 74
social-media-normalise ファイルの表示

@@ -1,74 +0,0 @@
#!/bin/bash
# Read a list of URLs from stdin, replace suitable social media URLs with the
# correctly capitalised version. Lines may carry a '* ' wiki-list prefix,
# which is preserved on output. URLs that could not be normalised are echoed
# unchanged and listed on stderr at the end.
errorUrls=()
# IFS= keeps surrounding whitespace intact; the '|| [[ -n ... ]]' clause also
# processes a final line that lacks a trailing newline.
while IFS= read -r url || [[ -n "${url}" ]]
do
  # Split off a '* ' wiki-list prefix; it is re-attached on output.
  if [[ "${url}" == '* '* ]]
  then
    prefix="${url::2}"
    url="${url:2}"
  else
    prefix=""
  fi

  # Facebook: vanity pages, /pages/<name>/<id>, /pg/<name>, and
  # profile.php?id=<number>. Dots are escaped so only real facebook.com
  # hosts match (a bare '.' in the regex matches any character).
  if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
  then
    # profile.php needs its id parameter kept; everything else drops the query.
    if [[ "${url}" == *profile.php* ]]
    then
      url="${url%%&*}"
    else
      url="${url%%\?*}"
    fi
    page="$(curl -sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
    # The home tab link carries the canonically capitalised page name.
    user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
    if [[ "${user}" ]]
    then
      echo "${prefix}https://www.facebook.com/${user}/"
      continue
    else
      if grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
      then
        # Profile page which is only visible when logged in
        # Extract canonical URL
        user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
        if [[ "${user}" ]]
        then
          echo "${prefix}${user}"
          continue
        fi
      fi
    fi
    errorUrls+=("${url}")
    echo "${prefix}${url}"
  elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
  then
    url="${url%%\?*}"
    url="${url%/}"
    unnormalisedUser="${url##*/}"
    # The profile header links to the canonically capitalised screen name.
    user="$(curl -sL -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
    if [[ "${user}" ]]
    then
      echo "${prefix}https://twitter.com/${user}"
    else
      errorUrls+=("${url}")
      echo "${prefix}${url}"
    fi
  elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
  then
    # Instagram usernames are canonically lowercase; no network request needed.
    user="${url%/}"
    user="${user##*/}"
    echo "${prefix}https://www.instagram.com/${user,,}/"
  else
    echo "${prefix}${url}"
  fi
done

if [[ ${#errorUrls[@]} -gt 0 ]]
then
  echo "" >&2
  echo "Failed to process URLs:" >&2
  for errorUrl in "${errorUrls[@]}"
  do
    echo "${errorUrl}" >&2
  done
fi

+ 133
- 0
url-normalise ファイルの表示

@@ -0,0 +1,133 @@
#!/bin/bash
# Taking a list of URLs from stdin (optionally in new-viewer style wiki format), every URL is normalised as follows:
# - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
# - For YouTube user or channel URLs, the canonical base URL is extracted.
# - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)

# curl options for the generic branch; an array so the empty (no-redirect)
# case expands to zero words instead of one empty argument.
otherCurlOpts=(-L)
verbose=
while [[ $# -gt 0 ]]
do
  if [[ "$1" == '--other-no-redirects' ]]
  then
    otherCurlOpts=()
  elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
  then
    verbose=1
  else
    echo "Unknown option: $1" >&2
    exit 1
  fi
  shift
done

# Print a progress message (callers redirect to stderr) when --verbose is set.
function verbose_echo {
  if [[ "${verbose}" ]]
  then
    echo "$@"
  fi
}

userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

# IFS= keeps surrounding whitespace intact; the '|| [[ -n ... ]]' clause also
# processes a final line that lacks a trailing newline.
while IFS= read -r line || [[ -n "${line}" ]]
do
  # Pass non-URL lines (e.g. wiki headings) through untouched.
  if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
  then
    echo "${line}"
    continue
  fi

  # Split off a '* ' wiki-list prefix; it is re-attached on output.
  if [[ "${line}" == '* '* ]]
  then
    prefix="${line::2}"
    url="${line:2}"
  else
    prefix=""
    url="${line}"
  fi

  # Facebook: vanity pages, /pages/<name>/<id>, /pg/<name>, and
  # profile.php?id=<number>. Dots are escaped so only real facebook.com
  # hosts match (a bare '.' in the regex matches any character).
  if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
  then
    verbose_echo "Normalising Facebook URL: ${url}" >&2
    # profile.php needs its id parameter kept; everything else drops the query.
    if [[ "${url}" == *profile.php* ]]
    then
      url="${url%%&*}"
    else
      url="${url%%\?*}"
    fi
    page="$(curl -sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
    # The home tab link carries the canonically capitalised page name.
    user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
    if [[ "${user}" ]]
    then
      echo "${prefix}https://www.facebook.com/${user}/"
      continue
    elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
    then
      # Profile page which is only visible when logged in
      # Extract canonical URL
      user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
      if [[ "${user}" ]]
      then
        echo "${prefix}${user}"
        continue
      fi
    fi
    echo "Failed to normalise Facebook URL: ${url}" >&2
    echo "${prefix}${url}"
  elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
  then
    verbose_echo "Normalising Twitter URL: ${url}" >&2
    url="${url%%\?*}"
    url="${url%/}"
    unnormalisedUser="${url##*/}"
    # The profile header links to the canonically capitalised screen name.
    user="$(curl -sL -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
    if [[ "${user}" ]]
    then
      echo "${prefix}https://twitter.com/${user}"
    else
      echo "Failed to normalise Twitter URL: ${url}" >&2
      echo "${prefix}${url}"
    fi
  elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
  then
    verbose_echo "Normalising Instagram URL: ${url}" >&2
    # Instagram usernames are canonically lowercase; no network request needed.
    user="${url%/}"
    user="${user##*/}"
    echo "${prefix}https://www.instagram.com/${user,,}/"
  elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
  then
    verbose_echo "Normalising YouTube URL: ${url}" >&2
    # Force the legacy page layout, which still contains the canonical link tags.
    if [[ "${url}" == *'?'* ]]
    then
      rurl="${url}&disable_polymer=1"
    else
      rurl="${url}?disable_polymer=1"
    fi
    page="$(curl -4sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
    canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
    if [[ "${canonical}" ]]
    then
      echo "${prefix}https://www.youtube.com/${canonical}"
    else
      canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
      if [[ "${canonical}" ]]
      then
        echo "${prefix}https://www.youtube.com/${canonical}"
      else
        echo "Failed to normalise YouTube URL: ${url}" >&2
        echo "${prefix}${url}"
      fi
    fi
  else
    verbose_echo "Normalising other URL: ${url}" >&2
    # Follow redirects (unless disabled) and report the effective final URL.
    canonical="$(curl -sS "${otherCurlOpts[@]}" --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
    if [[ "${canonical}" ]]
    then
      echo "${prefix}${canonical}"
    else
      echo "Failed to normalise other URL: ${url}" >&2
      echo "${prefix}${url}"
    fi
  fi
done

+ 0
- 37
youtube-normalise ファイルの表示

@@ -1,37 +0,0 @@
#!/bin/bash
# Normalise YouTube URLs read from stdin: for each youtube.com URL, fetch the
# page (with the old non-Polymer layout forced) and print the canonical
# /user/... or /channel/... URL; all other lines pass through unchanged.
# A leading '* ' wiki-list marker is preserved on output.
while read -r url
do
  prefix=''
  if [[ "${url}" == '* '* ]]
  then
    prefix="${url::2}"
    url="${url:2}"
  fi

  if [[ ! "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
  then
    # Not a YouTube URL; emit it verbatim.
    echo "${prefix}${url}"
    continue
  fi

  # Force the legacy page layout, which still contains the canonical link tags.
  case "${url}" in
    *\?*) requestUrl="${url}&disable_polymer=1" ;;
    *)    requestUrl="${url}?disable_polymer=1" ;;
  esac
  page="$(curl -4sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "${requestUrl}")"

  # Prefer a /user/... canonical link; fall back to /channel/....
  canonicalPath="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
  if [[ -z "${canonicalPath}" ]]
  then
    canonicalPath="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
  fi

  if [[ "${canonicalPath}" ]]
  then
    echo "${prefix}https://www.youtube.com/${canonicalPath}"
  else
    # No canonical link found; emit the original URL.
    echo "${prefix}${url}"
  fi
done

読み込み中…
キャンセル
保存