The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

134 lignes
4.1 KiB

#!/bin/bash
# Taking a list of URLs from stdin (optionally in new-viewer style wiki format, i.e. prefixed with '* '), every URL is normalised as follows:
# - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
# - For YouTube user or channel URLs, the canonical base URL is extracted.
# - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
# Lines that are not URLs are echoed back to stdout without being normalised.
# Use --verbose (or -v) to log each URL to stderr as it is processed.
  6. otherCurlRedirectOpt='-L'
  7. verbose=
  8. while [[ $# -gt 0 ]]
  9. do
  10. if [[ "$1" == '--other-no-redirects' ]]
  11. then
  12. otherCurlRedirectOpt=
  13. elif [[ "$1" == '--verbose' || "$1" == '-v' ]]
  14. then
  15. verbose=1
  16. else
  17. echo "Unknown option: $1" >&2
  18. exit 1
  19. fi
  20. shift
  21. done
  22. function verbose_echo {
  23. if [[ "${verbose}" ]]
  24. then
  25. echo "$@"
  26. fi
  27. }
  28. userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
  29. while read -r line
  30. do
  31. if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
  32. then
  33. echo "${line}"
  34. continue
  35. fi
  36. if [[ "${line}" == '* '* ]]
  37. then
  38. prefix="${line::2}"
  39. url="${line:2}"
  40. else
  41. prefix=""
  42. url="${line}"
  43. fi
  44. if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z]).)?facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
  45. then
  46. verbose_echo "Normalising Facebook URL: ${url}" >&2
  47. if [[ "${url}" == *profile.php* ]]
  48. then
  49. url="${url%%&*}"
  50. else
  51. url="${url%%\?*}"
  52. fi
  53. page="$(curl -sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
  54. user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
  55. if [[ "${user}" ]]
  56. then
  57. echo "${prefix}https://www.facebook.com/${user}/"
  58. continue
  59. elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
  60. then
  61. # Profile page which is only visible when logged in
  62. # Extract canonical URL
  63. user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
  64. if [[ "${user}" ]]
  65. then
  66. echo "${prefix}${user}"
  67. continue
  68. fi
  69. fi
  70. echo "Failed to normalise Facebook URL: ${url}" >&2
  71. echo "${prefix}${url}"
  72. elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
  73. then
  74. verbose_echo "Normalising Twitter URL: ${url}" >&2
  75. url="${url%%\?*}"
  76. url="${url%/}"
  77. unnormalisedUser="${url##*/}"
  78. user="$(curl -sL -A "${userAgent}" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
  79. if [[ "${user}" ]]
  80. then
  81. echo "${prefix}https://twitter.com/${user}"
  82. else
  83. echo "Failed to normalise Twitter URL: ${url}" >&2
  84. echo "${prefix}${url}"
  85. fi
  86. elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?$ ]]
  87. then
  88. verbose_echo "Normalising Instagram URL: ${url}" >&2
  89. user="${url%/}"
  90. user="${user##*/}"
  91. echo "${prefix}https://www.instagram.com/${user,,}/"
  92. elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
  93. then
  94. verbose_echo "Normalising YouTube URL: ${url}" >&2
  95. if [[ "${url}" == *'?'* ]]
  96. then
  97. rurl="${url}&disable_polymer=1"
  98. else
  99. rurl="${url}?disable_polymer=1"
  100. fi
  101. page="$(curl -4sL -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
  102. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
  103. if [[ "${canonical}" ]]
  104. then
  105. echo "${prefix}https://www.youtube.com/${canonical}"
  106. else
  107. canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
  108. if [[ "${canonical}" ]]
  109. then
  110. echo "${prefix}https://www.youtube.com/${canonical}"
  111. else
  112. echo "Failed to normalise YouTube URL: ${url}" >&2
  113. echo "${prefix}${url}"
  114. fi
  115. fi
  116. else
  117. verbose_echo "Normalising other URL: ${url}" >&2
  118. canonical="$(curl -sS ${otherCurlRedirectOpt} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
  119. if [[ "${canonical}" ]]
  120. then
  121. echo "${prefix}${canonical}"
  122. else
  123. echo "Failed to normalise other URL: ${url}" >&2
  124. echo "${prefix}${url}"
  125. fi
  126. fi
  127. done