diff --git a/snscrape-normalise b/snscrape-normalise index 485aa19..e34b54b 100755 --- a/snscrape-normalise +++ b/snscrape-normalise @@ -3,16 +3,35 @@ errorUrls=() while read -r url do - if [[ "${url}" =~ ^https?://(www|m|[a-z][a-z]-[a-z][a-z]).facebook.com/[^/]+/?$ ]] + if [[ "${url}" =~ ^https?://(www|m|[a-z][a-z]-[a-z][a-z]).facebook.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|profile\.php\?id=[0-9]+(&|$)) ]] then - user="$(curl -s -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.${url#*.}" | grep -Po ']*(?<=\s)data-key\s*=\s*"tab_home".*?' | grep -Po ']*(?<=\s)href="/\K[^/]+')" + if [[ "${url}" == *profile.php* ]] + then + url="${url%%&*}" + else + url="${url%%\?*}" + fi + page="$(curl -sL -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept-Language: en-US,en;q=0.5' "https://www.${url#*.}")" + user="$(grep -Po ']*(?<=\s)data-key\s*=\s*"tab_home".*?' <<< "${page}" | grep -Po ']*(?<=\s)href="/\K[^/]+')" if [[ "${user}" ]] then echo "https://www.facebook.com/${user}/" + continue else - errorUrls+=("${url}") - echo "${url}" + if grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}" + then + # Profile page which is only visible when logged in + # Extract canonical URL + user="$(grep -Po '