#!/bin/bash
# Collect all websites and social media for MEPs based on
# https://www.europarl.europa.eu/meps/en/full-list/all
#
# Usage: <this-script> 3>urls.txt 4>newshub-warnings.txt
#
# Writes to several file descriptors:
#  - Info about what it's doing, and error messages, to stderr
#  - Extracted URLs to FD 3, one per line, prefixed with a category
#    ("Facebook: ", "Instagram: ", "Twitter: ", "YouTube: " or "Other: ")
#  - Warnings about EP Newshub links to FD 4
#
# Fetching is done through the `curl-archivebot-ua` command; the directory
# containing this script is prepended to PATH before it is called.

# FD 3 and FD 4 must already be open when the script starts, otherwise the
# results would have nowhere to go. Probe from
# https://unix.stackexchange.com/a/206848
for fd in 3 4
do
  if ! { >&"${fd}"; } 2>/dev/null
  then
    echo "Error: FD ${fd} not open" >&2
    exit 1
  fi
done

# Print "<Category>: <url>" for one URL, picking the category from the host
# part of the URL.
# Arguments: $1 - URL extracted from an MEP profile page
# Outputs:   a single line on stdout
classify_url() {
  local url="$1"

  if [[ "${url}" =~ //((www|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/ ]]
  then
    # Bare, www. and locale-style (xx-yy.) Facebook hosts
    echo "Facebook: ${url}"
  elif [[ "${url}" =~ //(www\.)?instagram\.com/ ]]
  then
    echo "Instagram: ${url}"
  elif [[ "${url}" =~ //(www\.)?twitter\.com/ ]]
  then
    echo "Twitter: ${url}"
  elif [[ "${url}" =~ //([^/]+\.)?youtube\.com/ || "${url}" =~ //youtu\.be/ ]]
  then
    # Any youtube.com subdomain, plus the youtu.be short links
    echo "YouTube: ${url}"
  else
    echo "Other: ${url}"
  fi
}

# Make commands that sit next to this script (curl-archivebot-ua) callable.
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
export PATH="${scriptpath}:${PATH}"

# Progress messages go to stderr so they can never mix with the data on
# FD 3 / FD 4, even when the caller points one of those at stdout.
echo "Fetching MEP list" >&2
curl-archivebot-ua -s "https://www.europarl.europa.eu/meps/en/full-list/all" | \
  grep -Po '<a class="ep_content" href="\K/meps/en/\d+(?=")' | \
  while IFS= read -r profileUrl
  do
    # The list page yields site-relative paths (/meps/en/<id>).
    profileUrl="https://www.europarl.europa.eu${profileUrl}"
    echo "Fetching ${profileUrl}" >&2
    profilePage="$(curl-archivebot-ua -sL "${profileUrl}")"

    # grep works line by line, so the page is flattened to a single line
    # first; the non-greedy match can then cover the whole share block up to
    # its closing </ul>. The last grep keeps only the href values of the
    # links in that block and skips mailto: links.
    # NOTE(review): "socialnetwok" presumably mirrors the class name used by
    # the site's markup - confirm against a live page before "correcting" it.
    mapfile -t urls < <(tr -d '\r\n' <<< "${profilePage}" | \
      grep -Po '<div class="ep-a_share ep-layout_socialnetwok">.*?</ul>' | \
      grep -Po '<a\s+([^>]*\s+)?href="\K(?!mailto:)[^"]+')

    # Classification
    for url in "${urls[@]}"
    do
      classify_url "${url}"
    done >&3

    # Check if there's a newshub mention and print a warning about that if necessary
    if grep -q 'container_header_newshub' <<< "${profilePage}"
    then
      echo "Has EP Newshub link: ${profileUrl}" >&4
    fi
  done