Add script for scraping MEP links from europarl.europa.eu

5 years ago · baa8a566bd
--- a/+ 59
+++ b/+ 59
@@ -0,0 +1,59 @@
 #!/bin/bash
 # Collect all websites and social media for MEPs based on https://www.europarl.europa.eu/meps/en/full-list/all
 # Writes to several file descriptors:
 # - Info about what it's doing to stderr
 # - Extracted URLs to FD 3
 # - Warnings about EP Newshub links to FD 4

 # https://unix.stackexchange.com/a/206848
 if ! { >&3; } 2>/dev/null
 then
 	echo "Error: FD 3 not open" >&1
 	exit 1
 fi
 if ! { >&4; } 2>/dev/null
 then
 	echo "Error: FD 4 not open" >&1
 	exit 1
 fi

 scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
 export PATH="${scriptpath}:${PATH}"
 echo "Fetching MEP list" >&1
 curl-archivebot-ua -s "https://www.europarl.europa.eu/meps/en/full-list/all" | \
 	grep -Po '<a class="ep_content" href="\K/meps/en/\d+(?=")' | \
 	while read -r profileUrl
 	do
 		profileUrl="https://www.europarl.europa.eu${profileUrl}"
 		echo "Fetching ${profileUrl}" >&1
 		profilePage="$(curl-archivebot-ua -sL "${profileUrl}")"
 		mapfile -t urls < <(tr -d '\r\n' <<< "${profilePage}" | \
 			grep -Po '<div class="ep-a_share ep-layout_socialnetwok">.*?</ul>' | \
 			grep -Po '<a\s+([^>]*\s+)?href="\K(?!mailto:)[^"]+')

 		# Classification
 		for url in "${urls[@]}"
 		do
 			if [[ "${url}" =~ //((www|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/ ]]
 			then
 				echo "Facebook: ${url}"
 			elif [[ "${url}" =~ //(www\.)?instagram\.com/ ]]
 			then
 				echo "Instagram: ${url}"
 			elif [[ "${url}" =~ //(www\.)?twitter\.com/ ]]
 			then
 				echo "Twitter: ${url}"
 			elif [[ "${url}" =~ //([^/]+\.)?youtube\.com/ || "${url}" =~ //youtu\.be/ ]]
 			then
 				echo "YouTube: ${url}"
 			else
 				echo "Other: ${url}"
 			fi
 		done >&3

 		# Check if there's a newshub mention and print a warning about that if necessary
 		if grep -q 'container_header_newshub' <<< "${profilePage}"
 		then
 			echo "Has EP Newshub link: ${profileUrl}" >&4
 		fi
 	done