#!/usr/bin/env python3
# JustAnotherArchivist/little-things
import itertools
import os
import re
import sys


# If any command-line argument asks for help, print the usage text on stderr and exit with status 1.
if not {'--help', '-h', '-?', 'help'}.isdisjoint(sys.argv):
	usageLines = (
		'Usage: youtube-extract [massage|removenonyt]',
		'',
		"In 'massage' mode (default), extracts any references to YouTube videos, channels, and playlists from the URLs on stdin and prints them on stdout.",
		"Lines that don't seem to contain references to such YouTube things are printed on stderr.",
		'',
		"In 'removenonyt' mode, prints all URLs that look like they are or contain YouTube URLs on stdout.",
	)
	for usageLine in usageLines:
		print(usageLine, file = sys.stderr)
	sys.exit(1)


# The first command-line argument selects the mode; without one, 'massage' is used.
mode = 'massage' if len(sys.argv) < 2 else sys.argv[1]

# Recognised YouTube host names (optionally prefixed by www. or m.), matched case-insensitively.
# Only one slash is required in front so that the pattern still matches inside URLs whose slashes were collapsed.
domainPattern = re.compile(
	r'/(www\.|m\.)?'
	r'(youtube\.(com|de|fr|co\.uk|it|es|at|pt|gr|hu|ro|pl|dk|no|se|fi|ee|lt|lv|ru|by|cz|sk|si|rs|hr|ca)'
	r'|music\.youtube\.com'
	r'|(es|uk|pl|ru|it|jp|br)\.youtube\.com'
	r'|youtube-nocookie\.com)/',
	flags = re.IGNORECASE,
)


if mode == 'removenonyt':
	# Anything in here could never be as fast as grep, so just delegate to that...
	# The grep expression covers the recognised domains and youtu.be, each both literally and with the slashes percent-encoded as %2F.
	# Raw strings are used for the youtu.be parts so that '\.' is not treated as an (invalid) string escape sequence.
	# NOTE(review): domainPattern is compiled with re.IGNORECASE but grep is called without -i, so this mode matches case-sensitively; confirm whether that is intended.
	grepExpression = '|'.join([
		domainPattern.pattern,
		domainPattern.pattern.replace('/', '%2F'),
		r'/youtu\.be/',
		r'%2Fyoutu\.be%2F',
	])
	# execlp replaces this process with grep and does not return on success (it raises OSError on failure).
	os.execlp('grep', 'grep', '-P', grepExpression)
	sys.exit(0)
# Validate the mode explicitly; an assert would be stripped under 'python -O' and let unknown modes run as 'massage'.
if mode != 'massage':
	print(f'Error: unknown mode {mode!r}; expected massage or removenonyt', file = sys.stderr)
	sys.exit(1)


# Alternatives describing URLs that carry no video, channel, or playlist reference.
# Every alternative is anchored at the start of the line, which is expected to begin with '//'.
_noiseAlternatives = (
	# Static assets served under /s/
	r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
	r'^//www\.youtube\.com/s/gaming/emoji/',
	# Redirects: channel banner links, and video description links without a v parameter
	r'^//www\.youtube\.com/redirect\?event=channel_banner&',
	r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)',
	# Further fixed path prefixes
	r'^//www\.youtube\.com/yts/',
	r'^//www\.youtube\.com/img/',
	r'^//www\.youtube\.com/youtubei/',
	r'^//www\.youtube\.com/ads(/|$)',
	r'^//www\.youtube\.com/creators(/|$)',
	r'^//www\.youtube\.com/(player|iframe)_api(\?|$)',
	r'^//www\.youtube\.com/error(_204)?/?\?',
	r'^//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)',
	r'^//www\.youtube\.com/results/?(\?|$)',
	r'^//www\.youtube\.com/premium/?\?',
	r'^//www\.youtube\.com/new([/?]|$)',
	# The bare domain, with or without a query string
	r'^//www\.youtube\.com/?(\?|$)',
	# JS extraction stuff: an embed path followed by a (possibly percent-encoded) quote and a + or %3B
	r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)',
	r'^//www\.youtube\.com/service_ajax$',
	r'^//www\.youtube\.com/watch(\?v=)?$',
	r'^//consent\.(youtube|google)\.com/',
	# Miscellaneous crap: known endpoints with nothing after them
	r'^//www\.youtube\.com/(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect|(my_videos_)?upload)(%[23]F|/)?$',
)
noisePattern = '|'.join(_noiseAlternatives)

# Channel references: /c/NAME, /user/NAME, /channel/UC + 22 ID characters, or a single path segment directly after the domain at the end of the line.
_channelHost = r'/www\.youtube\.com/'
_channelTails = (
	r'c/[^/?&=.]+',
	r'user/[^/?&=.]+',
	r'channel/UC[0-9A-Za-z_-]{22}',
	r'[^/?&=.]+(?=/?$)',
)
channelPattern = '|'.join(_channelHost + tail for tail in _channelTails)

# Invariant: every alternative must END with the 11-character video ID, because the caller slices the ID off the end of the match (Python's re has no \K).
# Anything that has to be matched after the ID therefore goes into a lookahead assertion.
_videoAlternatives = (
	# Regular watch pages, with the ID as v parameter or as path segment
	r'/www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
	r'/www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
	# Embeds (but not the videoseries playlist embed)
	r'/www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
	r'/www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
	# youtu.be short links
	r'/youtu\.be/[0-9A-Za-z_-]{11}',
	# Old (Flash) embeds
	r'/www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
	# Redirects from links in video descriptions
	r'/www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
	# Tracking and other crap
	r'/www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
	r'/www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}',
	r'/www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
	# Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
	r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}',
)
videoPattern = '|'.join(_videoAlternatives)


def percentdecode(s):
	"""Return s with the percent-encoded forms of / : ? = & replaced by the literal characters.

	Only these five escapes are decoded; any other %XX sequence is left untouched.
	The hex digits are accepted in either case (e.g. both %2F and %2f), since percent-encoding is case-insensitive.
	"""
	for encoded, char in (('%2F', '/'), ('%3A', ':'), ('%3F', '?'), ('%3D', '='), ('%26', '&')):
		# None of the substituted characters can form a new %XX sequence, so the order of the replacements does not matter.
		s = s.replace(encoded, char).replace(encoded.lower(), char)
	return s


def _userUrl(match):
	# Canonical URL of the user whose name was captured in group 1
	return f'https://www.youtube.com/user/{match[1]}'


# Each entry is a mutable list [pattern, paramSearch, function]:
#   pattern      regex source string
#   paramSearch  if True, the line is additionally searched in its percentdecode()d form, so references embedded percent-encoded in the line are found as well
#   function     called with each match object; returns the string to print, or None to stop any further processing of the line
# The entries are tried in the order listed here.
matchers = [
	# Noise: nothing is printed and no later matcher gets to see the line
	[
		noisePattern,
		False,
		lambda match: None,
	],
	# Channels: keep everything after the domain, minus trailing slashes
	[
		channelPattern,
		True,
		lambda match: 'https://www.youtube.com/' + match[0].split('/', 2)[-1].rstrip('/'),
	],
	# Videos: the ID is the last 11 characters of the whole match
	[
		videoPattern,
		True,
		lambda match: 'https://www.youtube.com/watch?v=' + match[0][-11:],
	],
	# Playlists whose ID starts with UU are rewritten to the channel with the same ID suffix after UC
	[
		r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)',
		True,
		lambda match: 'https://www.youtube.com/channel/UC' + match[1],
	],
	# Playlists whose ID starts with PL, FL, RD, or OL
	[
		r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)',
		True,
		lambda match: 'https://www.youtube.com/playlist?list=' + match[1],
	],
	# Embeds with listType=user_uploads: the list parameter is the user name
	[
		r'/www\.youtube\.com/embed/?\?(?=(?:.*&)?listType=user_uploads(?:&|$))(?:.*&)?list=([^&]+)',
		True,
		_userUrl,
	],
	# /rss/user/NAME
	[
		r'/www\.youtube\.com/rss/user/([^/?]+)',
		True,
		_userUrl,
	],
	# User name passed as a parameter to subscription_center, subscribe_widget, or profile
	[
		r'/www\.youtube\.com/(?:subscription_center\?(?:.*&)?add_user=|subscribe_widget\?(?:.*&)?p=|profile\?(?:.*&)?user=)([^/=&]+)(?=(&|$))',
		True,
		_userUrl,
	],
	# /feeds/videos.xml with a channel_id parameter
	[
		r'/www\.youtube\.com/feeds/videos\.xml\?(?:.*&)?channel_id=(UC[0-9A-Za-z_-]+)',
		True,
		lambda match: 'https://www.youtube.com/channel/' + match[1],
	],
	# Playlist IDs given as 16 upper-case hex digits without prefix; PL is prepended
	[
		r'/www\.youtube\.com(?:/view_play_list\?(?:.*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|$))',
		True,
		lambda match: 'https://www.youtube.com/playlist?list=PL' + match[1],
	],
	# i.ytimg.com image URLs with the video ID in the path
	[
		r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/',
		False,
		lambda match: 'https://www.youtube.com/watch?v=' + match[1],
	],
]

# Swap each matcher's pattern source string for its compiled regex, in place
for position, (patternSource, _paramSearch, _function) in enumerate(matchers):
	matchers[position][0] = re.compile(patternSource)

# Main loop: read one URL per line from stdin, print every distinct result of each matcher on stdout,
# and echo lines on which no matcher matched at all to stderr.
for origLine in sys.stdin:
	origLine = origLine.strip()
	# Normalise the scheme to '//' and every recognised YouTube domain to www.youtube.com so the matcher patterns only need one spelling.
	line = re.sub(r'^https?://', '//', origLine)
	line = domainPattern.sub('/www.youtube.com/', line)
	# Decode once per line instead of once per matcher; the decoded form is only searched by matchers with paramSearch set.
	decodedLine = percentdecode(line)
	hadMatches = False
	stopLine = False
	for pattern, paramSearch, f in matchers:
		# Search the line itself and, if requested and actually different, its percent-decoded form.
		# When decoding changed nothing, the second search could only repeat the results of the first.
		candidates = [line]
		if paramSearch and decodedLine != line:
			candidates.append(decodedLine)
		results = set()
		for m in itertools.chain.from_iterable(pattern.finditer(candidate) for candidate in candidates):
			hadMatches = True
			r = f(m)
			if r is None:
				# The matcher asked to drop the line: no further matches or matchers are processed.
				stopLine = True
				break
			if r not in results:
				# Print each distinct result only once per matcher.
				results.add(r)
				print(r)
		if stopLine:
			break
	if not hadMatches:
		print(origLine, file = sys.stderr)