Refactor

3 years ago · 0ee83bc0f2
--- a/+ 64
+++ b/+ 64
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
 import itertools
 import os
 import re
 import sys

@@ -13,19 +15,28 @@ if any(x in sys.argv for x in ['--help', '-h', '-?', 'help']):
 	sys.exit(1)


 # For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? instead.
 mode = sys.argv[1] if len(sys.argv) >= 2 else 'massage'

 # Channel/user URLs; the protocol and domain are stripped and replaced below.
 channelPattern = re.compile('|'.join([
 	r'//www\.youtube\.com/c/[^/?]+',
 	r'//www\.youtube\.com/user/[^/?]+',
 	r'//www\.youtube\.com/channel/UC[^/?]+',
 	r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)$)[^/?]+(?=/?$)',
  ]))

 # Mode dispatch: 'removenonyt' is a pure line filter and is handed to grep; everything else must be 'massage'.
 if mode == 'removenonyt':
 	# Anything in here could never be as fast as grep, so just delegate to that...
 	# os.execlp replaces this process with grep (-F: fixed-string matching, one -e per needle), so grep inherits stdin/stdout.
 	os.execlp('grep', 'grep', '-F', '-e', '/www.youtube.com/', '-e', '/youtu.be/', '-e', '%2Fwww.youtube.com%2F', '-e', '%2Fyoutu.be%2F')
 	# execlp does not return on success (it raises OSError on failure), so this exit is only a safeguard.
 	sys.exit(0)
 # NOTE(review): assert statements are skipped under `python -O`, so an unknown mode would then fall through silently.
 assert mode == 'massage'


 # For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? or (\\)? instead. But really, why would you have backslashes in URLs?

 # Regex alternation (as a plain string, not a compiled pattern) matching channel/user URLs on www.youtube.com.
 channelPattern = '|'.join([
 	# /c/<name>: one path segment, ending at the next '/', '?' or '&'
 	r'//www\.youtube\.com/c/[^/?&]+',
 	# /user/<name>
 	r'//www\.youtube\.com/user/[^/?&]+',
 	# /channel/UC<id>
 	r'//www\.youtube\.com/channel/UC[^/?&]+',
 	# A single remaining path segment at the end of the string (optionally followed by one '/').
 	# The negative lookahead rejects the listed reserved names when they are followed by %2F, %3F or the end of the string.
 	r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)(%[23]F|$))[^/?&]+(?=/?$)',
  ])

 # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
 # If necessary, use lookahead assertions to match further stuff after the video ID.
 videoPattern = re.compile('|'.join([
 videoPattern = '|'.join([
 	# Normal watch URL
 	r'//www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
 	r'//www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
@@ -44,9 +55,9 @@ videoPattern = re.compile('|'.join([
 	r'//www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
 	# Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
 	r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}',
  ]))
  ])

 noisePattern = re.compile('|'.join([
 noisePattern = '|'.join([
 	r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
 	r'^//www\.youtube\.com/s/gaming/emoji/',
 	r'^//www\.youtube\.com/redirect\?event=channel_banner&',
@@ -54,39 +65,51 @@ noisePattern = re.compile('|'.join([
 	r'^//www\.youtube\.com/yts/',
 	r'^//www\.youtube\.com/img/',
 	r'^//www\.youtube\.com/youtubei/',
 	r'^//www\.youtube\.com/ads/',
 	r'^//www\.youtube\.com/ads(/|$)',
 	r'^//www\.youtube\.com/creators(/|$)',
 	r'^//www\.youtube\.com/(player|iframe)_api\?',
 	r'^//www\.youtube\.com/error(_204)?/?\?',
 	r'^//www\.youtube\.com/(about|t|howyoutubeworks)[/?]',
 	r'^//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)',
 	r'^//www\.youtube\.com/results/?\?',
 	r'^//www\.youtube\.com/premium/?\?',
 	r'^//www\.youtube\.com/new([/?]|$)',
 	r'^//www\.youtube\.com/?(\?|$)',
 	r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff
 	r'^//www\.youtube\.com/service_ajax$',
 	r'^//www\.youtube\.com/watch(\?v=)?$',
 	r'^//consent\.(youtube|google)\.com/',
  ]))
  ])


 def percentdecode(s):
 	"""Return *s* with the percent-escapes of ``/``, ``:``, ``?``, ``=`` and ``&`` decoded.

 	Only the exact uppercase forms %2F, %3A, %3F, %3D and %26 are replaced; any other escape (including lowercase hex) is left as is.
 	"""
 	# Applied one after another, in this fixed order.
 	substitutions = (
 		('%2F', '/'),
 		('%3A', ':'),
 		('%3F', '?'),
 		('%3D', '='),
 		('%26', '&'),
 	)
 	decoded = s
 	for escaped, plain in substitutions:
 		decoded = decoded.replace(escaped, plain)
 	return decoded


 matchers = [
 	# (pattern, paramSearch, function(match) -> output str or None); returning None stops further processing
 	# (pattern, paramSearch, function(match: list[str]) -> output str or None); returning None stops further processing of a line
 	# If paramSearch is True, a corresponding pattern with [/:?=&] replaced by their percent encodings is generated; the reverse replacement is done again automatically before calling the function.
 	[noisePattern, False, lambda m: None],
 	[channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1]],
 	[videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
 	[re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=UU[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/UC{m[0].rsplit("=", 1)[1][2:]}'],
 	[re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=(PL|FL|RD|OL)[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/playlist?list={m[0].rsplit("=", 1)[1]}'],
 	[re.compile(r'//www\.youtube\.com/embed/?\?(?=(.*&)?listType=user_uploads(&|$))(.*&)?list=[^&]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("=", 1)[1]}'],
 	[re.compile(r'//www\.youtube\.com/rss/user/[^/]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("/", 1)[1]}'],
 	[re.compile(r'//www\.youtube\.com/feeds/videos\.xml\?(.*&)?channel_id=UC[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/{m[0].rsplit("=", 1)[1]}'],
 	[re.compile(r'//www\.youtube\.com(/view_play_list\?p=|/playlist\?(.*&)?list=)[0-9A-F]{16}(?=(&|$))'), True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[0].rsplit("=", 1)[1]}'],
 	[re.compile(r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/'), False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
 	[noisePattern, False, lambda m: None],
 	[r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'],
 	[r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],
 	[r'//www\.youtube\.com/embed/?\?(?=(?:.*&)?listType=user_uploads(?:&|$))(?:.*&)?list=([^&]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
 	[r'//www\.youtube\.com/rss/user/([^/?]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
 	[r'//www\.youtube\.com/subscription_center\?(?:.*&)?add_user=([^/=&]+)(?=(&|$))', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
 	[r'//www\.youtube\.com/feeds/videos\.xml\?(?:.*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'],
 	[r'//www\.youtube\.com(?:/view_play_list\?(?:.*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'],
 	[r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/', False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
 ]

 # Compile second pattern for parameters if needed
 # Compile pattern and generate one for parameters if desired
 for e in matchers:
 	pattern, paramSearch, f = e
 	e[0] = re.compile(pattern)
 	if paramSearch:
 		p2 = pattern.pattern.replace('//', '/{1,2}').replace('/', '(/|%2F)').replace(':', '(:|%3A)').replace(r'\?', r'(\?|%3F)')
 		p2 = pattern.replace('//', '/{1,2}').replace('/', '(/|%2F)').replace(r'\?', r'(\?|%3F)')
 		p2 = re.sub(r'(?<!\(\?):', '(:|%3A)', p2)
 		p2 = re.sub(r'(?<!\(\?)=', '(=|%3D)', p2)
 		e[1] = re.compile(p2.replace('&', '(&|%26)'), pattern.flags)
 		e[1] = re.compile(p2.replace('&', '(&|%26)'))
 	else:
 		e[1] = None

@@ -97,28 +120,19 @@ for origLine in sys.stdin:
 	origLine = origLine.strip()
 	line = re.sub(r'^https?://', '//', origLine)
 	line = domainPattern.sub('/www.youtube.com/', line)
 	if sys.argv[1] == 'massage':
 		hadMatches = False
 		for pattern1, pattern2, f in matchers:
 			patterns = [pattern1]
 			if pattern2:
 				patterns.append(pattern2)
 			results = set()
 			for pattern in patterns:
 				m = pattern.search(line)
 				if m:
 					hadMatches = True
 					r = f(m)
 					if r in results:
 						continue
 					results.add(r)
 					if r is None:
 						break
 					print(r)
 			if None in results:
 	hadMatches = False
 	for pattern1, pattern2, f in matchers:
 		results = set()
 		for m, encoded in itertools.chain(((x, False) for x in pattern1.finditer(line)), ((x, True) for x in pattern2.finditer(line)) if pattern2 else ()):
 			hadMatches = True
 			r = f(m if not encoded else [percentdecode(x) if x else x for x in itertools.chain((m[0],), m.groups())])
 			if r in results:
 				continue
 			results.add(r)
 			if r is None:
 				break
 		if not hadMatches:
 			print(origLine, file = sys.stderr)
 	elif sys.argv[1] == 'removenonyt':
 		if any(x in line for x in ('/www.youtube.com/', '/youtu.be/', '%2Fwww.youtube.com%2F', '%2Fyoutu.be%2F')):
 			print(origLine)
 			print(r)
 		if None in results:
 			break
 	if not hadMatches:
 		print(origLine, file = sys.stderr)