diff --git a/youtube-extract b/youtube-extract index 01028a2..e96f2da 100755 --- a/youtube-extract +++ b/youtube-extract @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +import itertools +import os import re import sys @@ -13,19 +15,28 @@ if any(x in sys.argv for x in ['--help', '-h', '-?', 'help']): sys.exit(1) -# For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? instead. +mode = sys.argv[1] if len(sys.argv) >= 2 else 'massage' -# Channel/user URLs; the protocol and domain are stripped and replaced below. -channelPattern = re.compile('|'.join([ - r'//www\.youtube\.com/c/[^/?]+', - r'//www\.youtube\.com/user/[^/?]+', - r'//www\.youtube\.com/channel/UC[^/?]+', - r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)$)[^/?]+(?=/?$)', - ])) + +if mode == 'removenonyt': + # Anything in here could never be as fast as grep, so just delegate to that... + os.execlp('grep', 'grep', '-F', '-e', '/www.youtube.com/', '-e', '/youtu.be/', '-e', '%2Fwww.youtube.com%2F', '-e', '%2Fyoutu.be%2F') + sys.exit(0) +assert mode == 'massage' + + +# For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? or (\\)? instead. But really, why would you have backslashes in URLs? + +channelPattern = '|'.join([ + r'//www\.youtube\.com/c/[^/?&]+', + r'//www\.youtube\.com/user/[^/?&]+', + r'//www\.youtube\.com/channel/UC[^/?&]+', + r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)(%[23]F|$))[^/?&]+(?=/?$)', + ]) # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K). # If necessary, use lookahead assertions to match further stuff after the video ID. -videoPattern = re.compile('|'.join([ +videoPattern = '|'.join([ # Normal watch URL r'//www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}', r'//www\.youtube\.com/watch/[0-9A-Za-z_-]{11}', @@ -44,9 +55,9 @@ videoPattern = re.compile('|'.join([ r'//www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}', # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}', - ])) + ]) -noisePattern = re.compile('|'.join([ +noisePattern = '|'.join([ r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/', r'^//www\.youtube\.com/s/gaming/emoji/', r'^//www\.youtube\.com/redirect\?event=channel_banner&', @@ -54,39 +65,51 @@ noisePattern = re.compile('|'.join([ r'^//www\.youtube\.com/yts/', r'^//www\.youtube\.com/img/', r'^//www\.youtube\.com/youtubei/', - r'^//www\.youtube\.com/ads/', + r'^//www\.youtube\.com/ads(/|$)', + r'^//www\.youtube\.com/creators(/|$)', r'^//www\.youtube\.com/(player|iframe)_api\?', r'^//www\.youtube\.com/error(_204)?/?\?', - r'^//www\.youtube\.com/(about|t|howyoutubeworks)[/?]', + r'^//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)', r'^//www\.youtube\.com/results/?\?', r'^//www\.youtube\.com/premium/?\?', + r'^//www\.youtube\.com/new([/?]|$)', r'^//www\.youtube\.com/?(\?|$)', r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff + r'^//www\.youtube\.com/service_ajax$', + r'^//www\.youtube\.com/watch(\?v=)?$', r'^//consent\.(youtube|google)\.com/', - ])) + ]) + + +def percentdecode(s): + return s.replace('%2F', '/').replace('%3A', ':').replace('%3F', '?').replace('%3D', '=').replace('%26', '&') matchers = [ - # (pattern, paramSearch, function(match) -> output str or None); returning None stops further processing + # (pattern, paramSearch, function(match: list[str]) -> output str or None); returning None stops further processing of a line + # If paramSearch is True, a corresponding pattern with [/:?=&] replaced by their percent encodings is generated; the reverse replacement is done again automatically before calling the function. + [noisePattern, False, lambda m: None], [channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1]], [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'], - [re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=UU[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/UC{m[0].rsplit("=", 1)[1][2:]}'], - [re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=(PL|FL|RD|OL)[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/playlist?list={m[0].rsplit("=", 1)[1]}'], - [re.compile(r'//www\.youtube\.com/embed/?\?(?=(.*&)?listType=user_uploads(&|$))(.*&)?list=[^&]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("=", 1)[1]}'], - [re.compile(r'//www\.youtube\.com/rss/user/[^/]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("/", 1)[1]}'], - [re.compile(r'//www\.youtube\.com/feeds/videos\.xml\?(.*&)?channel_id=UC[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/{m[0].rsplit("=", 1)[1]}'], - [re.compile(r'//www\.youtube\.com(/view_play_list\?p=|/playlist\?(.*&)?list=)[0-9A-F]{16}(?=(&|$))'), True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[0].rsplit("=", 1)[1]}'], - [re.compile(r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/'), False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'], - [noisePattern, False, lambda m: None], + [r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'], + [r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'], + [r'//www\.youtube\.com/embed/?\?(?=(?:.*&)?listType=user_uploads(?:&|$))(?:.*&)?list=([^&]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'], + [r'//www\.youtube\.com/rss/user/([^/?]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'], + [r'//www\.youtube\.com/subscription_center\?(?:.*&)?add_user=([^/=&]+)(?=(&|$))', True, lambda m: f'https://www.youtube.com/user/{m[1]}'], + [r'//www\.youtube\.com/feeds/videos\.xml\?(?:.*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'], + [r'//www\.youtube\.com(?:/view_play_list\?(?:.*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'], + [r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/', False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'], ] -# Compile second pattern for parameters if needed +# Compile pattern and generate one for parameters if desired for e in matchers: pattern, paramSearch, f = e + e[0] = re.compile(pattern) if paramSearch: - p2 = pattern.pattern.replace('//', '/{1,2}').replace('/', '(/|%2F)').replace(':', '(:|%3A)').replace(r'\?', r'(\?|%3F)') + p2 = pattern.replace('//', '/{1,2}').replace('/', '(/|%2F)').replace(r'\?', r'(\?|%3F)') + p2 = re.sub(r'(?