From bbf2d2c3159b1fbbb4c5fee885c57e6c0512c4d1 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 26 Nov 2020 04:42:35 +0000 Subject: [PATCH] Be more lenient regarding slashes to catch things with collapsed URLs in paths etc. --- youtube-extract | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/youtube-extract b/youtube-extract index fbe3dd8..e8d3445 100755 --- a/youtube-extract +++ b/youtube-extract @@ -53,31 +53,31 @@ noisePattern = '|'.join([ ]) channelPattern = '|'.join([ - r'//www\.youtube\.com/c/[^/?&=.]+', - r'//www\.youtube\.com/user/[^/?&=.]+', - r'//www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}', - r'//www\.youtube\.com/[^/?&=.]+(?=/?$)', + r'/www\.youtube\.com/c/[^/?&=.]+', + r'/www\.youtube\.com/user/[^/?&=.]+', + r'/www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}', + r'/www\.youtube\.com/[^/?&=.]+(?=/?$)', ]) # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K). # If necessary, use lookahead assertions to match further stuff after the video ID. videoPattern = '|'.join([ # Normal watch URL - r'//www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}', - r'//www\.youtube\.com/watch/[0-9A-Za-z_-]{11}', + r'/www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}', + r'/www\.youtube\.com/watch/[0-9A-Za-z_-]{11}', # Embeds - r'//www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}', - r'//www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}', + r'/www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}', + r'/www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}', # Shortener - r'//youtu\.be/[0-9A-Za-z_-]{11}', + r'/youtu\.be/[0-9A-Za-z_-]{11}', # Old (Flash) embeds - r'//www\.youtube\.com/v/[0-9A-Za-z_-]{11}', + r'/www\.youtube\.com/v/[0-9A-Za-z_-]{11}', # Redirects from links in video descriptions - r'//www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)', + r'/www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)', # Tracking and other crap - r'//www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}', - r'//www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}', - r'//www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}', + r'/www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}', + r'/www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}', + r'/www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}', # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}', ]) @@ -93,13 +93,13 @@ matchers = [ [noisePattern, False, lambda m: None], [channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1].rstrip('/')], [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'], - [r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'], - [r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'], - [r'//www\.youtube\.com/embed/?\?(?=(?:.*&)?listType=user_uploads(?:&|$))(?:.*&)?list=([^&]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'], - [r'//www\.youtube\.com/rss/user/([^/?]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'], - [r'//www\.youtube\.com/subscription_center\?(?:.*&)?add_user=([^/=&]+)(?=(&|$))', True, lambda m: f'https://www.youtube.com/user/{m[1]}'], - [r'//www\.youtube\.com/feeds/videos\.xml\?(?:.*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'], - [r'//www\.youtube\.com(?:/view_play_list\?(?:.*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'], + [r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'], + [r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'], + [r'/www\.youtube\.com/embed/?\?(?=(?:.*&)?listType=user_uploads(?:&|$))(?:.*&)?list=([^&]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'], + [r'/www\.youtube\.com/rss/user/([^/?]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'], + [r'/www\.youtube\.com/subscription_center\?(?:.*&)?add_user=([^/=&]+)(?=(&|$))', True, lambda m: f'https://www.youtube.com/user/{m[1]}'], + [r'/www\.youtube\.com/feeds/videos\.xml\?(?:.*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'], + [r'/www\.youtube\.com(?:/view_play_list\?(?:.*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'], [r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/', False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'], ]