Browse Source

Be more lenient regarding slashes in order to catch, for example, URLs embedded in paths where the '//' has been collapsed to a single '/'.

master
JustAnotherArchivist 3 years ago
parent
commit
bbf2d2c315
1 changed file with 21 additions and 21 deletions
  1. +21
    -21
      youtube-extract

+ 21
- 21
youtube-extract View File

@@ -53,31 +53,31 @@ noisePattern = '|'.join([
])

# Alternation of regexes recognising channel URLs on www.youtube.com.
# In this diff view every pattern is listed twice: first the old form anchored on '//' before the host name, then
# its replacement anchored on a single '/', so that URLs whose double slash was collapsed still match.
channelPattern = '|'.join([
# Old forms (require '//' before the host name)
r'//www\.youtube\.com/c/[^/?&=.]+',
r'//www\.youtube\.com/user/[^/?&=.]+',
r'//www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}',
r'//www\.youtube\.com/[^/?&=.]+(?=/?$)',
# New forms (a single '/' before the host name suffices): /c/<name>, /user/<name>, /channel/UC<22 ID chars>,
# and a lone path segment that runs to the end of the string (optionally followed by one '/')
r'/www\.youtube\.com/c/[^/?&=.]+',
r'/www\.youtube\.com/user/[^/?&=.]+',
r'/www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}',
r'/www\.youtube\.com/[^/?&=.]+(?=/?$)',
])

# Alternation of regexes recognising URLs that carry an 11-character video ID.
# Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
# If necessary, use lookahead assertions to match further stuff after the video ID.
# In this diff view every host-anchored pattern is listed twice: first the old form anchored on '//', then its
# replacement anchored on a single '/'.
# The recurring '(.*&)?' prefix lets other query parameters precede the one that holds the ID.
videoPattern = '|'.join([
# Normal watch URL
r'//www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
r'//www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
r'/www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
r'/www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
# Embeds
# 'videoseries' is itself 11 characters from the ID alphabet, hence the negative lookahead for 'videoseries?'.
r'//www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
r'//www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
r'/www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
r'/www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
# Shortener
r'//youtu\.be/[0-9A-Za-z_-]{11}',
r'/youtu\.be/[0-9A-Za-z_-]{11}',
# Old (Flash) embeds
r'//www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
r'/www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
# Redirects from links in video descriptions
# The lookahead requires the ID to be followed by '&' or the end of the string without making that part of the match.
r'//www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
r'/www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
# Tracking and other crap
r'//www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
r'//www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}',
r'//www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
r'/www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
r'/www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}',
r'/www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
# Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
# This pattern is not tied to a host name and appears only once in the diff view.
r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}',
])
@@ -93,13 +93,13 @@ matchers = [
[noisePattern, False, lambda m: None],
[channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1].rstrip('/')],
[videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
[r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'],
[r'//www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],
[r'//www\.youtube\.com/embed/?\?(?=(?:.*&)?listType=user_uploads(?:&|$))(?:.*&)?list=([^&]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
[r'//www\.youtube\.com/rss/user/([^/?]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
[r'//www\.youtube\.com/subscription_center\?(?:.*&)?add_user=([^/=&]+)(?=(&|$))', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
[r'//www\.youtube\.com/feeds/videos\.xml\?(?:.*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'],
[r'//www\.youtube\.com(?:/view_play_list\?(?:.*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'],
[r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'],
[r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],
[r'/www\.youtube\.com/embed/?\?(?=(?:.*&)?listType=user_uploads(?:&|$))(?:.*&)?list=([^&]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
[r'/www\.youtube\.com/rss/user/([^/?]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
[r'/www\.youtube\.com/subscription_center\?(?:.*&)?add_user=([^/=&]+)(?=(&|$))', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
[r'/www\.youtube\.com/feeds/videos\.xml\?(?:.*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'],
[r'/www\.youtube\.com(?:/view_play_list\?(?:.*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'],
[r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/', False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
]



Loading…
Cancel
Save