Browse Source

Fix performance regression due to 479c2684

master
JustAnotherArchivist 2 years ago
parent
commit
50a0fcc7b0
1 changed files with 52 additions and 54 deletions
  1. +52
    -54
      youtube-extract

+ 52
- 54
youtube-extract View File

@@ -29,32 +29,32 @@ assert mode == 'massage'


# Alternation of URL shapes that carry no extractable channel/video/playlist.
# It is the first entry in `matchers` with a handler that returns None; the
# scanning loop stops processing a line as soon as a handler returns None, so a
# hit here suppresses all output for that line.
#
# The entries are deliberately NOT anchored with `^`: the scheme is stripped
# from every URL on the line and the whole line is searched, so a noise URL may
# start anywhere. Query-string lookaheads use `\S*` instead of `.*` so they
# cannot read past whitespace into a neighbouring URL.
#
# NOTE(review): several entries end in `$`, which only matches at the end of
# the whole line; a noise URL followed by whitespace and another URL will not
# hit those entries. Confirm whether `(\s|$)` is wanted there.
noisePattern = '|'.join([
    r'//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
    r'//www\.youtube\.com/s/gaming/emoji/',
    r'//www\.youtube\.com/redirect\?event=channel_banner&',
    r'//www\.youtube\.com/redirect\?(?=(\S*&)?event=video_description(&|$))(?!(\S*&)?v=)',
    r'//www\.youtube\.com/yts/',
    r'//www\.youtube\.com/img/',
    r'//www\.youtube\.com/youtubei/',
    r'//www\.youtube\.com/ads(/|$)',
    r'//www\.youtube\.com/creators(/|$)',
    r'//www\.youtube\.com/(player|iframe)_api(\?|$)',
    r'//www\.youtube\.com/error(_204)?/?\?',
    r'//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)',
    r'//www\.youtube\.com/results/?(\?|$)',
    r'//www\.youtube\.com/premium/?\?',
    r'//www\.youtube\.com/new([/?]|$)',
    r'//www\.youtube\.com/?(\?|$)',
    r'//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff
    r'//www\.youtube\.com/service_ajax$',
    r'//www\.youtube\.com/watch(\?v=)?$',
    r'//consent\.(youtube|google)\.com/',
    r'//www\.youtube\.com/(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect|(my_videos_)?upload)(%[23]F|/)?$', # Miscellaneous crap
])

# Alternation of channel URL shapes. The matching handler in `matchers` rebuilds
# the canonical URL from everything after the host in the matched text, so each
# alternative must stop exactly at the end of the channel identifier.
#
# Every character class excludes whitespace (`\s`): whole lines that may hold
# several URLs are searched, and without it a name would swallow the separating
# space and whatever follows.
channelPattern = '|'.join([
    r'/www\.youtube\.com/c/[^/?&=.\s]+',
    r'/www\.youtube\.com/user/[^/?&=.\s]+',
    r'/www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}',
    # Bare /<name> URL: only accepted when the name is the last path segment.
    r'/www\.youtube\.com/[^/?&=.\s]+(?=/?(\s|$))',
])
@@ -63,26 +63,26 @@ channelPattern = '|'.join([
# If necessary, use lookahead assertions to match further stuff after the video ID.
# Alternation of URL shapes that carry a video ID. The matching handler in
# `matchers` takes the last 11 characters of the match as the ID, so every
# alternative must END on the ID; anything required after it goes in a lookahead.
#
# Query-string prefixes are written `(\S*&)?` rather than `(.*&)?`: whole lines
# are searched, and a greedy `.*` would stretch from one URL to a `v=` in a
# later URL on the same line, losing the first video.
videoPattern = '|'.join([
    # Normal watch URL
    r'/www\.youtube\.com/watch(_popup)?(\.php)?/?\?(\S*&)?v=[0-9A-Za-z_-]{11}',
    r'/www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
    # Embeds
    r'/www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
    r'/www\.youtube\.com/embed/?\?(\S*&)?v=[0-9A-Za-z_-]{11}',
    # Shortener
    r'/(?i:youtu\.be)(:\d+)?/[0-9A-Za-z_-]{11}',
    # Old (Flash) embeds
    r'/www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
    # Redirects from links in video descriptions
    r'/www\.youtube\.com/redirect\?(\S*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
    # Tracking and other crap
    r'/www\.youtube\.com/(ptracking|set_awesome)\?(\S*&)?video_id=[0-9A-Za-z_-]{11}',
    r'/www\.youtube\.com/api/timedtext\?(\S*&)?v=[0-9A-Za-z_-]{11}',
    r'/www\.youtube\.com/(my_videos_)?edit\?(\S*&)?video_id=[0-9A-Za-z_-]{11}',
    r'/www\.youtube\.com/(all_comments|attribution|cthru|get_endscreen|livestreaming/dashboard)\?(\S*&)?v=[0-9A-Za-z_-]{11}',
    # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
    r'/watch/?\?(\S*&)?v=[0-9A-Za-z_-]{11}',
    # Generic v parameter on anything; the ID must be followed by '&', whitespace, or end of line
    r'[?&]v=[0-9A-Za-z_-]{11}(?=&|\s|$)',
])


@@ -96,13 +96,13 @@ matchers = [
# Each entry is [pattern, paramSearch, handler]:
#   pattern     - regex searched in every input line;
#   paramSearch - when true, the percent-decoded line is searched as well;
#   handler     - maps a match to the canonical URL to print, or to None to make
#                 the scanning loop stop processing the line (noise).
# Query-string prefixes use \S* rather than .* so that a match cannot run across
# whitespace into another URL on the same line.
[noisePattern, False, lambda m: None],
[channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 2)[-1].rstrip('/')],
[videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
[r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:\S*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'],
[r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:\S*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],
[r'/www\.youtube\.com/embed/?\?(?=(?:\S*&)?listType=user_uploads(?:&|$))(?:\S*&)?list=([^&\s]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
[r'/www\.youtube\.com/rss/user/([^/?\s]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
[r'/www\.youtube\.com/(?:subscription_center\?(?:\S*&)?add_user=|subscribe_widget\?(?:\S*&)?p=|profile\?(?:\S*&)?user=)([^/=&\s]+)(?=(&|\s|$))', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
[r'/www\.youtube\.com/feeds/videos\.xml\?(?:\S*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'],
# Both branches use \S*; the /playlist branch previously still had a greedy .* here.
[r'/www\.youtube\.com(?:/view_play_list\?(?:\S*&)?p=|/playlist\?(?:\S*&)?list=)([0-9A-F]{16})(?=(&|\s|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'],
[r'/i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/', True, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
]

@@ -112,23 +112,21 @@ for e in matchers:

# Read lines from stdin, print every canonical YouTube URL found in each line to
# stdout, and echo lines that matched nothing at all to stderr.
for origLine in sys.stdin:
    origLine = origLine.strip()
    # Drop the scheme from every URL on the line (not just a leading one) so the
    # patterns can be matched anywhere in the line.
    line = re.sub(r'https?://', '//', origLine)
    line = domainPattern.sub('/www.youtube.com/', line)
    hadMatches = False
    for pattern, paramSearch, f in matchers:
        # Search the whole line once per pattern; when paramSearch is set, the
        # percent-decoded form of the line is searched too.
        searchTexts = (line, percentdecode(line)) if paramSearch else (line,)
        results = set()
        for m in itertools.chain.from_iterable(pattern.finditer(text) for text in searchTexts):
            hadMatches = True
            r = f(m)
            if r in results:
                # Already handled this result for the current pattern.
                continue
            results.add(r)
            if r is None:
                # The handler flagged the match as noise.
                break
            print(r)
        if None in results:
            # Noise: skip all remaining patterns for this line.
            break
    if not hadMatches:
        print(origLine, file = sys.stderr)

Loading…
Cancel
Save