|
|
@@ -0,0 +1,124 @@ |
|
|
|
#!/usr/bin/env python3
|
|
|
import re
import sys
|
|
|
|
|
|
|
|
|
|
|
# Print the usage text on stderr and stop if any command-line argument asks for help.
if any(x in sys.argv for x in ['--help', '-h', '-?', 'help']):
    print('Usage: youtube-extract [massage|removenonyt]', file=sys.stderr)
    print(file=sys.stderr)
    print("In 'massage' mode (default), extracts any references to YouTube videos, channels, and playlists from the URLs on stdin and prints them on stdout.", file=sys.stderr)
    print('Lines that don\'t seem to contain references to such YouTube things are printed on stderr.', file=sys.stderr)
    print(file=sys.stderr)
    print("In 'removenonyt' mode, prints all URLs that look like they are or contain YouTube URLs on stdout.", file=sys.stderr)
    # The exit status is 1 even when help was requested explicitly.
    sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
# For all patterns: don't use \\? for an optional backslash (if needed), as it breaks the automatic pattern rewrite for parameters below. Use [\\]? instead.
|
|
|
|
|
|
|
# Channel/user URLs; the protocol and domain are stripped and replaced below.
# Every alternative starts with '//www.youtube.com/' so that the consumer can
# cut off a fixed prefix.
channelPattern = re.compile('|'.join([
    # Custom URL: /c/<name>
    r'//www\.youtube\.com/c/[^/?]+',
    # Legacy username: /user/<name>
    r'//www\.youtube\.com/user/[^/?]+',
    # Channel ID: /channel/UC...
    r'//www\.youtube\.com/channel/UC[^/?]+',
    # Bare /<name> at the end of the URL, unless the name is exactly one of the
    # reserved path words listed in the negative lookahead.
    r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)$)[^/?]+(?=/?$)',
]))
|
|
|
|
|
|
|
# Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
# If necessary, use lookahead assertions to match further stuff after the video ID.
videoPattern = re.compile('|'.join([
    # Normal watch URL
    r'//www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
    r'//www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
    # Embeds
    r'//www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
    r'//www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
    # Shortener
    r'//youtu\.be/[0-9A-Za-z_-]{11}',
    # Old (Flash) embeds
    r'//www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
    # Redirects from links in video descriptions
    r'//www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
    # Tracking and other crap
    r'//www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
    r'//www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}',
    r'//www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
    # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
    r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}',
]))
|
|
|
|
|
|
|
# URLs that are recognised but carry no video, channel, or playlist reference.
# All alternatives are anchored at the start of the (protocol-stripped) line.
noisePattern = re.compile('|'.join([
    r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
    r'^//www\.youtube\.com/s/gaming/emoji/',
    r'^//www\.youtube\.com/redirect\?event=channel_banner&',
    # Video-description redirects, but only when no v= parameter is present.
    r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)',
    r'^//www\.youtube\.com/yts/',
    r'^//www\.youtube\.com/img/',
    r'^//www\.youtube\.com/youtubei/',
    r'^//www\.youtube\.com/ads/',
    r'^//www\.youtube\.com/(player|iframe)_api\?',
    r'^//www\.youtube\.com/error(_204)?/?\?',
    r'^//www\.youtube\.com/(about|t|howyoutubeworks)[/?]',
    r'^//www\.youtube\.com/results/?\?',
    r'^//www\.youtube\.com/premium/?\?',
    # The bare domain, optionally with a trailing slash and/or query string.
    r'^//www\.youtube\.com/?(\?|$)',
    r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)',  # JS extraction stuff
    r'^//consent\.(youtube|google)\.com/',
]))
|
|
|
|
|
|
|
|
|
|
|
# Helpers that turn a match into the text needed for the canonical URL.
# The percent-encoded pattern variants derived from the entries below can match
# text in which '/' and '=' only occur as '%2F' and '%3D'. Splitting on the
# plain character alone would then raise IndexError (rsplit(...)[1]) or return
# the wrong part, so the helpers accept both spellings.
_channelPrefix = re.compile(r'^(/|%2F){1,2}www\.youtube\.com(/|%2F)')
_valueSeparator = re.compile(r'=|%3D')
_pathSeparator = re.compile(r'/|%2F')


def _channelPath(m):
    """Return the matched channel/user path without leading slashes and domain."""
    return _channelPrefix.sub('', m[0]).replace('%2F', '/')


def _lastValue(m):
    """Return the text following the last '=' (or '%3D') in the match."""
    return _valueSeparator.split(m[0])[-1]


def _lastSegment(m):
    """Return the text following the last '/' (or '%2F') in the match."""
    return _pathSeparator.split(m[0])[-1]


matchers = [
    # (pattern, paramSearch, function(match) -> output str or None); returning None stops further processing
    # Entries are lists, not tuples, because slot 1 is overwritten later with
    # the compiled percent-encoded pattern (or None).
    [channelPattern, True, lambda m: 'https://www.youtube.com/' + _channelPath(m)],
    # The video ID is always the last 11 characters of the match.
    [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
    # Uploads playlists (UU...) map to the channel with the same suffix (UC...).
    # NOTE(review): '+lastest' is reproduced verbatim from the original pattern; possibly a typo for 'latest' - confirm.
    [re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=UU[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/UC{_lastValue(m)[2:]}'],
    [re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=(PL|FL|RD|OL)[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/playlist?list={_lastValue(m)}'],
    [re.compile(r'//www\.youtube\.com/embed/?\?(?=(.*&)?listType=user_uploads(&|$))(.*&)?list=[^&]+'), True, lambda m: f'https://www.youtube.com/user/{_lastValue(m)}'],
    [re.compile(r'//www\.youtube\.com/rss/user/[^/]+'), True, lambda m: f'https://www.youtube.com/user/{_lastSegment(m)}'],
    [re.compile(r'//www\.youtube\.com/feeds/videos\.xml\?(.*&)?channel_id=UC[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/{_lastValue(m)}'],
    # Old 16-hex-digit playlist IDs get the 'PL' prefix added.
    [re.compile(r'//www\.youtube\.com(/view_play_list\?p=|/playlist\?(.*&)?list=)[0-9A-F]{16}(?=(&|$))'), True, lambda m: f'https://www.youtube.com/playlist?list=PL{_lastValue(m)}'],
    [re.compile(r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/'), False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
    # Known noise: produce no output and stop processing the line.
    [noisePattern, False, lambda m: None],
]
|
|
|
|
|
|
|
# Compile second pattern for parameters if needed.
# For every matcher whose second slot is true, that slot is replaced by a
# variant of the first pattern in which '/', ':', '?', '=' and '&' may also
# appear percent-encoded and a leading '//' may be collapsed to a single slash.
# For all other matchers the slot is set to None.
# NOTE(review): the rewrite is plain string replacement, so it also changes
# characters inside character classes (e.g. '[^/?]' becomes '[^(/|%2F)?]').
# Confirm whether that is acceptable.
for entry in matchers:
    basePattern, wantsParamSearch, _ = entry
    if not wantsParamSearch:
        entry[1] = None
        continue
    rewritten = (
        basePattern.pattern
        .replace('//', '/{1,2}')
        .replace('/', '(/|%2F)')
        .replace(':', '(:|%3A)')
        .replace(r'\?', r'(\?|%3F)')
    )
    # Leave the '=' of lookahead groups '(?=' untouched.
    rewritten = re.sub(r'(?<!\(\?)=', '(=|%3D)', rewritten)
    entry[1] = re.compile(rewritten.replace('&', '(&|%26)'), basePattern.flags)
|
|
|
|
|
|
|
# Only one slash before so it still matches inside URLs when slashes were collapsed.
# Matches the listed YouTube country domains, with or without 'www.', so they
# can be normalised to '/www.youtube.com/'.
domainPattern = re.compile(r'/(www\.)?youtube\.(com|de|fr|co\.uk|it|es|at|pt|gr|hu|ro|pl|dk|no|se|fi|ee|lt|lv|ru|by|cz|sk|si|rs|hr|ca)/')
|
|
|
|
|
|
|
# Mode selection: the usage text documents 'massage' as the default, so fall
# back to it when no argument is given instead of failing on sys.argv[1].
mode = sys.argv[1] if len(sys.argv) > 1 else 'massage'

# Process stdin line by line. Any other mode value produces no output.
for origLine in sys.stdin:
    origLine = origLine.strip()
    # Normalise: drop the protocol and map known YouTube domains to www.youtube.com.
    line = re.sub(r'^https?://', '//', origLine)
    line = domainPattern.sub('/www.youtube.com/', line)

    if mode == 'massage':
        hadMatches = False
        for pattern1, pattern2, f in matchers:
            patterns = [pattern1]
            if pattern2:
                patterns.append(pattern2)
            # Results already produced by this matcher, to avoid printing the
            # same URL twice when both pattern variants match.
            results = set()
            for pattern in patterns:
                m = pattern.search(line)
                if not m:
                    continue
                hadMatches = True
                r = f(m)
                if r in results:
                    continue
                results.add(r)
                if r is None:
                    break
                print(r)
            # A None result means: stop processing this line entirely.
            if None in results:
                break
        if not hadMatches:
            # Nothing recognised: report the original line on stderr.
            print(origLine, file=sys.stderr)
    elif mode == 'removenonyt':
        if any(x in line for x in ('/www.youtube.com/', '/youtu.be/', '%2Fwww.youtube.com%2F', '%2Fyoutu.be%2F')):
            print(origLine)