The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

130 lignes
6.3 KiB

  1. #!/usr/bin/env python3
  2. import itertools
  3. import os
  4. import re
  5. import sys
  6. if any(x in sys.argv for x in ['--help', '-h', '-?', 'help']):
  7. print('Usage: youtube-extract [massage|removenonyt]', file = sys.stderr)
  8. print(file = sys.stderr)
  9. print("In 'massage' mode (default), extracts any references to YouTube videos, channels, and playlists from the URLs on stdin and prints them on stdout.", file = sys.stderr)
  10. print('Lines that don\'t seem to contain references to such YouTube things are printed on stderr.', file = sys.stderr)
  11. print(file = sys.stderr)
  12. print("In 'removenonyt' mode, prints all URLs that look like they are or contain YouTube URLs on stdout.", file = sys.stderr)
  13. sys.exit(1)
  14. mode = sys.argv[1] if len(sys.argv) >= 2 else 'massage'
  15. # Only one slash before so it still matches inside URLs when slashes were collapsed.
  16. domainPattern = re.compile(r'/(www\.|m\.)?(youtube\.(com|de|fr|co\.uk|it|es|at|pt|gr|hu|ro|pl|dk|no|se|fi|ee|lt|lv|ru|by|cz|sk|si|rs|hr|ca)|music\.youtube\.com|(es|uk|pl|ru|it|jp|br)\.youtube\.com|youtube-nocookie\.com)/', re.IGNORECASE)
  17. if mode == 'removenonyt':
  18. # Anything in here could never be as fast as grep, so just delegate to that...
  19. os.execlp('grep', 'grep', '-P', domainPattern.pattern + '|' + domainPattern.pattern.replace('/', '%2F') + '|/youtu\.be/|%2Fyoutu\.be%2F')
  20. sys.exit(0)
  21. assert mode == 'massage'
  22. noisePattern = '|'.join([
  23. r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
  24. r'^//www\.youtube\.com/s/gaming/emoji/',
  25. r'^//www\.youtube\.com/redirect\?event=channel_banner&',
  26. r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)',
  27. r'^//www\.youtube\.com/yts/',
  28. r'^//www\.youtube\.com/img/',
  29. r'^//www\.youtube\.com/youtubei/',
  30. r'^//www\.youtube\.com/ads(/|$)',
  31. r'^//www\.youtube\.com/creators(/|$)',
  32. r'^//www\.youtube\.com/(player|iframe)_api(\?|$)',
  33. r'^//www\.youtube\.com/error(_204)?/?\?',
  34. r'^//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)',
  35. r'^//www\.youtube\.com/results/?(\?|$)',
  36. r'^//www\.youtube\.com/premium/?\?',
  37. r'^//www\.youtube\.com/new([/?]|$)',
  38. r'^//www\.youtube\.com/?(\?|$)',
  39. r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff
  40. r'^//www\.youtube\.com/service_ajax$',
  41. r'^//www\.youtube\.com/watch(\?v=)?$',
  42. r'^//consent\.(youtube|google)\.com/',
  43. r'^//www\.youtube\.com/(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect|(my_videos_)?upload)(%[23]F|/)?$', # Miscellaneous crap
  44. ])
  45. channelPattern = '|'.join([
  46. r'/www\.youtube\.com/c/[^/?&=.]+',
  47. r'/www\.youtube\.com/user/[^/?&=.]+',
  48. r'/www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}',
  49. r'/www\.youtube\.com/[^/?&=.]+(?=/?$)',
  50. ])
  51. # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
  52. # If necessary, use lookahead assertions to match further stuff after the video ID.
  53. videoPattern = '|'.join([
  54. # Normal watch URL
  55. r'/www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
  56. r'/www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
  57. # Embeds
  58. r'/www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
  59. r'/www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
  60. # Shortener
  61. r'/youtu\.be/[0-9A-Za-z_-]{11}',
  62. # Old (Flash) embeds
  63. r'/www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
  64. # Redirects from links in video descriptions
  65. r'/www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
  66. # Tracking and other crap
  67. r'/www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
  68. r'/www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}',
  69. r'/www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
  70. # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
  71. r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}',
  72. ])
  73. def percentdecode(s):
  74. return s.replace('%2F', '/').replace('%3A', ':').replace('%3F', '?').replace('%3D', '=').replace('%26', '&')
  75. matchers = [
  76. # (pattern, paramSearch, function(match: list[str]) -> output str or None); returning None stops further processing of a line
  77. # If paramSearch is True, a corresponding pattern with [/:?=&] replaced by their percent encodings is generated; the reverse replacement is done again automatically before calling the function.
  78. [noisePattern, False, lambda m: None],
  79. [channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 2)[-1].rstrip('/')],
  80. [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
  81. [r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'],
  82. [r'/www\.youtube\.com/(?:playlist|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:.*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],
  83. [r'/www\.youtube\.com/embed/?\?(?=(?:.*&)?listType=user_uploads(?:&|$))(?:.*&)?list=([^&]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  84. [r'/www\.youtube\.com/rss/user/([^/?]+)', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  85. [r'/www\.youtube\.com/(?:subscription_center\?(?:.*&)?add_user=|subscribe_widget\?(?:.*&)?p=|profile\?(?:.*&)?user=)([^/=&]+)(?=(&|$))', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  86. [r'/www\.youtube\.com/feeds/videos\.xml\?(?:.*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'],
  87. [r'/www\.youtube\.com(?:/view_play_list\?(?:.*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'],
  88. [r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/', False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
  89. ]
  90. # Compile pattern and generate one for parameters if desired
  91. for e in matchers:
  92. e[0] = re.compile(e[0])
  93. for origLine in sys.stdin:
  94. origLine = origLine.strip()
  95. line = re.sub(r'^https?://', '//', origLine)
  96. line = domainPattern.sub('/www.youtube.com/', line)
  97. hadMatches = False
  98. for pattern, paramSearch, f in matchers:
  99. results = set()
  100. for m in itertools.chain((x for x in pattern.finditer(line)), (x for x in pattern.finditer(percentdecode(line))) if paramSearch else ()):
  101. hadMatches = True
  102. r = f(m)
  103. if r in results:
  104. continue
  105. results.add(r)
  106. if r is None:
  107. break
  108. print(r)
  109. if None in results:
  110. break
  111. if not hadMatches:
  112. print(origLine, file = sys.stderr)