The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

125 lignes
5.9 KiB

  1. #!/usr/bin/env python3
  2. import re
  3. import sys
  4. if any(x in sys.argv for x in ['--help', '-h', '-?', 'help']):
  5. print('Usage: youtube-extract [massage|removenonyt]', file = sys.stderr)
  6. print(file = sys.stderr)
  7. print("In 'massage' mode (default), extracts any references to YouTube videos, channels, and playlists from the URLs on stdin and prints them on stdout.", file = sys.stderr)
  8. print('Lines that don\'t seem to contain references to such YouTube things are printed on stderr.', file = sys.stderr)
  9. print(file = sys.stderr)
  10. print("In 'removenonyt' mode, prints all URLs that look like they are or contain YouTube URLs on stdout.", file = sys.stderr)
  11. sys.exit(1)
  12. # For all patterns: don't use \\? for an optional backslash (if needed) as it breaks the automatic pattern rewrite for parameters below. Use [\\]? instead.
  13. # Channel/user URLs; the protocol and domain are stripped and replaced below.
  14. channelPattern = re.compile('|'.join([
  15. r'//www\.youtube\.com/c/[^/?]+',
  16. r'//www\.youtube\.com/user/[^/?]+',
  17. r'//www\.youtube\.com/channel/UC[^/?]+',
  18. r'//www\.youtube\.com/(?!(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect)$)[^/?]+(?=/?$)',
  19. ]))
  20. # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
  21. # If necessary, use lookahead assertions to match further stuff after the video ID.
  22. videoPattern = re.compile('|'.join([
  23. # Normal watch URL
  24. r'//www\.youtube\.com/watch(_popup)?(\.php)?/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
  25. r'//www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
  26. # Embeds
  27. r'//www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
  28. r'//www\.youtube\.com/embed/?\?(.*&)?v=[0-9A-Za-z_-]{11}',
  29. # Shortener
  30. r'//youtu\.be/[0-9A-Za-z_-]{11}',
  31. # Old (Flash) embeds
  32. r'//www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
  33. # Redirects from links in video descriptions
  34. r'//www\.youtube\.com/redirect\?(.*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
  35. # Tracking and other crap
  36. r'//www\.youtube\.com/(ptracking|set_awesome)\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
  37. r'//www\.youtube\.com/api/timedtext\?(.*&)?v=[0-9A-Za-z_-]{11}',
  38. r'//www\.youtube\.com/(my_videos_)?edit\?(.*&)?video_id=[0-9A-Za-z_-]{11}',
  39. # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
  40. r'/watch\?(.*&)?v=[0-9A-Za-z_-]{11}',
  41. ]))
  42. noisePattern = re.compile('|'.join([
  43. r'^//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
  44. r'^//www\.youtube\.com/s/gaming/emoji/',
  45. r'^//www\.youtube\.com/redirect\?event=channel_banner&',
  46. r'^//www\.youtube\.com/redirect\?(?=(.*&)?event=video_description(&|$))(?!(.*&)?v=)',
  47. r'^//www\.youtube\.com/yts/',
  48. r'^//www\.youtube\.com/img/',
  49. r'^//www\.youtube\.com/youtubei/',
  50. r'^//www\.youtube\.com/ads/',
  51. r'^//www\.youtube\.com/(player|iframe)_api\?',
  52. r'^//www\.youtube\.com/error(_204)?/?\?',
  53. r'^//www\.youtube\.com/(about|t|howyoutubeworks)[/?]',
  54. r'^//www\.youtube\.com/results/?\?',
  55. r'^//www\.youtube\.com/premium/?\?',
  56. r'^//www\.youtube\.com/?(\?|$)',
  57. r'^//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff
  58. r'^//consent\.(youtube|google)\.com/',
  59. ]))
  60. matchers = [
  61. # (pattern, paramSearch, function(match) -> output str or None); returning None stops further processing
  62. [channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 3)[-1]],
  63. [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
  64. [re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=UU[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/UC{m[0].rsplit("=", 1)[1][2:]}'],
  65. [re.compile(r'//www\.youtube\.com/(playlist|embed(/videoseries|/\+lastest|/playlist)?/?)\?(.*&)?list=(PL|FL|RD|OL)[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/playlist?list={m[0].rsplit("=", 1)[1]}'],
  66. [re.compile(r'//www\.youtube\.com/embed/?\?(?=(.*&)?listType=user_uploads(&|$))(.*&)?list=[^&]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("=", 1)[1]}'],
  67. [re.compile(r'//www\.youtube\.com/rss/user/[^/]+'), True, lambda m: f'https://www.youtube.com/user/{m[0].rsplit("/", 1)[1]}'],
  68. [re.compile(r'//www\.youtube\.com/feeds/videos\.xml\?(.*&)?channel_id=UC[0-9A-Za-z_-]+'), True, lambda m: f'https://www.youtube.com/channel/{m[0].rsplit("=", 1)[1]}'],
  69. [re.compile(r'//www\.youtube\.com(/view_play_list\?p=|/playlist\?(.*&)?list=)[0-9A-F]{16}(?=(&|$))'), True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[0].rsplit("=", 1)[1]}'],
  70. [re.compile(r'^//i\.ytimg\.com/vi/([0-9A-Za-z_-]{11})/'), False, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
  71. [noisePattern, False, lambda m: None],
  72. ]
  73. # Compile second pattern for parameters if needed
  74. for e in matchers:
  75. pattern, paramSearch, f = e
  76. if paramSearch:
  77. p2 = pattern.pattern.replace('//', '/{1,2}').replace('/', '(/|%2F)').replace(':', '(:|%3A)').replace(r'\?', r'(\?|%3F)')
  78. p2 = re.sub(r'(?<!\(\?)=', '(=|%3D)', p2)
  79. e[1] = re.compile(p2.replace('&', '(&|%26)'), pattern.flags)
  80. else:
  81. e[1] = None
  82. # Only one slash before so it still matches inside URLs when slashes were collapsed.
  83. domainPattern = re.compile(r'/(www\.)?youtube\.(com|de|fr|co\.uk|it|es|at|pt|gr|hu|ro|pl|dk|no|se|fi|ee|lt|lv|ru|by|cz|sk|si|rs|hr|ca)/')
  84. for origLine in sys.stdin:
  85. origLine = origLine.strip()
  86. line = re.sub(r'^https?://', '//', origLine)
  87. line = domainPattern.sub('/www.youtube.com/', line)
  88. if sys.argv[1] == 'massage':
  89. hadMatches = False
  90. for pattern1, pattern2, f in matchers:
  91. patterns = [pattern1]
  92. if pattern2:
  93. patterns.append(pattern2)
  94. results = set()
  95. for pattern in patterns:
  96. m = pattern.search(line)
  97. if m:
  98. hadMatches = True
  99. r = f(m)
  100. if r in results:
  101. continue
  102. results.add(r)
  103. if r is None:
  104. break
  105. print(r)
  106. if None in results:
  107. break
  108. if not hadMatches:
  109. print(origLine, file = sys.stderr)
  110. elif sys.argv[1] == 'removenonyt':
  111. if any(x in line for x in ('/www.youtube.com/', '/youtu.be/', '%2Fwww.youtube.com%2F', '%2Fyoutu.be%2F')):
  112. print(origLine)