The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

136 lignes
6.8 KiB

  1. #!/usr/bin/env python3
  2. import itertools
  3. import os
  4. import re
  5. import sys
  6. if any(x in sys.argv for x in ['--help', '-h', '-?', 'help']):
  7. print('Usage: youtube-extract [massage|removenonyt]', file = sys.stderr)
  8. print(file = sys.stderr)
  9. print("In 'massage' mode (default), extracts any references to YouTube videos, channels, and playlists from the lines on stdin and prints them on stdout.", file = sys.stderr)
  10. print('Lines that don\'t seem to contain references to such YouTube things are printed on stderr.', file = sys.stderr)
  11. print(file = sys.stderr)
  12. print("In 'removenonyt' mode, prints all lines that look like they are or contain YouTube URLs on stdout.", file = sys.stderr)
  13. sys.exit(1)
  14. mode = sys.argv[1] if len(sys.argv) >= 2 else 'massage'
  15. # Only one slash before so it still matches inside URLs when slashes were collapsed.
  16. domainPattern = re.compile(r'/(www\.|m\.)?(youtube\.(com|de|fr|co\.uk|it|es|at|pt|gr|hu|ro|pl|dk|no|se|fi|ee|lt|lv|ru|by|cz|sk|si|rs|hr|ca)|(music|gaming)\.youtube\.com|(es|uk|pl|ru|it|jp|br)\.youtube\.com|youtube-nocookie\.com)(:\d+)?/', re.IGNORECASE)
  17. if mode == 'removenonyt':
  18. # Anything in here could never be as fast as grep, so just delegate to that...
  19. os.execlp('grep', 'grep', '-Fai', '-e', 'youtube', '-e', 'youtu.be', '-e', 'ytimg.com', '-e', '?v=', '-e', '%3Fv%3D', '-e', '&v=', '-e', '%26v%3D')
  20. sys.exit(0)
  21. assert mode == 'massage'
  22. noisePattern = '|'.join([
  23. r'//www\.youtube\.com/s/(desktop|player)/[0-9a-f]+/',
  24. r'//www\.youtube\.com/s/gaming/emoji/',
  25. r'//www\.youtube\.com/redirect\?event=channel_banner&',
  26. r'//www\.youtube\.com/redirect\?(?=(\S*&)?event=video_description(&|$))(?!(\S*&)?v=)',
  27. r'//www\.youtube\.com/yts/',
  28. r'//www\.youtube\.com/img/',
  29. r'//www\.youtube\.com/youtubei/',
  30. r'//www\.youtube\.com/ads(/|$)',
  31. r'//www\.youtube\.com/creators(/|$)',
  32. r'//www\.youtube\.com/(player|iframe)_api(\?|$)',
  33. r'//www\.youtube\.com/error(_204)?/?\?',
  34. r'//www\.youtube\.com/(about|t|howyoutubeworks)([/?]|$)',
  35. r'//www\.youtube\.com/results/?(\?|$)',
  36. r'//www\.youtube\.com/premium/?\?',
  37. r'//www\.youtube\.com/new([/?]|$)',
  38. r'//www\.youtube\.com/?(\?(?!(\S*&)?v=)|$)',
  39. r'//www\.youtube\.com/embed/("|%22|' r"'|%27" r')(%20)?(\+|%3B)', # JS extraction stuff
  40. r'//www\.youtube\.com/service_ajax$',
  41. r'//www\.youtube\.com/watch(\?v=)?$',
  42. r'//consent\.(youtube|google)\.com/',
  43. r'//www\.youtube\.com/(c|user|channel|watch(_popup)?(\.php)?|embed|e|v|redirect|(my_videos_)?upload)(%[23]F|/)?$', # Miscellaneous crap
  44. ])
  45. channelPattern = '|'.join([
  46. r'''/www\.youtube\.com/c/[^/?&=."'>\\\s]+''',
  47. r'/www\.youtube\.com/user/[A-Za-z0-9]{1,20}',
  48. r'/www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}',
  49. r'''/www\.youtube\.com/[^/?&=."'>\\\s]+(?=/?(\s|\\?["'>]|$))''',
  50. ])
  51. # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
  52. # If necessary, use lookahead assertions to match further stuff after the video ID.
  53. videoPattern = '|'.join([
  54. # Normal watch URL
  55. r'/www\.youtube\.com/watch(_popup)?(\.php)?/?\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  56. r'/www\.youtube\.com/watch/[0-9A-Za-z_-]{11}',
  57. # Embeds
  58. r'/www\.youtube\.com/e(mbed)?/(?!videoseries\?)[0-9A-Za-z_-]{11}',
  59. r'/www\.youtube\.com/embed/?\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  60. # Shortener
  61. r'/(?i:youtu\.be)(:\d+)?/[0-9A-Za-z_-]{11}',
  62. # Shorts
  63. r'/www\.youtube\.com/shorts/[0-9A-Za-z_-]{11}',
  64. # Old (Flash) embeds
  65. r'/www\.youtube\.com/v/[0-9A-Za-z_-]{11}',
  66. # Redirects from links in video descriptions
  67. r'/www\.youtube\.com/redirect\?(\S*&)?v=[0-9A-Za-z_-]{11}(?=&|$)',
  68. # Tracking and other crap
  69. r'/www\.youtube\.com/(ptracking|set_awesome)\?(\S*&)?video_id=[0-9A-Za-z_-]{11}',
  70. r'/www\.youtube\.com/api/timedtext\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  71. r'/www\.youtube\.com/(my_videos_)?edit\?(\S*&)?video_id=[0-9A-Za-z_-]{11}',
  72. r'/www\.youtube\.com/(all_comments|attribution|cthru|get_endscreen|livestreaming/dashboard)\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  73. # Generic v parameter on watch URLs including with percent encoding; this covers e.g. google.com/url?... or the oEmbed
  74. r'/watch/?\?(\S*&)?v=[0-9A-Za-z_-]{11}',
  75. # Generic v parameter on anything
  76. r'[?&]v=[0-9A-Za-z_-]{11}(?=&|\s|$)',
  77. ])
  78. def percentdecode(s):
  79. return s.replace('%2F', '/').replace('%3A', ':').replace('%3F', '?').replace('%3D', '=').replace('%26', '&')
  80. matchers = [
  81. # (pattern, paramSearch, function(match: list[str]) -> output str or None); returning None stops further processing of a line
  82. # If paramSearch is True, a corresponding pattern with [/:?=&] replaced by their percent encodings is generated; the reverse replacement is done again automatically before calling the function.
  83. [noisePattern, False, lambda m: None],
  84. [channelPattern, True, lambda m: 'https://www.youtube.com/' + m[0].split('/', 2)[-1].rstrip('/')],
  85. [videoPattern, True, lambda m: f'https://www.youtube.com/watch?v={m[0][-11:]}'],
  86. [r'/www\.youtube\.com/(?:playlist|watch|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:\S*&)?list=UU([0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/UC{m[1]}'],
  87. [r'/www\.youtube\.com/(?:playlist|watch|embed(?:/videoseries|/\+lastest|/playlist)?/?)\?(?:\S*&)?list=((PL|FL|RD|OL)[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/playlist?list={m[1]}'],
  88. [r'/www\.youtube\.com/embed/?\?(?=(?:\S*&)?listType=user_uploads(?:&|$))(?:\S*&)?list=([A-Za-z0-9]{1,20})', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  89. [r'/www\.youtube\.com/rss/user/([A-Za-z0-9]{1,20})', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  90. [r'/www\.youtube\.com/(?:subscription_center\?(?:\S*&)?add_user=|subscribe_widget\?(?:\S*&)?p=|profile\?(?:\S*&)?user=)([A-Za-z0-9]{1,20})', True, lambda m: f'https://www.youtube.com/user/{m[1]}'],
  91. [r'/www\.youtube\.com/feeds/videos\.xml\?(?:\S*&)?channel_id=(UC[0-9A-Za-z_-]+)', True, lambda m: f'https://www.youtube.com/channel/{m[1]}'],
  92. [r'/www\.youtube\.com(?:/view_play_list\?(?:\S*&)?p=|/playlist\?(?:.*&)?list=)([0-9A-F]{16})(?=(&|\s|$))', True, lambda m: f'https://www.youtube.com/playlist?list=PL{m[1]}'],
  93. [r'/(?i:i\.ytimg\.com|img\.youtube\.com)(?::\d+)?/vi/([0-9A-Za-z_-]{11})/', True, lambda m: f'https://www.youtube.com/watch?v={m[1]}'],
  94. ]
  95. # Compile pattern and generate one for parameters if desired
  96. for e in matchers:
  97. e[0] = re.compile(e[0])
  98. for origLine in sys.stdin.buffer:
  99. line = re.sub(r'https?://', '//', origLine.strip().decode('utf-8', 'surrogateescape'))
  100. line = domainPattern.sub('/www.youtube.com/', line)
  101. decodedLine = percentdecode(line)
  102. hadMatches = False
  103. for pattern, paramSearch, f in matchers:
  104. results = set()
  105. for m in itertools.chain((x for x in pattern.finditer(line)), (x for x in pattern.finditer(decodedLine)) if paramSearch else ()):
  106. hadMatches = True
  107. r = f(m)
  108. if r in results:
  109. continue
  110. results.add(r)
  111. if r is None:
  112. break
  113. sys.stdout.buffer.write(r.encode('utf-8', 'surrogateescape'))
  114. sys.stdout.buffer.write(b'\n')
  115. if None in results:
  116. break
  117. if not hadMatches:
  118. sys.stderr.buffer.write(origLine)