A method to grab the comments from YouTube videos
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

121 lignes
5.5 KiB

  1. import itertools
  2. import os
  3. import qwarc
  4. import qwarc.utils
# Response handler passed to every self.fetch() call in this file.
# NOTE(review): the argument 5 is presumably the maximum number of retries on errors — confirm against qwarc.utils.
responseHandler = qwarc.utils.handle_response_limit_error_retries(5)
  6. class Comments(qwarc.Item):
  7. itemType = 'comments'
  8. # itemValue = '{videoId}'
  9. @classmethod
  10. def generate(cls):
  11. yield os.environ['YOUTUBE_VIDEOID']
  12. def get_json_obj_from_pos(self, content, startPos):
  13. # Given a startPos in content, extracts the content until braces or brackets are matching (requiring at least one set of parentheses)
  14. openParens = None
  15. for pos in itertools.count(start = startPos):
  16. char = content[pos:pos+1]
  17. if char in (b'{', b'['):
  18. if openParens is None: # First {[ in the string
  19. openParens = 0
  20. openParens += 1
  21. elif char in (b'}', b']'):
  22. openParens -= 1
  23. if openParens == 0:
  24. break
  25. return content[startPos:pos]
  26. def get_continuation_parameters(self, content):
  27. continuationToken = qwarc.utils.str_get_between(content, b'"continuation":"', b'"')
  28. if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  29. self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
  30. return
  31. continuationToken = continuationToken.decode('ascii')
  32. itct = qwarc.utils.str_get_between(content, b'"clickTrackingParams":"', b'"')
  33. if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  34. self.logger.error(f'Unexpected itct value: {itct!r}')
  35. return
  36. itct = itct.decode('ascii')
  37. return continuationToken, itct
  38. async def continue_recursively(self, videoPageUrl, sessionToken, continuationToken, itct, nested = False, initial = False):
  39. '''
  40. Fetch the comments, recursively, including other sort orders.
  41. nested indicates whether the continuationToken is for nested comments (which require a different URL parameter), i.e. "View N replies" or "Show more replies".
  42. '''
  43. response, _ = await self.fetch(
  44. f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
  45. method = 'POST',
  46. data = {'session_token': sessionToken},
  47. headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
  48. responseHandler = responseHandler,
  49. )
  50. if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
  51. self.logger.error('Could not fetch initial comments')
  52. return
  53. content = await response.read()
  54. # Yes, the response is JSON and I could parse that into an object, but where's the fun in that?
  55. for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
  56. continuations = self.get_json_obj_from_pos(content, continuationsPos)
  57. subContinuationToken, subItct = self.get_continuation_parameters(continuations)
  58. await self.continue_recursively(videoPageUrl, sessionToken, subContinuationToken, subItct, nested = b'"label":' in continuations)
  59. if initial:
  60. sortMenuPos = content.find(b'"sortMenu":')
  61. if sortMenuPos < 0:
  62. self.logger.error('Could not find sort menu')
  63. return
  64. sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
  65. for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
  66. continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
  67. subContinuationToken, subItct = self.get_continuation_parameters(continuation)
  68. await self.continue_recursively(videoPageUrl, sessionToken, subContinuationToken, subItct)
  69. async def process(self):
  70. videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
  71. response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
  72. if not response or response.status != 200:
  73. self.logger.error('Could not fetch video page')
  74. return
  75. content = await response.read()
  76. sessionToken = qwarc.utils.str_get_between(content, b'"XSRF_TOKEN":"', b'"')
  77. if not sessionToken:
  78. self.logger.error('Could not find session token')
  79. return
  80. if sessionToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'=='):
  81. self.logger.error('Unexpected session token value: {sessionToken!r}')
  82. return
  83. sessionToken = sessionToken.decode('ascii')
  84. sectionIdentifierPos = content.find(b'"comment-item-section"')
  85. if sectionIdentifierPos < 0:
  86. self.logger.error('Could not find comment section identifier')
  87. return
  88. continuationStartPos = content.rfind(b'"continuation":', 0, sectionIdentifierPos)
  89. if continuationStartPos < 0:
  90. self.logger.error('Could not find continuation start position')
  91. return
  92. section = content[continuationStartPos:sectionIdentifierPos]
  93. continuationToken = qwarc.utils.str_get_between(section, b'"continuation":"', b'"')
  94. if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'=='):
  95. self.logger.error('Unexpected continuation token value: {continuationToken!r}')
  96. return
  97. continuationToken = continuationToken.decode('ascii')
  98. itct = qwarc.utils.str_get_between(section, b'"clickTrackingParams":"', b'"')
  99. if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'=='):
  100. self.logger.error('Unexpected itct value: {itct!r}')
  101. return
  102. itct = itct.decode('ascii')
  103. await self.continue_recursively(videoPageUrl, sessionToken, continuationToken, itct, initial = True)
# Built at import time: reads YOUTUBE_VIDEOID from the environment (KeyError if unset) and stores it
# as the extra ('videoId', <value>) entry of the SpecDependencies object.
specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))