A method to grab the comments from YouTube videos
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

121 lines
5.5 KiB

  1. import itertools
  2. import os
  3. import qwarc
  4. import qwarc.utils
  5. responseHandler = qwarc.utils.handle_response_limit_error_retries(5)
  6. class Comments(qwarc.Item):
  7. itemType = 'comments'
  8. # itemValue = '{videoId}'
  9. @classmethod
  10. def generate(cls):
  11. yield os.environ['YOUTUBE_VIDEOID']
  12. def get_json_obj_from_pos(self, content, startPos):
  13. # Given a startPos in content, extracts the content until braces or brackets are matching (requiring at least one set of parentheses)
  14. openParens = None
  15. for pos in itertools.count(start = startPos):
  16. char = content[pos:pos+1]
  17. if char in (b'{', b'['):
  18. if openParens is None: # First {[ in the string
  19. openParens = 0
  20. openParens += 1
  21. elif char in (b'}', b']'):
  22. openParens -= 1
  23. if openParens == 0:
  24. break
  25. return content[startPos:pos]
  26. def get_continuation_parameters(self, content):
  27. continuationToken = qwarc.utils.str_get_between(content, b'"continuation":"', b'"')
  28. if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  29. self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
  30. return
  31. continuationToken = continuationToken.decode('ascii')
  32. itct = qwarc.utils.str_get_between(content, b'"clickTrackingParams":"', b'"')
  33. if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  34. self.logger.error(f'Unexpected itct value: {itct!r}')
  35. return
  36. itct = itct.decode('ascii')
  37. return continuationToken, itct
  38. async def continue_recursively(self, videoPageUrl, sessionToken, continuationToken, itct, nested = False, initial = False):
  39. '''
  40. Fetch the comments, recursively, including other sort orders.
  41. nested indicates whether the continuationToken is for nested comments (which require a different URL parameter), i.e. "View N replies" or "Show more replies".
  42. '''
  43. response, _ = await self.fetch(
  44. f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
  45. method = 'POST',
  46. data = {'session_token': sessionToken},
  47. headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
  48. responseHandler = responseHandler,
  49. )
  50. if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
  51. self.logger.error('Could not fetch initial comments')
  52. return
  53. content = await response.read()
  54. # Yes, the response is JSON and I could parse that into an object, but where's the fun in that?
  55. for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
  56. continuations = self.get_json_obj_from_pos(content, continuationsPos)
  57. subContinuationToken, subItct = self.get_continuation_parameters(continuations)
  58. await self.continue_recursively(videoPageUrl, sessionToken, subContinuationToken, subItct, nested = b'"label":' in continuations)
  59. if initial:
  60. sortMenuPos = content.find(b'"sortMenu":')
  61. if sortMenuPos < 0:
  62. self.logger.error('Could not find sort menu')
  63. return
  64. sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
  65. for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
  66. continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
  67. subContinuationToken, subItct = self.get_continuation_parameters(continuation)
  68. await self.continue_recursively(videoPageUrl, sessionToken, subContinuationToken, subItct)
  69. async def process(self):
  70. videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
  71. response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
  72. if not response or response.status != 200:
  73. self.logger.error('Could not fetch video page')
  74. return
  75. contents = await response.read()
  76. sessionToken = qwarc.utils.str_get_between(contents, b'"XSRF_TOKEN":"', b'"')
  77. if not sessionToken:
  78. self.logger.error('Could not find session token')
  79. return
  80. if sessionToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'=='):
  81. self.logger.error('Unexpected session token value: {sessionToken!r}')
  82. return
  83. sessionToken = sessionToken.decode('ascii')
  84. sectionIdentifierPos = contents.find(b'"comment-item-section"')
  85. if sectionIdentifierPos < 0:
  86. self.logger.error('Could not find comment section identifier')
  87. return
  88. continuationStartPos = contents.rfind(b'"continuation":', 0, sectionIdentifierPos)
  89. if continuationStartPos < 0:
  90. self.logger.error('Could not find continuation start position')
  91. return
  92. section = contents[continuationStartPos:sectionIdentifierPos]
  93. continuationToken = qwarc.utils.str_get_between(section, b'"continuation":"', b'"')
  94. if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'=='):
  95. self.logger.error('Unexpected continuation token value: {continuationToken!r}')
  96. return
  97. continuationToken = continuationToken.decode('ascii')
  98. itct = qwarc.utils.str_get_between(section, b'"clickTrackingParams":"', b'"')
  99. if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'=='):
  100. self.logger.error('Unexpected itct value: {itct!r}')
  101. return
  102. itct = itct.decode('ascii')
  103. await self.continue_recursively(videoPageUrl, sessionToken, continuationToken, itct, initial = True)
  104. specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))