A method to grab the comments from YouTube videos
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

110 lignes
5.0 KiB

  1. import collections
  2. import itertools
  3. import os
  4. import qwarc
  5. import qwarc.utils
  # Response handler shared by every fetch in this spec. It is built by qwarc's
  # limit/error/retries helper; the argument 5 is presumably the retry limit
  # (confirm against qwarc.utils).
  responseHandler = qwarc.utils.handle_response_limit_error_retries(
      5,
  )
  class Comments(qwarc.Item):
      """qwarc item that walks the comment continuations of one YouTube video.

      The item value is the video ID, taken from the ``YOUTUBE_VIDEOID``
      environment variable. This class only issues the requests and follows the
      continuation tokens found in the replies; it does not store comments
      itself (presumably qwarc records the fetched responses -- confirm against
      the qwarc documentation).
      """

      itemType = 'comments'
      # itemValue = '{videoId}'

      # Characters a token may consist of (URL-safe base64 alphabet).
      _TOKEN_CHARS = b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-'
      # What may remain after stripping _TOKEN_CHARS from the left of a token:
      # nothing, or up to two '=' padding characters, raw or percent-encoded.
      _TOKEN_TAILS = (b'', b'=', b'==', b'%3D', b'%3D%3D')

      @classmethod
      def generate(cls):
          """Yield the single item value: the video ID from ``YOUTUBE_VIDEOID``.

          Raises:
              KeyError: if the environment variable is not set.
          """
          yield os.environ['YOUTUBE_VIDEOID']

      @classmethod
      def _isWellFormedToken(cls, token):
          """Return True if the bytes ``token`` is token characters plus optional padding."""
          return token.lstrip(cls._TOKEN_CHARS) in cls._TOKEN_TAILS

      async def process(self):
          """Fetch the video page, then follow every comment continuation.

          Steps:
            1. Fetch the watch page and extract the XSRF session token.
            2. Locate the ``"comment-item-section"`` marker and take the last
               ``"continuation"`` / ``"clickTrackingParams"`` pair before it.
            3. Process a FIFO queue of ``(continuationToken, itct, nested)``
               entries by POSTing to ``comment_service_ajax``. The first reply is
               only used to find the "Newest first" sort continuation; later
               replies enqueue reply threads and next-page continuations.

          Any failure in steps 1-2 is logged and ends processing. A failed
          request in step 3 is logged and that queue entry is skipped.
          """
          videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
          response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
          if not response or response.status != 200:
              self.logger.error('Could not fetch video page')
              return
          content = await response.read()

          sessionToken = qwarc.utils.str_get_between(content, b'"XSRF_TOKEN":"', b'"')
          if not sessionToken:
              self.logger.error('Could not find session token')
              return
          if not self._isWellFormedToken(sessionToken):
              self.logger.error(f'Unexpected session token value: {sessionToken!r}')
              return
          sessionToken = sessionToken.decode('ascii')

          sectionIdentifierPos = content.find(b'"comment-item-section"')
          if sectionIdentifierPos < 0:
              self.logger.error('Could not find comment section identifier')
              return
          # The relevant continuation is the last one before the section marker.
          continuationStartPos = content.rfind(b'"continuation":', 0, sectionIdentifierPos)
          if continuationStartPos < 0:
              self.logger.error('Could not find continuation start position')
              return
          section = content[continuationStartPos:sectionIdentifierPos]

          continuationToken = qwarc.utils.str_get_between(section, b'"continuation":"', b'"')
          # Guard against a missing value before calling bytes methods on it,
          # mirroring the session token check above (assumes str_get_between
          # returns None when the delimiters are not found).
          if continuationToken is None:
              self.logger.error('Could not find continuation token')
              return
          if not self._isWellFormedToken(continuationToken):
              self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
              return
          continuationToken = continuationToken.decode('ascii')

          itct = qwarc.utils.str_get_between(section, b'"clickTrackingParams":"', b'"')
          if itct is None:
              self.logger.error('Could not find itct')
              return
          if not self._isWellFormedToken(itct):
              self.logger.error(f'Unexpected itct value: {itct!r}')
              return
          itct = itct.decode('ascii')

          queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
          queue.append((continuationToken, itct, False))
          first = True
          while queue:
              continuationToken, itct, nested = queue.popleft()
              action = 'comments' if not nested else 'comment_replies'
              response, _ = await self.fetch(
                  f'https://www.youtube.com/comment_service_ajax?action_get_{action}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
                  method = 'POST',
                  data = {'session_token': sessionToken},
                  headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
                  responseHandler = responseHandler,
              )
              if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
                  self.logger.error('Error fetching comments, skipping')
                  continue
              obj = await response.json()

              if first:
                  # The first reply is only used to switch to "Newest first";
                  # its own contents are not traversed.
                  sortMenu = obj['response']['continuationContents']['itemSectionContinuation']['header']['commentsHeaderRenderer']['sortMenu']
                  for subMenuItem in sortMenu['sortFilterSubMenuRenderer']['subMenuItems']:
                      if subMenuItem['title'] != 'Newest first':
                          continue
                      subContinuation = subMenuItem['continuation']['reloadContinuationData']
                      queue.append((subContinuation['continuation'], subContinuation['clickTrackingParams'], False))
                      break
                  else:
                      self.logger.error('Could not find newest first sort continuation')
                  first = False
                  continue

              if not nested:
                  o = obj
                  continuationKey = 'itemSectionContinuation'
              else:
                  # Of course the data format is different here...
                  # The reply is a sequence; pick the first entry carrying a
                  # 'response'. Using next() with a default avoids silently
                  # reusing a stale entry from an earlier iteration when
                  # nothing matches or the sequence is empty.
                  o = next((entry for entry in obj if 'response' in entry), None)
                  if o is None:
                      self.logger.error('Could not find response in comment replies data, skipping')
                      continue
                  continuationKey = 'commentRepliesContinuation'

              if 'continuationContents' not in o['response']:
                  # Empty response
                  continue
              page = o['response']['continuationContents'][continuationKey]

              for reply in page['contents']:
                  if 'commentThreadRenderer' in reply and 'replies' in reply['commentThreadRenderer']:
                      # Nested continuations
                      continuations = reply['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']
                      # Exactly one continuation is expected; fail loudly otherwise.
                      assert len(continuations) == 1
                      queue.append((continuations[0]['nextContinuationData']['continuation'], continuations[0]['nextContinuationData']['clickTrackingParams'], True))

              if 'continuations' in page:
                  assert len(page['continuations']) == 1
                  continuation = page['continuations'][0]['nextContinuationData']
                  # The next page keeps the same nesting level as the current one.
                  queue.append((continuation['continuation'], continuation['clickTrackingParams'], nested))
  # Spec dependencies passed to qwarc, carrying the video ID (read from the
  # YOUTUBE_VIDEOID environment variable) as an extra 'videoId' entry.
  _videoIdDependency = ('videoId', os.environ['YOUTUBE_VIDEOID'])
  specDependencies = qwarc.utils.SpecDependencies(extra = (_videoIdDependency,))
  del _videoIdDependency