A method to grab the comments from YouTube videos
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

110 rader
5.0 KiB

  1. import collections
  2. import itertools
  3. import os
  4. import qwarc
  5. import qwarc.utils
  6. responseHandler = qwarc.utils.handle_response_limit_error_retries(5)
  7. class Comments(qwarc.Item):
  8. itemType = 'comments'
  9. # itemValue = '{videoId}'
  10. @classmethod
  11. def generate(cls):
  12. yield os.environ['YOUTUBE_VIDEOID']
  13. async def process(self):
  14. videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
  15. response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
  16. if not response or response.status != 200:
  17. self.logger.error('Could not fetch video page')
  18. return
  19. content = await response.read()
  20. sessionToken = qwarc.utils.str_get_between(content, b'"XSRF_TOKEN":"', b'"')
  21. if not sessionToken:
  22. self.logger.error('Could not find session token')
  23. return
  24. if sessionToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  25. self.logger.error(f'Unexpected session token value: {sessionToken!r}')
  26. return
  27. sessionToken = sessionToken.decode('ascii')
  28. sectionIdentifierPos = content.find(b'"comment-item-section"')
  29. if sectionIdentifierPos < 0:
  30. self.logger.error('Could not find comment section identifier')
  31. return
  32. continuationStartPos = content.rfind(b'"continuation":', 0, sectionIdentifierPos)
  33. if continuationStartPos < 0:
  34. self.logger.error('Could not find continuation start position')
  35. return
  36. section = content[continuationStartPos:sectionIdentifierPos]
  37. continuationToken = qwarc.utils.str_get_between(section, b'"continuation":"', b'"')
  38. if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  39. self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
  40. return
  41. continuationToken = continuationToken.decode('ascii')
  42. itct = qwarc.utils.str_get_between(section, b'"clickTrackingParams":"', b'"')
  43. if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  44. self.logger.error(f'Unexpected itct value: {itct!r}')
  45. return
  46. itct = itct.decode('ascii')
  47. queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
  48. queue.append((continuationToken, itct, False))
  49. first = True
  50. while queue:
  51. continuationToken, itct, nested = queue.popleft()
  52. response, _ = await self.fetch(
  53. f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
  54. method = 'POST',
  55. data = {'session_token': sessionToken},
  56. headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
  57. responseHandler = responseHandler,
  58. )
  59. if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
  60. self.logger.error('Error fetching comments, skipping')
  61. continue
  62. obj = await response.json()
  63. if first:
  64. sortMenu = obj['response']['continuationContents']['itemSectionContinuation']['header']['commentsHeaderRenderer']['sortMenu']
  65. for subMenuItem in sortMenu['sortFilterSubMenuRenderer']['subMenuItems']:
  66. if subMenuItem['title'] != 'Newest first':
  67. continue
  68. subContinuation = subMenuItem['continuation']['reloadContinuationData']
  69. queue.append((subContinuation['continuation'], subContinuation['clickTrackingParams'], False))
  70. break
  71. else:
  72. self.logger.error('Could not find newest first sort continuation')
  73. first = False
  74. else:
  75. if not nested:
  76. o = obj
  77. continuationKey = 'itemSectionContinuation'
  78. else:
  79. # Of course the data format is different here...
  80. for o in obj:
  81. if 'response' in o:
  82. break
  83. continuationKey = 'commentRepliesContinuation'
  84. if 'continuationContents' not in o['response']:
  85. # Empty response
  86. continue
  87. for reply in o['response']['continuationContents'][continuationKey]['contents']:
  88. if 'commentThreadRenderer' in reply and 'replies' in reply['commentThreadRenderer']:
  89. # Nested continuations
  90. continuations = reply['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']
  91. assert len(continuations) == 1
  92. queue.append((continuations[0]['nextContinuationData']['continuation'], continuations[0]['nextContinuationData']['clickTrackingParams'], True))
  93. if 'continuations' in o['response']['continuationContents'][continuationKey]:
  94. assert len(o['response']['continuationContents'][continuationKey]['continuations']) == 1
  95. continuation = o['response']['continuationContents'][continuationKey]['continuations'][0]['nextContinuationData']
  96. queue.append((continuation['continuation'], continuation['clickTrackingParams'], nested))
  97. specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))