|
|
@@ -1,3 +1,4 @@ |
|
|
|
import collections |
|
|
|
import itertools |
|
|
|
import os |
|
|
|
import qwarc |
|
|
@@ -43,40 +44,6 @@ class Comments(qwarc.Item): |
|
|
|
itct = itct.decode('ascii') |
|
|
|
return continuationToken, itct |
|
|
|
|
|
|
|
async def continue_recursively(self, videoPageUrl, sessionToken, continuationToken, itct, nested = False, initial = False):
    '''
    Fetch the comments, following every continuation found in the responses, including other sort orders.

    Despite the name, the traversal is iterative: an explicit stack replaces the call stack so that
    long chains of continuations cannot exceed the interpreter's recursion limit, and so that the
    body of an already-parsed response is not kept alive while its descendants are fetched.
    Requests are still issued in the same depth-first order as a recursive walk would produce.

    videoPageUrl is sent in the X-SPF-Referer and X-SPF-Previous request headers.
    sessionToken is sent as the 'session_token' POST field.
    continuationToken and itct are interpolated into the request URL.
    nested indicates whether the continuationToken is for nested comments (which require a different URL parameter), i.e. "View N replies" or "Show more replies".
    initial indicates that the first response should also be searched for a sort menu whose continuations are followed as well.

    Returns None. A failed fetch is logged and only that branch is abandoned.
    '''
    # Each entry is (continuationToken, itct, nested, initial).
    pending = [(continuationToken, itct, nested, initial)]
    while pending:
        token, clickTracking, isNested, isInitial = pending.pop()
        action = 'comment_replies' if isNested else 'comments'
        response, _ = await self.fetch(
            f'https://www.youtube.com/comment_service_ajax?action_get_{action}=1&pbj=1&ctoken={token}&continuation={token}&itct={clickTracking}',
            method = 'POST',
            data = {'session_token': sessionToken},
            headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
            responseHandler = responseHandler,
        )
        if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
            # Only the very first request is the "initial" one; follow-up failures get their own message.
            self.logger.error('Could not fetch initial comments' if isInitial else 'Error fetching comments, skipping')
            continue
        content = await response.read()
        # Yes, the response is JSON and I could parse that into an object, but where's the fun in that?

        # Collect follow-up requests in document order first; they are pushed in reverse below
        # so that popping from the end of the stack visits them in this same order.
        followUps = []
        for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
            continuations = self.get_json_obj_from_pos(content, continuationsPos)
            subContinuationToken, subItct = self.get_continuation_parameters(continuations)
            followUps.append((subContinuationToken, subItct, b'"label":' in continuations, False))

        if isInitial:
            sortMenuPos = content.find(b'"sortMenu":')
            if sortMenuPos < 0:
                # Note: this is now logged before the follow-ups above are fetched rather than after.
                self.logger.error('Could not find sort menu')
            else:
                sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
                for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
                    continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
                    subContinuationToken, subItct = self.get_continuation_parameters(continuation)
                    followUps.append((subContinuationToken, subItct, False, False))

        pending.extend(reversed(followUps))
|
|
|
|
|
|
|
async def process(self): |
|
|
|
videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}' |
|
|
|
response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler) |
|
|
@@ -114,7 +81,40 @@ class Comments(qwarc.Item): |
|
|
|
return |
|
|
|
itct = itct.decode('ascii') |
|
|
|
|
|
|
|
await self.continue_recursively(videoPageUrl, sessionToken, continuationToken, itct, initial = True) |
|
|
|
|
|
|
|
queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more replies") |
|
|
|
queue.append((continuationToken, itct, False)) |
|
|
|
first = True |
|
|
|
while queue: |
|
|
|
continuationToken, itct, nested = queue.popleft() |
|
|
|
response, _ = await self.fetch( |
|
|
|
f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}', |
|
|
|
method = 'POST', |
|
|
|
data = {'session_token': sessionToken}, |
|
|
|
headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)], |
|
|
|
responseHandler = responseHandler, |
|
|
|
) |
|
|
|
if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'): |
|
|
|
self.logger.error('Error fetching comments, skipping') |
|
|
|
continue |
|
|
|
content = await response.read() |
|
|
|
# Yes, the response is JSON and I could parse that into an object, but where's the fun in that? |
|
|
|
|
|
|
|
for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'): |
|
|
|
continuations = self.get_json_obj_from_pos(content, continuationsPos) |
|
|
|
subContinuationToken, subItct = self.get_continuation_parameters(continuations) |
|
|
|
queue.append((subContinuationToken, subItct, b'"label":' in continuations)) |
|
|
|
|
|
|
|
if first: |
|
|
|
sortMenuPos = content.find(b'"sortMenu":') |
|
|
|
if sortMenuPos < 0: |
|
|
|
self.logger.error('Could not find sort menu') |
|
|
|
else: |
|
|
|
sortMenu = self.get_json_obj_from_pos(content, sortMenuPos) |
|
|
|
for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'): |
|
|
|
continuation = self.get_json_obj_from_pos(sortMenu, continuationPos) |
|
|
|
subContinuationToken, subItct = self.get_continuation_parameters(continuation) |
|
|
|
queue.append((subContinuationToken, subItct, False)) |
|
|
|
|
|
|
|
first = False |
|
|
|
|
|
|
|
# Passes the value of the YOUTUBE_VIDEOID environment variable to qwarc.utils.SpecDependencies as the extra entry 'videoId'.
# The lookup happens when this statement is executed and raises KeyError if the variable is not set.
specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),)) |