From 2ddcec9fbb6a5ca7b3385ff1b15ae5cd42f129ac Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 2 Sep 2020 02:19:05 +0000 Subject: [PATCH] Refactor pagination parsing to use the JSON objects instead of stupidly extracting all continuation tokens, and only retrieve the 'newest first' sort order --- comments.py | 79 +++++++++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 45 deletions(-) diff --git a/comments.py b/comments.py index 7ed76e6..85cadf5 100644 --- a/comments.py +++ b/comments.py @@ -16,34 +16,6 @@ class Comments(qwarc.Item): def generate(cls): yield os.environ['YOUTUBE_VIDEOID'] - def get_json_obj_from_pos(self, content, startPos): - # Given a startPos in content, extracts the content until braces or brackets are matching (requiring at least one set of parentheses) - openParens = None - for pos in itertools.count(start = startPos): - char = content[pos:pos+1] - if char in (b'{', b'['): - if openParens is None: # First {[ in the string - openParens = 0 - openParens += 1 - elif char in (b'}', b']'): - openParens -= 1 - if openParens == 0: - break - return content[startPos:pos] - - def get_continuation_parameters(self, content): - continuationToken = qwarc.utils.str_get_between(content, b'"continuation":"', b'"') - if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'): - self.logger.error(f'Unexpected continuation token value: {continuationToken!r}') - return - continuationToken = continuationToken.decode('ascii') - itct = qwarc.utils.str_get_between(content, b'"clickTrackingParams":"', b'"') - if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'): - self.logger.error(f'Unexpected itct value: {itct!r}') - return - itct = itct.decode('ascii') - return continuationToken, itct - async def process(self): videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}' response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler) @@ -81,7 +53,7 @@ class Comments(qwarc.Item): return itct = itct.decode('ascii') - queue = collections.deque() # of (continuationToken, itct, nested, initial) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments") + queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments") queue.append((continuationToken, itct, False)) first = True while queue: @@ -96,25 +68,42 @@ class Comments(qwarc.Item): if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'): self.logger.error('Error fetching comments, skipping') continue - content = await response.read() - # Yes, the response is JSON and I could parse that into an object, but where's the fun in that? - - for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'): - continuations = self.get_json_obj_from_pos(content, continuationsPos) - subContinuationToken, subItct = self.get_continuation_parameters(continuations) - queue.append((subContinuationToken, subItct, b'"label":' in continuations)) + obj = await response.json() if first: - sortMenuPos = content.find(b'"sortMenu":') - if sortMenuPos < 0: - self.logger.error('Could not find sort menu') + sortMenu = obj['response']['continuationContents']['itemSectionContinuation']['header']['commentsHeaderRenderer']['sortMenu'] + for subMenuItem in sortMenu['sortFilterSubMenuRenderer']['subMenuItems']: + if subMenuItem['title'] != 'Newest first': + continue + subContinuation = subMenuItem['continuation']['reloadContinuationData'] + queue.append((subContinuation['continuation'], subContinuation['clickTrackingParams'], False)) + break + else: + self.logger.error('Could not find newest first sort continuation') + first = False + else: + if not nested: + o = obj + continuationKey = 'itemSectionContinuation' else: - sortMenu = self.get_json_obj_from_pos(content, sortMenuPos) - for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'): - continuation = self.get_json_obj_from_pos(sortMenu, continuationPos) - subContinuationToken, subItct = self.get_continuation_parameters(continuation) - queue.append((subContinuationToken, subItct, False)) + # Of course the data format is different here... + for o in obj: + if 'response' in o: + break + continuationKey = 'commentRepliesContinuation' + if 'continuationContents' not in o['response']: + # Empty response + continue + for reply in o['response']['continuationContents'][continuationKey]['contents']: + if 'commentThreadRenderer' in reply and 'replies' in reply['commentThreadRenderer']: + # Nested continuations + continuations = reply['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations'] + assert len(continuations) == 1 + queue.append((continuations[0]['nextContinuationData']['continuation'], continuations[0]['nextContinuationData']['clickTrackingParams'], True)) + if 'continuations' in o['response']['continuationContents'][continuationKey]: + assert len(o['response']['continuationContents'][continuationKey]['continuations']) == 1 + continuation = o['response']['continuationContents'][continuationKey]['continuations'][0]['nextContinuationData'] + queue.append((continuation['continuation'], continuation['clickTrackingParams'], nested)) - first = False specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))