From 2ddcec9fbb6a5ca7b3385ff1b15ae5cd42f129ac Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Wed, 2 Sep 2020 02:19:05 +0000
Subject: [PATCH] Refactor pagination parsing to use the JSON objects instead
 of stupidly extracting all continuation tokens, and only retrieve the 'newest
 first' sort order

---
 comments.py | 79 +++++++++++++++++++++++------------------------------
 1 file changed, 34 insertions(+), 45 deletions(-)

diff --git a/comments.py b/comments.py
index 7ed76e6..85cadf5 100644
--- a/comments.py
+++ b/comments.py
@@ -16,34 +16,6 @@ class Comments(qwarc.Item):
 	def generate(cls):
 		yield os.environ['YOUTUBE_VIDEOID']
 
-	def get_json_obj_from_pos(self, content, startPos):
-		# Given a startPos in content, extracts the content until braces or brackets are matching (requiring at least one set of parentheses)
-		openParens = None
-		for pos in itertools.count(start = startPos):
-			char = content[pos:pos+1]
-			if char in (b'{', b'['):
-				if openParens is None: # First {[ in the string
-					openParens = 0
-				openParens += 1
-			elif char in (b'}', b']'):
-				openParens -= 1
-			if openParens == 0:
-				break
-		return content[startPos:pos]
-
-	def get_continuation_parameters(self, content):
-		continuationToken = qwarc.utils.str_get_between(content, b'"continuation":"', b'"')
-		if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
-			self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
-			return
-		continuationToken = continuationToken.decode('ascii')
-		itct = qwarc.utils.str_get_between(content, b'"clickTrackingParams":"', b'"')
-		if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
-			self.logger.error(f'Unexpected itct value: {itct!r}')
-			return
-		itct = itct.decode('ascii')
-		return continuationToken, itct
-
 	async def process(self):
 		videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
 		response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
@@ -81,7 +53,7 @@ class Comments(qwarc.Item):
 			return
 		itct = itct.decode('ascii')
 
-		queue = collections.deque() # of (continuationToken, itct, nested, initial) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
+		queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
 		queue.append((continuationToken, itct, False))
 		first = True
 		while queue:
@@ -96,25 +68,42 @@ class Comments(qwarc.Item):
 			if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
 				self.logger.error('Error fetching comments, skipping')
 				continue
-			content = await response.read()
-			# Yes, the response is JSON and I could parse that into an object, but where's the fun in that?
-
-			for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
-				continuations = self.get_json_obj_from_pos(content, continuationsPos)
-				subContinuationToken, subItct = self.get_continuation_parameters(continuations)
-				queue.append((subContinuationToken, subItct, b'"label":' in continuations))
+			obj = await response.json()
 
 			if first:
-				sortMenuPos = content.find(b'"sortMenu":')
-				if sortMenuPos < 0:
-					self.logger.error('Could not find sort menu')
+				sortMenu = obj['response']['continuationContents']['itemSectionContinuation']['header']['commentsHeaderRenderer']['sortMenu']
+				for subMenuItem in sortMenu['sortFilterSubMenuRenderer']['subMenuItems']:
+					if subMenuItem['title'] != 'Newest first':
+						continue
+					subContinuation = subMenuItem['continuation']['reloadContinuationData']
+					queue.append((subContinuation['continuation'], subContinuation['clickTrackingParams'], False))
+					break
+				else:
+					self.logger.error('Could not find newest first sort continuation')
+				first = False
+			else:
+				if not nested:
+					o = obj
+					continuationKey = 'itemSectionContinuation'
 				else:
-					sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
-					for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
-						continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
-						subContinuationToken, subItct = self.get_continuation_parameters(continuation)
-						queue.append((subContinuationToken, subItct, False))
+					# Nested (reply) responses are a list of objects rather than a single object; use the first element that has a 'response' key
+					for o in obj:
+						if 'response' in o:
+							break
+					continuationKey = 'commentRepliesContinuation'
+				if 'continuationContents' not in o['response']:
+					# Empty response: no 'continuationContents' key, so there is nothing to parse; move on to the next queue entry
+					continue
+				for reply in o['response']['continuationContents'][continuationKey]['contents']:
+					if 'commentThreadRenderer' in reply and 'replies' in reply['commentThreadRenderer']:
+						# Nested continuations
+						continuations = reply['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']
+						assert len(continuations) == 1
+						queue.append((continuations[0]['nextContinuationData']['continuation'], continuations[0]['nextContinuationData']['clickTrackingParams'], True))
+				if 'continuations' in o['response']['continuationContents'][continuationKey]:
+					assert len(o['response']['continuationContents'][continuationKey]['continuations']) == 1
+					continuation = o['response']['continuationContents'][continuationKey]['continuations'][0]['nextContinuationData']
+					queue.append((continuation['continuation'], continuation['clickTrackingParams'], nested))
 
-			first = False
 
 specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))