ソースを参照

Refactor pagination parsing to use the JSON objects instead of stupidly extracting all continuation tokens, and only retrieve the 'newest first' sort order

master
JustAnotherArchivist 3年前
コミット
2ddcec9fbb
1個のファイルの変更、34行の追加、45行の削除
  1. +34
    -45
      comments.py

+ 34
- 45
comments.py ファイルの表示

@@ -16,34 +16,6 @@ class Comments(qwarc.Item):
def generate(cls):
    """Yield exactly one item value: the contents of the YOUTUBE_VIDEOID environment variable.

    The environment lookup happens when the generator is first advanced, and
    raises KeyError if the variable is not set.
    """
    videoId = os.environ['YOUTUBE_VIDEOID']
    yield videoId

def get_json_obj_from_pos(self, content, startPos):
    """Extract the bracketed span of ``content`` that begins at or after ``startPos``.

    Scans ``content`` (a bytes object) forward from ``startPos``, treating
    ``{`` and ``[`` as openers and ``}`` and ``]`` as closers (the two kinds
    are counted together, not matched against each other). The scan stops at
    the closer that brings the nesting depth back to zero.

    Returns the slice from ``startPos`` up to, but NOT including, that final
    closer. Callers only search the result for substrings, so the missing
    trailing bracket is part of the existing contract and is kept as is.

    If the end of ``content`` is reached before the brackets balance (or no
    opener is found at all), the remainder ``content[startPos:]`` is returned
    instead of scanning forever. Closers that appear before the first opener
    are ignored rather than raising.

    Known limitation: brackets inside JSON string values are counted like any
    other bracket, because quoting is not tracked.
    """
    depth = 0
    # Bounded scan: the previous unbounded counter never terminated on
    # truncated or unbalanced input, since slicing past the end yields b''.
    for pos in range(startPos, len(content)):
        char = content[pos:pos + 1]
        if char in (b'{', b'['):
            depth += 1
        elif char in (b'}', b']') and depth > 0:
            depth -= 1
            if depth == 0:
                return content[startPos:pos]
    return content[startPos:]

def get_continuation_parameters(self, content):
    """Pull the continuation token and click-tracking value (itct) out of ``content``.

    Both values are taken from the text between a fixed JSON key prefix and
    the next double quote, using qwarc.utils.str_get_between. Each value must
    consist of URL-safe base64 characters optionally followed by up to two
    ``=`` padding characters (literal or percent-encoded as ``%3D``).

    Returns a ``(continuationToken, itct)`` tuple of str on success. If either
    value fails validation, an error is logged and None is returned.
    """
    tokenAlphabet = b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-'
    allowedPadding = (b'', b'=', b'==', b'%3D', b'%3D%3D')

    continuationToken = qwarc.utils.str_get_between(content, b'"continuation":"', b'"')
    # Stripping the leading alphabet characters must leave nothing but padding.
    continuationTokenOk = continuationToken.lstrip(tokenAlphabet) in allowedPadding
    if not continuationTokenOk:
        self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
        return

    itct = qwarc.utils.str_get_between(content, b'"clickTrackingParams":"', b'"')
    itctOk = itct.lstrip(tokenAlphabet) in allowedPadding
    if not itctOk:
        self.logger.error(f'Unexpected itct value: {itct!r}')
        return

    # Validation above guarantees both values are pure ASCII.
    return continuationToken.decode('ascii'), itct.decode('ascii')

async def process(self):
videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
@@ -81,7 +53,7 @@ class Comments(qwarc.Item):
return
itct = itct.decode('ascii')

queue = collections.deque() # of (continuationToken, itct, nested, initial) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
queue.append((continuationToken, itct, False))
first = True
while queue:
@@ -96,25 +68,42 @@ class Comments(qwarc.Item):
if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
self.logger.error('Error fetching comments, skipping')
continue
content = await response.read()
# Yes, the response is JSON and I could parse that into an object, but where's the fun in that?

for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
continuations = self.get_json_obj_from_pos(content, continuationsPos)
subContinuationToken, subItct = self.get_continuation_parameters(continuations)
queue.append((subContinuationToken, subItct, b'"label":' in continuations))
obj = await response.json()

if first:
sortMenuPos = content.find(b'"sortMenu":')
if sortMenuPos < 0:
self.logger.error('Could not find sort menu')
sortMenu = obj['response']['continuationContents']['itemSectionContinuation']['header']['commentsHeaderRenderer']['sortMenu']
for subMenuItem in sortMenu['sortFilterSubMenuRenderer']['subMenuItems']:
if subMenuItem['title'] != 'Newest first':
continue
subContinuation = subMenuItem['continuation']['reloadContinuationData']
queue.append((subContinuation['continuation'], subContinuation['clickTrackingParams'], False))
break
else:
self.logger.error('Could not find newest first sort continuation')
first = False
else:
if not nested:
o = obj
continuationKey = 'itemSectionContinuation'
else:
sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
subContinuationToken, subItct = self.get_continuation_parameters(continuation)
queue.append((subContinuationToken, subItct, False))
# Of course the data format is different here...
for o in obj:
if 'response' in o:
break
continuationKey = 'commentRepliesContinuation'
if 'continuationContents' not in o['response']:
# Empty response
continue
for reply in o['response']['continuationContents'][continuationKey]['contents']:
if 'commentThreadRenderer' in reply and 'replies' in reply['commentThreadRenderer']:
# Nested continuations
continuations = reply['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']
assert len(continuations) == 1
queue.append((continuations[0]['nextContinuationData']['continuation'], continuations[0]['nextContinuationData']['clickTrackingParams'], True))
if 'continuations' in o['response']['continuationContents'][continuationKey]:
assert len(o['response']['continuationContents'][continuationKey]['continuations']) == 1
continuation = o['response']['continuationContents'][continuationKey]['continuations'][0]['nextContinuationData']
queue.append((continuation['continuation'], continuation['clickTrackingParams'], nested))

first = False

# Passes the pair ('videoId', <value of the YOUTUBE_VIDEOID environment variable>) as the
# `extra` argument; the environment variable is read when this statement executes.
# NOTE(review): presumably this records the video ID alongside the spec's other dependencies — confirm against qwarc.utils.SpecDependencies.
specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))

読み込み中…
キャンセル
保存