From 3ed2f2178b003e3734777f675ebaae7a91af51b6 Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Fri, 2 Oct 2020 17:53:39 -0400
Subject: [PATCH] Retrieve all channels and playlists on channel items

---
Review notes (text between "---" and the first "diff --git" is ignored by git am):
- This patch was recovered from a copy whose line breaks and indentation had
  been collapsed. Blank context lines were placed using the hunk line counts;
  the indentation of context lines in tracker.py and worker.py is
  reconstructed and must be confirmed against the tree before applying.
- youtube_channel.py and youtube_util.py were revised during review and no
  longer match the blobs of the original commit, so their "index" header
  lines are omitted.

 tracker.py         |   2 +-
 worker.py          |  12 +++--
 youtube_channel.py | 102 +++++++++++++++++++++++++++++++++++++++++++++
 youtube_util.py    |  43 ++++++++++++++++++
 4 files changed, 154 insertions(+), 5 deletions(-)
 create mode 100644 youtube_channel.py
 create mode 100644 youtube_util.py

diff --git a/tracker.py b/tracker.py
index 4d29de3..f3782bf 100644
--- a/tracker.py
+++ b/tracker.py
@@ -9,7 +9,7 @@ from os.path import isfile
 from json import loads
 
 # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
-VERSION = "20201001.01"
+VERSION = "20201002.01"
 
 TRACKER_ID = "ext-yt-communitycontribs"
 TRACKER_HOST = "trackerproxy.meo.ws"
diff --git a/worker.py b/worker.py
index 8cfbede..bf9738c 100644
--- a/worker.py
+++ b/worker.py
@@ -5,6 +5,8 @@ from os import mkdir, rmdir, listdir, system, environ
 from os.path import isdir, isfile, getsize
 from json import dumps, loads
 
+from youtube_channel import main
+
 import signal
 
 import tracker
@@ -176,10 +178,12 @@ def threadrunner():
                         jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
 
                     #channel created playlists
-                    y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1]+"/playlists?view=1", download=False)
-                    for itemyv in y["entries"]:
-                        jobs.put(("submitdiscovery", itemyv["url"].split("?list=", 1)[1], tracker.ItemType.Playlist)) #[38:]
-                    #TODO: saved playlists, featured channels
+                    y = main(desit.split(":", 1)[1])
+                    for itemyv in y["playlists"]:
+                        jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist))
+                    for itemyv in y["channels"]:
+                        jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Channel))
+
                     jobs.put(("complete", None, "channel:"+args))
                 except:
                     print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/channel/"+desit.split(":", 1)[1])
diff --git a/youtube_channel.py b/youtube_channel.py
new file mode 100644
--- /dev/null
+++ b/youtube_channel.py
@@ -0,0 +1,102 @@
+"""Collect the playlists and channels shown on a YouTube channel's tabs."""
+from urllib.parse import urljoin
+
+# Reuse youtube_util's session so the client version is looked up only once.
+from youtube_util import getinitialdata, fullyexpand, mysession, lver
+
+# TODO: Rate limit detection, HTTP3?
+
+BASE_URL = "https://www.youtube.com/"
+
+
+def _fetch(path: str):
+    """Download a youtube.com page and return its parsed ytInitialData."""
+    # urljoin accepts both "channel/..." and "/channel/..." without doubling "/".
+    return getinitialdata(mysession.get(urljoin(BASE_URL, path)).text)
+
+
+def _tab_index(initdata: dict, name: str):
+    """Return the position of the tab whose URL ends in "/<name>"."""
+    tabs = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]
+    for index, tab in enumerate(tabs):
+        # Entries without a tabRenderer still occupy a position in the list.
+        if "tabRenderer" not in tab:
+            continue
+        url = tab["tabRenderer"]["endpoint"]["commandMetadata"]["webCommandMetadata"]["url"]
+        if url.rsplit("/", 1)[-1] == name:
+            return index
+    # Fail loudly rather than silently falling back to tab 0.
+    raise KeyError("no %r tab on channel page" % name)
+
+
+def _sections(initdata: dict, index: int):
+    """Return the section list of the tab at position *index*."""
+    tab = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][index]
+    return tab["tabRenderer"]["content"]["sectionListRenderer"]["contents"]
+
+
+def _walk_tab(initdata: dict, index: int):
+    """Yield every grid item of a tab, following shelves and continuations."""
+    shelfres = set()
+    for section in _sections(initdata, index):
+        itemint = section["itemSectionRenderer"]["contents"][0]
+        if "shelfRenderer" in itemint:
+            # Shelf items are read from the shelf's own page, fetched below.
+            shelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"])
+        elif "gridRenderer" in itemint:
+            yield from fullyexpand(itemint["gridRenderer"])["items"]
+
+    for shelfurl in shelfres:
+        itemint = _sections(_fetch(shelfurl), index)[0]["itemSectionRenderer"]["contents"][0]
+        yield from fullyexpand(itemint["gridRenderer"])["items"]
+
+
+def main(channelid: str):
+    """Return the playlist and channel IDs found on a channel's tabs.
+
+    Reads the "playlists" and "channels" tabs of *channelid* and returns
+    ``{"playlists": set, "channels": set}``.  Channels named in a playlist's
+    byline are added to the channel set too.  Raises KeyError when the page
+    lacks the expected data or one of the two tabs.
+    """
+    playlists = set()
+    channellist = set()
+
+    # PLAYLISTS
+    initdata = _fetch("/channel/" + str(channelid) + "/playlists")
+    # Both tab positions are taken from this page and reused below.
+    playlists_id = _tab_index(initdata, "playlists")
+    channels_id = _tab_index(initdata, "channels")
+
+    for item in _walk_tab(initdata, playlists_id):
+        playlist = item["gridPlaylistRenderer"]
+        playlists.add(playlist["playlistId"])
+        if "shortBylineText" in playlist:
+            channellist.add(playlist["shortBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
+
+    # CHANNELS
+    initdata = _fetch("/channel/" + str(channelid) + "/channels")
+    for item in _walk_tab(initdata, channels_id):
+        channellist.add(item["gridChannelRenderer"]["channelId"])
+
+    return {"playlists": playlists, "channels": channellist}
+
+
+if __name__ == "__main__":
+    from sys import argv
+    for channel in argv[1:]:
+        print(main(channel))
+
+# SAMPLES:
+# UCqj7Cz7revf5maW9g5pgNcg lots of playlists
+# UCRwczJ_nk1t9IGHyHfHbXRQ Nathaniel Bandy - created playlists only, featured channels only
+# UCo8bcnLyZH8tBIH9V1mLgqQ the odd 1 is out - shelf, way too many subscriptions
+# UCfXIV2vThxEF8Hq2OE17AeQ no playlists or channels featured
+
+# UCJqV2-l0jqAa7uYN8IGJW7w TONS OF SUBSCRIPTIONS, no featured channels
+
+# UC_1nZUpPS6jFv5Pn3f85CaA TONS OF SUBSCRIPTIONS, some featured channels
+
+# UCJOh5FKisc0hUlEeWFBlD-w no subscriptions, plenty of featured channels
+
+# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels
diff --git a/youtube_util.py b/youtube_util.py
new file mode 100644
--- /dev/null
+++ b/youtube_util.py
@@ -0,0 +1,43 @@
+"""Helpers shared by the YouTube page scrapers."""
+from requests import session
+from json import loads
+from urllib.parse import unquote
+
+INITIAL_DATA_PREFIX = 'window["ytInitialData"] = '
+
+
+def getinitialdata(html: str):
+    """Return the ytInitialData object embedded in *html*, or {} if absent."""
+    for line in html.splitlines():
+        line = line.strip()
+        if line.startswith(INITIAL_DATA_PREFIX):
+            # Drop the assignment prefix and the statement's trailing ";".
+            return loads(line[len(INITIAL_DATA_PREFIX):].rstrip(";"))
+    return {}
+
+
+mysession = session()
+# Extract the latest client version automatically; fall back to a known one
+# when the request fails or the response is laid out differently.
+try:
+    lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
+except Exception:
+    lver = "2.20201002.02.01"
+mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"})
+
+
+def fullyexpand(inputdict: dict):
+    """Append the items of every continuation page to *inputdict* in place.
+
+    Follows ``continuations`` links until a response has none, extending
+    ``inputdict["items"]`` each time, and returns *inputdict*.
+    """
+    lastrequestj = inputdict
+    while "continuations" in lastrequestj:
+        lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]))
+        # Surface HTTP errors (e.g. rate limiting) instead of a parse failure.
+        lastrequest.raise_for_status()
+        lastrequestj = lastrequest.json()[1]["response"]["continuationContents"]["gridContinuation"]
+        inputdict["items"].extend(lastrequestj["items"])
+
+    return inputdict