diff --git a/discovery.py b/discovery.py
index c6ad4a5..bf3c24c 100644
--- a/discovery.py
+++ b/discovery.py
@@ -123,34 +123,11 @@ def getmetadata(mysession, vid, ccenabledonly=False):
         except BaseException as e:
             print(e)
             print("Exception in discovery, continuing anyway")
-
-        creditdata = {}
-
-        if not ccenabledonly:
-            try:
-                mdinfo = initdata["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
-                for item in mdinfo:
-                    if item["metadataRowRenderer"]["title"]["simpleText"].startswith("Caption author"): #the request to /watch needs to be in English for this to work
-                        try:
-                            desl = langcodes[item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]]
-                        except KeyError as e:
-                            #print(e)
-                            print("Language code conversion error, using language name")
-                            desl = item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]
-                        creditdata[desl] = []
-                        for itemint in item["metadataRowRenderer"]["contents"]:
-                            creditdata[desl].append({"name": itemint["runs"][0]["text"], "channel": itemint["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]})
-
-            except KeyError as e:
-                #print("Video does not have credits")
-                pass
-                #raise
-                #print(e)

         if initplay and (initdata or ccenabledonly):
             break

-    return ccenabled, creditdata, recvids, recchans, recmixes, recplayl
+    return ccenabled, recvids, recchans, recmixes, recplayl

 if __name__ == "__main__":
     from sys import argv
diff --git a/tracker.py b/tracker.py
index f3782bf..a78ec20 100644
--- a/tracker.py
+++ b/tracker.py
@@ -9,7 +9,7 @@ from os.path import isfile
 from json import loads

 # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
-VERSION = "20201002.01"
+VERSION = "20201017.01"

 TRACKER_ID = "ext-yt-communitycontribs"
 TRACKER_HOST = "trackerproxy.meo.ws"
diff --git a/worker.py b/worker.py
index bf9738c..40904bb 100644
--- a/worker.py
+++ b/worker.py
@@ -3,9 +3,9 @@
 import requests
 from time import sleep
 from os import mkdir, rmdir, listdir, system, environ
 from os.path import isdir, isfile, getsize
-from json import dumps, loads
+from json import loads

-from youtube_channel import main
+from youtube_channel import process_channel

 import signal

@@ -87,7 +87,6 @@ open("cookies.txt", "w").write("""# HTTP Cookie File
 del cookies

 validationtimes = 0
-shouldgetjob = True

 #Graceful Shutdown
 class GracefulKiller:
@@ -102,20 +101,8 @@ class GracefulKiller:

 gkiller = GracefulKiller()

-#REMOVED PANIC MECHANISM!
-"""
-enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
-if not enres:
-    print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
-    shouldgetjob = False
-    gkiller.kill_now = True #exit the script
-
-del enres
-"""
-
 #microtasks
 def threadrunner():
-    global shouldgetjob
     global validationtimes
     jobs = Queue()
     ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})

@@ -124,7 +111,6 @@ def threadrunner():
             task, vid, args = jobs.get()
             if task == "submitdiscovery":
                 tracker.add_item_to_tracker(args, vid)
-                #pass

             elif task == "discovery":
                 while True:
@@ -135,11 +121,9 @@
                         print(e)
                         print("Error in retrieving information, waiting 30 seconds and trying again")
                         sleep(30)
-                if info[0] or info[1]: # ccenabled or creditdata
+                if info[0]: # ccenabled
                     if not isdir("out/"+str(vid).strip()):
                         mkdir("out/"+str(vid).strip())
-                    if info[1]:
-                        open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))

                 if info[0]:
                     for langcode in langs:
@@ -153,17 +137,14 @@
                 jobs.put(("complete", None, "video:"+vid))

-                for videodisc in info[2]:
+                for videodisc in info[1]:
                     jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
-                for channeldisc in info[3]:
+                for channeldisc in info[2]:
                     jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
-                for mixdisc in info[4]:
+                for mixdisc in info[3]:
                     jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
-                for playldisc in info[5]:
+                for playldisc in info[4]:
                     jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
-
-                #jobs.put(("complete", None, "video:"+vid))
-                #pass

             elif task == "subtitles":
                 subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions)
@@ -178,7 +159,7 @@
                     jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))

                 #channel created playlists
-                y = main(desit.split(":", 1)[1])
+                y = process_channel(desit.split(":", 1)[1])
                 for itemyv in y["playlists"]:
                     jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist))
                 for itemyv in y["channels"]:
@@ -200,17 +181,14 @@
             elif task == "mixplaylist":
                 try:
                     wptext = mysession.get("https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1]).text
-                    #chanl = set()
+                    #channel handling not needed here because we will get it from the video
                     for line in wptext.splitlines():
                         if line.strip().startswith('window["ytInitialData"] = '):
                             initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])

                             for itemyvp in initdata["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]["contents"]:
                                 jobs.put(("submitdiscovery", itemyvp["playlistPanelVideoRenderer"]["videoId"], tracker.ItemType.Video))
-                                #chanl.add(itemyvp["playlistPanelVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
-
-                    #for itemn in chanl:
-                    #    jobs.put(("submitdiscovery", itemn, tracker.ItemType.Channel))
+
                     jobs.put(("complete", None, "mixplaylist:"+args))
                 except:
                     print("Mix Playlist error, ignoring but not marking as complete...", "https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1])
@@ -241,10 +219,17 @@
                 print("Waiting 5 minutes...")
                 sleep(300)

-                if targetloc.startswith("rsync"):
-                    system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
-                elif targetloc.startswith("http"):
-                    system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
+                while True:
+                    if targetloc.startswith("rsync"):
+                        exitinfo = system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
+                    elif targetloc.startswith("http"):
+                        exitinfo = system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
+
+                    if exitinfo == 0: # note that on Unix this isn't necessarily the exit code but it's still 0 upon successful exit
+                        break
+                    else:
+                        print("Error in sending data to target, waiting 30 seconds and trying again.")
+                        sleep(30)

                 size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip")

@@ -264,60 +249,44 @@
             # get a new task from tracker
             collect() #cleanup

-#Protection Mechanism Disarmed
-            """
-#check that the account has community contributions enabled every 50th item
-            validationtimes += 1
-            if not validationtimes % 50:
-                enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
-                if not enres:
-                    print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
-                    shouldgetjob = False
-                    gkiller.kill_now = True #exit the script
-                del enres
-            """
-
-            if shouldgetjob:
-                desit = tracker.request_item_from_tracker()
-                print("New task:", desit)
-
-                if desit:
-                    if desit.split(":", 1)[0] == "video":
-                        needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
-                            'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
-                            'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
-                            'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
-                            'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
-                            'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
-                            'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
-                            'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
-                            'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
-                            'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
-                            'xh': None, 'yi': None, 'yo': None, 'zu': None}
-                        needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
-                            'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
-                            'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
-                            'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
-                            'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
-                            'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
-                            'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
-                            'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
-                            'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
-                            'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
-                            'xh': None, 'yi': None, 'yo': None, 'zu': None}
-                        jobs.put(("discovery", desit.split(":", 1)[1], None))
-                    elif desit.split(":", 1)[0] == "channel":
-                        jobs.put(("channel", None, desit.split(":", 1)[1]))
-                    elif desit.split(":", 1)[0] == "playlist":
-                        jobs.put(("playlist", None, desit.split(":", 1)[1]))
-                    elif desit.split(":", 1)[0] == "mixplaylist":
-                        jobs.put(("mixplaylist", None, desit.split(":", 1)[1]))
-                    else:
-                        print("Ignoring item for now", desit)
+            desit = tracker.request_item_from_tracker()
+            print("New task:", desit)
+
+            if desit:
+                if desit.split(":", 1)[0] == "video":
+                    needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
+                        'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
+                        'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
+                        'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
+                        'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
+                        'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
+                        'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
+                        'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
+                        'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
+                        'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
+                        'xh': None, 'yi': None, 'yo': None, 'zu': None}
+                    needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
+                        'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
+                        'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
+                        'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
+                        'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
+                        'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
+                        'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
+                        'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
+                        'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
+                        'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
+                        'xh': None, 'yi': None, 'yo': None, 'zu': None}
+                    jobs.put(("discovery", desit.split(":", 1)[1], None))
+                elif desit.split(":", 1)[0] == "channel":
+                    jobs.put(("channel", None, desit.split(":", 1)[1]))
+                elif desit.split(":", 1)[0] == "playlist":
+                    jobs.put(("playlist", None, desit.split(":", 1)[1]))
+                elif desit.split(":", 1)[0] == "mixplaylist":
+                    jobs.put(("mixplaylist", None, desit.split(":", 1)[1]))
                 else:
                     print("Ignoring item for now", desit)
             else:
-                break
+                print("Ignoring item for now", desit)
         else:
             break
@@ -340,8 +309,4 @@
 for x in threads:
     threads.remove(x)
     del x
-if not shouldgetjob:
-    print("PROTECTION MECHANISM #3 WAS SOMEHOW TRIGERRED")
-    print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
-
-print("Exiting...")
+print("Exiting...")
\ No newline at end of file
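Side note on the upload retry loop added to worker.py above: it tests the raw return value of os.system(), which on Unix is a wait status rather than an exit code (it is still 0 on success, as the inline comment says). A minimal sketch of the same retry reading the real exit code, assuming subprocess is an acceptable stand-in for os.system() here; the helper name upload_with_retry is illustrative and not part of the patch:

import subprocess
from time import sleep

def upload_with_retry(cmd, wait=30):
    # cmd is an argument list, e.g. ["rsync", "-rltv", "directory/ITEM/", targetloc]
    while True:
        result = subprocess.run(cmd)
        if result.returncode == 0:  # actual exit code, unlike the raw os.system() status
            return
        print("Error in sending data to target, waiting", wait, "seconds and trying again.")
        sleep(wait)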
diff --git a/youtube_channel.py b/youtube_channel.py
index 57ffbbe..c5d2395 100644
--- a/youtube_channel.py
+++ b/youtube_channel.py
@@ -1,25 +1,39 @@
 from requests import session
-from youtube_util import getinitialdata, fullyexpand
-
-# TODO: Rate limit detection, HTTP3?
+from youtube_util import getinitialdata, fullyexpand, getapikey, getlver
+from time import sleep

 mysession = session()

 #extract latest version automatically
-try:
-    lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
-except:
-    lver = "2.20201002.02.01"
+homepage = mysession.get("https://www.youtube.com/").text
+
+API_KEY = getapikey(homepage)
+
+params = (
+    ('key', API_KEY),
+)

-#print(lver)
-mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"})
+API_VERSION = getlver(getinitialdata(homepage))

-def main(channelid: str):
+continuationheaders = {"x-youtube-client-name": "1", "x-youtube-client-version": API_VERSION, "Accept-Language": "en-US"}
+
+del homepage
+
+def process_channel(channelid: str):
     playlists = set()
     shelfres = set()
     channellist = set()

     # PLAYLISTS
-    initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/playlists").text)
+    data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EglwbGF5bGlzdHM%3D"}
+    while True:
+        initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data)
+        if initdata.status_code == 200:
+            initdata = initdata.json()
+            break
+        else:
+            print("Non-200 API status code, waiting 30 seconds before retrying...")
+            sleep(30)
+

     CHANNELS_ID = 0
     PLAYLISTS_ID = 0
@@ -42,7 +56,7 @@
             if "shelfRenderer" in itemint.keys():
                 shelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"])
             elif "gridRenderer" in itemint.keys():
-                playlistsint = fullyexpand(itemint["gridRenderer"])["items"]
+                playlistsint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"]

                 for playlist in playlistsint:
                     playlists.add(playlist["gridPlaylistRenderer"]["playlistId"])
@@ -50,8 +64,18 @@
                     channellist.add(playlist["gridPlaylistRenderer"]["shortBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])

     for item in shelfres:
-        shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
-        playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"]
+        while True:
+            shelfintp = mysession.get("https://www.youtube.com/"+str(item))
+            if not """
+Sorry for the interruption. We have been receiving a large volume of requests from your network.
+
+
+To continue with your YouTube experience, please fill out the form below.
+""" in shelfintp.text and shelfintp.status_code == 200:
+                break
+            else:
+                print("Non-200 status code, waiting 30 seconds before retrying...")
+                sleep(30)
+
+        shelfiteminitdata = getinitialdata(shelfintp.text)
+        playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]

         for playlist in playlistsint:
             playlists.add(playlist["gridPlaylistRenderer"]["playlistId"])
@@ -61,7 +85,16 @@
     # CHANNELS
     cshelfres = set()

-    initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/channels").text)
+    # PLAYLISTS
+    data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EghjaGFubmVscw%3D%3D"}
+    while True:
+        initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data)
+        if initdata.status_code == 200:
+            initdata = initdata.json()
+            break
+        else:
+            print("Non-200 API status code, waiting 30 seconds before retrying...")
+            sleep(30)

     shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"]

@@ -70,14 +103,24 @@
             if "shelfRenderer" in itemint.keys():
                 cshelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"])
             elif "gridRenderer" in itemint.keys():
-                chanlistint = fullyexpand(itemint["gridRenderer"])["items"]
+                chanlistint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"]

                 for channel in chanlistint:
                     channellist.add(channel["gridChannelRenderer"]["channelId"])

     for item in cshelfres:
-        shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
-        chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"]
+        while True:
+            shelfintc = mysession.get("https://www.youtube.com/"+str(item))
+            if not """
+Sorry for the interruption. We have been receiving a large volume of requests from your network.
+
+
+To continue with your YouTube experience, please fill out the form below.
+""" in shelfintc.text and shelfintc.status_code == 200:
+                break
+            else:
+                print("Non-200 status code, waiting 30 seconds before retrying...")
+                sleep(30)
+
+        shelfiteminitdata = getinitialdata(shelfintc.text)
+        chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]

         for channel in chanlistint:
             channellist.add(channel["gridChannelRenderer"]["channelId"])
@@ -89,7 +132,7 @@ if __name__ == "__main__":
     chanl = argv
     chanl.pop(0)
     for channel in chanl:
-        print(main(channel))
+        print(process_channel(channel))

 # SAMPLES:
 # UCqj7Cz7revf5maW9g5pgNcg lots of playlists
@@ -103,4 +146,4 @@

 # UCJOh5FKisc0hUlEeWFBlD-w no subscriptions, plenty of featured channels

-# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels
+# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels
\ No newline at end of file
diff --git a/youtube_util.py b/youtube_util.py
index 54fe2cf..81d466a 100644
--- a/youtube_util.py
+++ b/youtube_util.py
@@ -1,6 +1,8 @@
-from requests import session
 from json import loads
 from urllib.parse import unquote
+from time import sleep
+
+import requests

 def getinitialdata(html: str):
     for line in html.splitlines():
@@ -8,19 +10,27 @@ def getinitialdata(html: str):
             return loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
     return {}

-mysession = session()
+def getapikey(html: str):
+    return html.split('"INNERTUBE_API_KEY":"', 1)[-1].split('"', 1)[0]
+
 #extract latest version automatically
-try:
-    lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
-except:
-    lver = "2.20201002.02.01"
-mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"})
+def getlver(initialdata: dict):
+    try:
+        return initialdata["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
+    except:
+        return "2.20201016.02.00"

-def fullyexpand(inputdict: dict):
+def fullyexpand(inputdict: dict, mysession: requests.session, continuationheaders: dict):
     lastrequestj = inputdict
     while "continuations" in lastrequestj.keys():
-        lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]))
+        while True:
+            lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]), headers=continuationheaders)
+            if lastrequest.status_code == 200:
+                break
+            else:
+                print("Non-200 API status code, waiting 30 seconds before retrying...")
+                sleep(30)
         lastrequestj = lastrequest.json()[1]["response"]["continuationContents"]["gridContinuation"]
         inputdict["items"].extend(lastrequestj["items"])
-    return inputdict
+    return inputdict
\ No newline at end of file