From 2db5e71e876028da011e2d9dfa1cf15f7bf5a98a Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Tue, 13 Oct 2020 21:11:53 -0400 Subject: [PATCH 01/11] Disable credit retrieval --- discovery.py | 25 +------------------------ tracker.py | 2 +- worker.py | 12 +++++------- 3 files changed, 7 insertions(+), 32 deletions(-) diff --git a/discovery.py b/discovery.py index c6ad4a5..bf3c24c 100644 --- a/discovery.py +++ b/discovery.py @@ -123,34 +123,11 @@ def getmetadata(mysession, vid, ccenabledonly=False): except BaseException as e: print(e) print("Exception in discovery, continuing anyway") - - creditdata = {} - - if not ccenabledonly: - try: - mdinfo = initdata["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"] - for item in mdinfo: - if item["metadataRowRenderer"]["title"]["simpleText"].startswith("Caption author"): #the request to /watch needs to be in English for this to work - try: - desl = langcodes[item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]] - except KeyError as e: - #print(e) - print("Language code conversion error, using language name") - desl = item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1] - creditdata[desl] = [] - for itemint in item["metadataRowRenderer"]["contents"]: - creditdata[desl].append({"name": itemint["runs"][0]["text"], "channel": itemint["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]}) - - except KeyError as e: - #print("Video does not have credits") - pass - #raise - #print(e) if initplay and (initdata or ccenabledonly): break - return ccenabled, creditdata, recvids, recchans, recmixes, recplayl + return ccenabled, recvids, recchans, recmixes, recplayl if __name__ == "__main__": from sys import argv diff --git a/tracker.py b/tracker.py index f3782bf..63bd35a 100644 --- a/tracker.py +++ b/tracker.py @@ -9,7 +9,7 @@ from os.path import isfile from json import loads # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py -VERSION = "20201002.01" +VERSION = "20201014.01" TRACKER_ID = "ext-yt-communitycontribs" TRACKER_HOST = "trackerproxy.meo.ws" diff --git a/worker.py b/worker.py index bf9738c..a614c05 100644 --- a/worker.py +++ b/worker.py @@ -135,11 +135,9 @@ def threadrunner(): print(e) print("Error in retrieving information, waiting 30 seconds and trying again") sleep(30) - if info[0] or info[1]: # ccenabled or creditdata + if info[0]: # ccenabled or creditdata if not isdir("out/"+str(vid).strip()): mkdir("out/"+str(vid).strip()) - if info[1]: - open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1])) if info[0]: for langcode in langs: @@ -153,13 +151,13 @@ def threadrunner(): jobs.put(("complete", None, "video:"+vid)) - for videodisc in info[2]: + for videodisc in info[1]: jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video)) - for channeldisc in info[3]: + for channeldisc in info[2]: jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel)) - for mixdisc in info[4]: + for mixdisc in info[3]: jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist)) - for playldisc in info[5]: + for playldisc in info[4]: jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist)) #jobs.put(("complete", None, "video:"+vid)) From 76af3fd211064e99da3568cd1deaa98d6ee846b1 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Tue, 13 Oct 2020 21:22:29 -0400 Subject: [PATCH 02/11] Fix comment --- worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker.py b/worker.py index a614c05..7b59b3b 100644 --- a/worker.py +++ b/worker.py @@ -135,7 +135,7 @@ def threadrunner(): print(e) print("Error in retrieving information, waiting 30 seconds and trying again") sleep(30) - if info[0]: # ccenabled or creditdata + if info[0]: # ccenabled if not isdir("out/"+str(vid).strip()): mkdir("out/"+str(vid).strip()) From f7b92768dbeb68f42e5753436354f2e310a676eb Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sat, 17 Oct 2020 00:25:27 -0400 Subject: [PATCH 03/11] Cleanup --- worker.py | 118 ++++++++++++++++++------------------------------------ 1 file changed, 39 insertions(+), 79 deletions(-) diff --git a/worker.py b/worker.py index 7b59b3b..4685541 100644 --- a/worker.py +++ b/worker.py @@ -3,7 +3,7 @@ import requests from time import sleep from os import mkdir, rmdir, listdir, system, environ from os.path import isdir, isfile, getsize -from json import dumps, loads +from json import loads from youtube_channel import main @@ -87,7 +87,6 @@ open("cookies.txt", "w").write("""# HTTP Cookie File del cookies validationtimes = 0 -shouldgetjob = True #Graceful Shutdown class GracefulKiller: @@ -102,20 +101,8 @@ class GracefulKiller: gkiller = GracefulKiller() -#REMOVED PANIC MECHANISM! -""" -enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0] -if not enres: - print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.") - shouldgetjob = False - gkiller.kill_now = True #exit the script - -del enres -""" - #microtasks def threadrunner(): - global shouldgetjob global validationtimes jobs = Queue() ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False}) @@ -124,7 +111,6 @@ def threadrunner(): task, vid, args = jobs.get() if task == "submitdiscovery": tracker.add_item_to_tracker(args, vid) - #pass elif task == "discovery": while True: @@ -159,9 +145,6 @@ def threadrunner(): jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist)) for playldisc in info[4]: jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist)) - - #jobs.put(("complete", None, "video:"+vid)) - #pass elif task == "subtitles": subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions) @@ -198,17 +181,14 @@ def threadrunner(): elif task == "mixplaylist": try: wptext = mysession.get("https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1]).text - #chanl = set() + #channel handling not needed here because we will get it from the video for line in wptext.splitlines(): if line.strip().startswith('window["ytInitialData"] = '): initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1]) for itemyvp in initdata["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]["contents"]: jobs.put(("submitdiscovery", itemyvp["playlistPanelVideoRenderer"]["videoId"], tracker.ItemType.Video)) - #chanl.add(itemyvp["playlistPanelVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]) - - #for itemn in chanl: - # jobs.put(("submitdiscovery", itemn, tracker.ItemType.Channel)) + jobs.put(("complete", None, "mixplaylist:"+args)) except: print("Mix Playlist error, ignoring but not marking as complete...", "https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1]) @@ -262,60 +242,44 @@ def threadrunner(): # get a new task from tracker collect() #cleanup - #Protection Mechanism Disarmed - """ - #check that the account has community contributions enabled every 50th item - validationtimes += 1 - if not validationtimes % 50: - enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0] - if not enres: - print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.") - shouldgetjob = False - gkiller.kill_now = True #exit the script - del enres - """ - - if shouldgetjob: - desit = tracker.request_item_from_tracker() - print("New task:", desit) - - if desit: - if desit.split(":", 1)[0] == "video": - needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None, - 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None, - 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None, - 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None, - 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None, - 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None, - 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None, - 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None, - 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None, - 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None, - 'xh': None, 'yi': None, 'yo': None, 'zu': None} - needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None, - 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None, - 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None, - 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None, - 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None, - 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None, - 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None, - 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None, - 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None, - 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None, - 'xh': None, 'yi': None, 'yo': None, 'zu': None} - jobs.put(("discovery", desit.split(":", 1)[1], None)) - elif desit.split(":", 1)[0] == "channel": - jobs.put(("channel", None, desit.split(":", 1)[1])) - elif desit.split(":", 1)[0] == "playlist": - jobs.put(("playlist", None, desit.split(":", 1)[1])) - elif desit.split(":", 1)[0] == "mixplaylist": - jobs.put(("mixplaylist", None, desit.split(":", 1)[1])) - else: - print("Ignoring item for now", desit) + desit = tracker.request_item_from_tracker() + print("New task:", desit) + + if desit: + if desit.split(":", 1)[0] == "video": + needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None, + 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None, + 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None, + 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None, + 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None, + 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None, + 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None, + 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None, + 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None, + 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None, + 'xh': None, 'yi': None, 'yo': None, 'zu': None} + needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None, + 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None, + 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None, + 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None, + 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None, + 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None, + 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None, + 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None, + 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None, + 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None, + 'xh': None, 'yi': None, 'yo': None, 'zu': None} + jobs.put(("discovery", desit.split(":", 1)[1], None)) + elif desit.split(":", 1)[0] == "channel": + jobs.put(("channel", None, desit.split(":", 1)[1])) + elif desit.split(":", 1)[0] == "playlist": + jobs.put(("playlist", None, desit.split(":", 1)[1])) + elif desit.split(":", 1)[0] == "mixplaylist": + jobs.put(("mixplaylist", None, desit.split(":", 1)[1])) else: print("Ignoring item for now", desit) else: - break + print("Ignoring item for now", desit) else: break @@ -338,8 +302,4 @@ for x in threads: threads.remove(x) del x -if not shouldgetjob: - print("PROTECTION MECHANISM #3 WAS SOMEHOW TRIGERRED") - print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.") - -print("Exiting...") +print("Exiting...") \ No newline at end of file From 80d125ace717a238e193f94778905ca6afdd1a44 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sat, 17 Oct 2020 00:29:57 -0400 Subject: [PATCH 04/11] Detect failures when transferring to target --- worker.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/worker.py b/worker.py index 4685541..bdf4372 100644 --- a/worker.py +++ b/worker.py @@ -219,10 +219,17 @@ def threadrunner(): print("Waiting 5 minutes...") sleep(300) - if targetloc.startswith("rsync"): - system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc) - elif targetloc.startswith("http"): - system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc) + while True: + if targetloc.startswith("rsync"): + exitinfo = system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc) + elif targetloc.startswith("http"): + exitinfo = system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc) + + if exitinfo == 0: # note that on Unix this isn't necessarily the exit code but it's still 0 upon successful exit + break + else: + print("Error in sending data to target, waiting 30 seconds and trying again.") + sleep(30) size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip") From 3e43461c4376b03247d03e5023958d219a9c3b65 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sat, 17 Oct 2020 00:34:04 -0400 Subject: [PATCH 05/11] Update youtube-util --- worker.py | 4 ++-- youtube_channel.py | 44 ++++++++++++++++++++++++++------------------ youtube_util.py | 23 +++++++++++++---------- 3 files changed, 41 insertions(+), 30 deletions(-) diff --git a/worker.py b/worker.py index bdf4372..40904bb 100644 --- a/worker.py +++ b/worker.py @@ -5,7 +5,7 @@ from os import mkdir, rmdir, listdir, system, environ from os.path import isdir, isfile, getsize from json import loads -from youtube_channel import main +from youtube_channel import process_channel import signal @@ -159,7 +159,7 @@ def threadrunner(): jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video)) #channel created playlists - y = main(desit.split(":", 1)[1]) + y = process_channel(desit.split(":", 1)[1]) for itemyv in y["playlists"]: jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist)) for itemyv in y["channels"]: diff --git a/youtube_channel.py b/youtube_channel.py index 57ffbbe..37db418 100644 --- a/youtube_channel.py +++ b/youtube_channel.py @@ -1,25 +1,31 @@ from requests import session -from youtube_util import getinitialdata, fullyexpand - -# TODO: Rate limit detection, HTTP3? +from youtube_util import getinitialdata, fullyexpand, getapikey, getlver mysession = session() #extract latest version automatically -try: - lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"] -except: - lver = "2.20201002.02.01" +homepage = mysession.get("https://www.youtube.com/").text + +API_KEY = getapikey(homepage) + +params = ( + ('key', API_KEY), +) -#print(lver) -mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"}) +API_VERSION = getlver(getinitialdata(homepage)) -def main(channelid: str): +continuationheaders = {"x-youtube-client-name": "1", "x-youtube-client-version": API_VERSION, "Accept-Language": "en-US"} + +del homepage + +def process_channel(channelid: str): playlists = set() shelfres = set() channellist = set() # PLAYLISTS - initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/playlists").text) + data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EglwbGF5bGlzdHM%3D"} + initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json() + CHANNELS_ID = 0 PLAYLISTS_ID = 0 @@ -42,7 +48,7 @@ def main(channelid: str): if "shelfRenderer" in itemint.keys(): shelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"]) elif "gridRenderer" in itemint.keys(): - playlistsint = fullyexpand(itemint["gridRenderer"])["items"] + playlistsint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"] for playlist in playlistsint: playlists.add(playlist["gridPlaylistRenderer"]["playlistId"]) @@ -51,7 +57,7 @@ def main(channelid: str): for item in shelfres: shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text) - playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"] + playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"] for playlist in playlistsint: playlists.add(playlist["gridPlaylistRenderer"]["playlistId"]) @@ -61,7 +67,9 @@ def main(channelid: str): # CHANNELS cshelfres = set() - initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/channels").text) + # PLAYLISTS + data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EghjaGFubmVscw%3D%3D"} + initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json() shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"] @@ -70,14 +78,14 @@ def main(channelid: str): if "shelfRenderer" in itemint.keys(): cshelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"]) elif "gridRenderer" in itemint.keys(): - chanlistint = fullyexpand(itemint["gridRenderer"])["items"] + chanlistint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"] for channel in chanlistint: channellist.add(channel["gridChannelRenderer"]["channelId"]) for item in cshelfres: shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text) - chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"] + chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"] for channel in chanlistint: channellist.add(channel["gridChannelRenderer"]["channelId"]) @@ -89,7 +97,7 @@ if __name__ == "__main__": chanl = argv chanl.pop(0) for channel in chanl: - print(main(channel)) + print(process_channel(channel)) # SAMPLES: # UCqj7Cz7revf5maW9g5pgNcg lots of playlists @@ -103,4 +111,4 @@ if __name__ == "__main__": # UCJOh5FKisc0hUlEeWFBlD-w no subscriptions, plenty of featured channels -# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels +# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels \ No newline at end of file diff --git a/youtube_util.py b/youtube_util.py index 54fe2cf..4a73996 100644 --- a/youtube_util.py +++ b/youtube_util.py @@ -1,26 +1,29 @@ -from requests import session from json import loads from urllib.parse import unquote +import requests + def getinitialdata(html: str): for line in html.splitlines(): if line.strip().startswith('window["ytInitialData"] = '): return loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1]) return {} -mysession = session() +def getapikey(html: str): + return html.split('"INNERTUBE_API_KEY":"', 1)[-1].split('"', 1)[0] + #extract latest version automatically -try: - lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"] -except: - lver = "2.20201002.02.01" -mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"}) +def getlver(initialdata: dict): + try: + return initialdata["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"] + except: + return "2.20201002.02.01" -def fullyexpand(inputdict: dict): +def fullyexpand(inputdict: dict, mysession: requests.session, continuationheaders: dict): lastrequestj = inputdict while "continuations" in lastrequestj.keys(): - lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"])) + lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]), headers=continuationheaders) lastrequestj = lastrequest.json()[1]["response"]["continuationContents"]["gridContinuation"] inputdict["items"].extend(lastrequestj["items"]) - return inputdict + return inputdict \ No newline at end of file From 6ce4345d10152bf3d5fefdf557145a57968d1a40 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sat, 17 Oct 2020 00:53:25 -0400 Subject: [PATCH 06/11] Basic rate limit detection for youtube-util --- youtube_channel.py | 23 +++++++++++++++++++---- youtube_util.py | 9 ++++++++- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/youtube_channel.py b/youtube_channel.py index 37db418..2a4b133 100644 --- a/youtube_channel.py +++ b/youtube_channel.py @@ -1,5 +1,6 @@ from requests import session from youtube_util import getinitialdata, fullyexpand, getapikey, getlver +from time import sleep mysession = session() #extract latest version automatically @@ -24,7 +25,14 @@ def process_channel(channelid: str): # PLAYLISTS data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EglwbGF5bGlzdHM%3D"} - initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json() + while True: + initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data) + if initdata.status_code == 200: + initdata = initdata.json() + break + else: + print("Non-200 API status code, waiting 30 seconds before retrying...") + sleep(30) CHANNELS_ID = 0 @@ -67,10 +75,17 @@ def process_channel(channelid: str): # CHANNELS cshelfres = set() - # PLAYLISTS + # PLAYLISTS data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EghjaGFubmVscw%3D%3D"} - initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json() - + while True: + initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json() + if initdata.status_code == 200: + initdata = initdata.json() + break + else: + print("Non-200 API status code, waiting 30 seconds before retrying...") + sleep(30) + shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"] for item in shelflist: diff --git a/youtube_util.py b/youtube_util.py index 4a73996..8bf506e 100644 --- a/youtube_util.py +++ b/youtube_util.py @@ -1,5 +1,6 @@ from json import loads from urllib.parse import unquote +from time import sleep import requests @@ -22,7 +23,13 @@ def getlver(initialdata: dict): def fullyexpand(inputdict: dict, mysession: requests.session, continuationheaders: dict): lastrequestj = inputdict while "continuations" in lastrequestj.keys(): - lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]), headers=continuationheaders) + while True: + lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]), headers=continuationheaders) + if lastrequest.status_code == 200: + break + else: + print("Non-200 API status code, waiting 30 seconds before retrying...") + sleep(30) lastrequestj = lastrequest.json()[1]["response"]["continuationContents"]["gridContinuation"] inputdict["items"].extend(lastrequestj["items"]) From 82b08c62c4a46dc5e6fef0bb60b2d3b7ace42ce0 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sat, 17 Oct 2020 00:54:49 -0400 Subject: [PATCH 07/11] Update default API version --- youtube_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_util.py b/youtube_util.py index 8bf506e..81d466a 100644 --- a/youtube_util.py +++ b/youtube_util.py @@ -18,7 +18,7 @@ def getlver(initialdata: dict): try: return initialdata["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"] except: - return "2.20201002.02.01" + return "2.20201016.02.00" def fullyexpand(inputdict: dict, mysession: requests.session, continuationheaders: dict): lastrequestj = inputdict From 7a70bc25b59624be82f354d1586091de648ddfe6 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sat, 17 Oct 2020 01:52:16 -0400 Subject: [PATCH 08/11] youtube-util: validation for shelf items --- youtube_channel.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/youtube_channel.py b/youtube_channel.py index 2a4b133..93f6750 100644 --- a/youtube_channel.py +++ b/youtube_channel.py @@ -64,7 +64,17 @@ def process_channel(channelid: str): channellist.add(playlist["gridPlaylistRenderer"]["shortBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]) for item in shelfres: - shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text) + while True: + shelfintp = mysession.get("https://www.youtube.com/"+str(item)) + if not """

Sorry for the interruption. We have been receiving a large volume of requests from your network.

+ +

To continue with your YouTube experience, please fill out the form below.

""" in shelfintp.text and not shelfintp.status_code == 200: + break + else: + print("Non-200 status code, waiting 30 seconds before retrying...") + sleep(30) + + shelfiteminitdata = getinitialdata(shelfintp.text) playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"] for playlist in playlistsint: @@ -85,7 +95,7 @@ def process_channel(channelid: str): else: print("Non-200 API status code, waiting 30 seconds before retrying...") sleep(30) - + shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"] for item in shelflist: @@ -99,7 +109,17 @@ def process_channel(channelid: str): channellist.add(channel["gridChannelRenderer"]["channelId"]) for item in cshelfres: - shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text) + while True: + shelfintc = mysession.get("https://www.youtube.com/"+str(item)) + if not """

Sorry for the interruption. We have been receiving a large volume of requests from your network.

+ +

To continue with your YouTube experience, please fill out the form below.

""" in shelfintc.text and not shelfintc.status_code == 200: + break + else: + print("Non-200 status code, waiting 30 seconds before retrying...") + sleep(30) + + shelfiteminitdata = getinitialdata(shelfintc.text) chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"] for channel in chanlistint: From a6f6567cb036b931523455028d8fae6b8269e8cb Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sat, 17 Oct 2020 01:52:51 -0400 Subject: [PATCH 09/11] Update version --- tracker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tracker.py b/tracker.py index 63bd35a..a78ec20 100644 --- a/tracker.py +++ b/tracker.py @@ -9,7 +9,7 @@ from os.path import isfile from json import loads # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py -VERSION = "20201014.01" +VERSION = "20201017.01" TRACKER_ID = "ext-yt-communitycontribs" TRACKER_HOST = "trackerproxy.meo.ws" From 19f1d7fdc83272d6a0227cc72caea57eae832286 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sat, 17 Oct 2020 01:54:03 -0400 Subject: [PATCH 10/11] youtube-util: bug fix --- youtube_channel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_channel.py b/youtube_channel.py index 93f6750..842ea91 100644 --- a/youtube_channel.py +++ b/youtube_channel.py @@ -88,7 +88,7 @@ def process_channel(channelid: str): # PLAYLISTS data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EghjaGFubmVscw%3D%3D"} while True: - initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json() + initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data) if initdata.status_code == 200: initdata = initdata.json() break From 5b9784ddbf7c9256f19617f306c31567ddc72bc7 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sat, 17 Oct 2020 01:57:06 -0400 Subject: [PATCH 11/11] youtube-util: Bug fix --- youtube_channel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_channel.py b/youtube_channel.py index 842ea91..c5d2395 100644 --- a/youtube_channel.py +++ b/youtube_channel.py @@ -68,7 +68,7 @@ def process_channel(channelid: str): shelfintp = mysession.get("https://www.youtube.com/"+str(item)) if not """

Sorry for the interruption. We have been receiving a large volume of requests from your network.

-

To continue with your YouTube experience, please fill out the form below.

""" in shelfintp.text and not shelfintp.status_code == 200: +

To continue with your YouTube experience, please fill out the form below.

""" in shelfintp.text and shelfintp.status_code == 200: break else: print("Non-200 status code, waiting 30 seconds before retrying...") @@ -113,7 +113,7 @@ def process_channel(channelid: str): shelfintc = mysession.get("https://www.youtube.com/"+str(item)) if not """

Sorry for the interruption. We have been receiving a large volume of requests from your network.

-

To continue with your YouTube experience, please fill out the form below.

""" in shelfintc.text and not shelfintc.status_code == 200: +

To continue with your YouTube experience, please fill out the form below.

""" in shelfintc.text and shelfintc.status_code == 200: break else: print("Non-200 status code, waiting 30 seconds before retrying...")