Browse Source

Merge pull request #11 from Data-Horde/disable-credits

20201017.1
master
tech234a 3 years ago
committed by GitHub
parent
commit
ca9acb37ca
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 142 additions and 147 deletions
  1. +1
    -24
      discovery.py
  2. +1
    -1
      tracker.py
  3. +57
    -92
      worker.py
  4. +63
    -20
      youtube_channel.py
  5. +20
    -10
      youtube_util.py

+ 1
- 24
discovery.py View File

@@ -123,34 +123,11 @@ def getmetadata(mysession, vid, ccenabledonly=False):
except BaseException as e: except BaseException as e:
print(e) print(e)
print("Exception in discovery, continuing anyway") print("Exception in discovery, continuing anyway")
creditdata = {}
if not ccenabledonly:
try:
mdinfo = initdata["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
for item in mdinfo:
if item["metadataRowRenderer"]["title"]["simpleText"].startswith("Caption author"): #the request to /watch needs to be in English for this to work
try:
desl = langcodes[item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]]
except KeyError as e:
#print(e)
print("Language code conversion error, using language name")
desl = item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]
creditdata[desl] = []
for itemint in item["metadataRowRenderer"]["contents"]:
creditdata[desl].append({"name": itemint["runs"][0]["text"], "channel": itemint["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]})
except KeyError as e:
#print("Video does not have credits")
pass
#raise
#print(e)
if initplay and (initdata or ccenabledonly): if initplay and (initdata or ccenabledonly):
break break
return ccenabled, creditdata, recvids, recchans, recmixes, recplayl
return ccenabled, recvids, recchans, recmixes, recplayl
if __name__ == "__main__": if __name__ == "__main__":
from sys import argv from sys import argv


+ 1
- 1
tracker.py View File

@@ -9,7 +9,7 @@ from os.path import isfile
from json import loads from json import loads


# https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
VERSION = "20201002.01"
VERSION = "20201017.01"


TRACKER_ID = "ext-yt-communitycontribs" TRACKER_ID = "ext-yt-communitycontribs"
TRACKER_HOST = "trackerproxy.meo.ws" TRACKER_HOST = "trackerproxy.meo.ws"


+ 57
- 92
worker.py View File

@@ -3,9 +3,9 @@ import requests
from time import sleep from time import sleep
from os import mkdir, rmdir, listdir, system, environ from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize from os.path import isdir, isfile, getsize
from json import dumps, loads
from json import loads


from youtube_channel import main
from youtube_channel import process_channel


import signal import signal


@@ -87,7 +87,6 @@ open("cookies.txt", "w").write("""# HTTP Cookie File
del cookies del cookies


validationtimes = 0 validationtimes = 0
shouldgetjob = True


#Graceful Shutdown #Graceful Shutdown
class GracefulKiller: class GracefulKiller:
@@ -102,20 +101,8 @@ class GracefulKiller:


gkiller = GracefulKiller() gkiller = GracefulKiller()


#REMOVED PANIC MECHANISM!
"""
enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
if not enres:
print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
shouldgetjob = False
gkiller.kill_now = True #exit the script

del enres
"""

#microtasks #microtasks
def threadrunner(): def threadrunner():
global shouldgetjob
global validationtimes global validationtimes
jobs = Queue() jobs = Queue()
ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False}) ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
@@ -124,7 +111,6 @@ def threadrunner():
task, vid, args = jobs.get() task, vid, args = jobs.get()
if task == "submitdiscovery": if task == "submitdiscovery":
tracker.add_item_to_tracker(args, vid) tracker.add_item_to_tracker(args, vid)
#pass
elif task == "discovery": elif task == "discovery":
while True: while True:
@@ -135,11 +121,9 @@ def threadrunner():
print(e) print(e)
print("Error in retrieving information, waiting 30 seconds and trying again") print("Error in retrieving information, waiting 30 seconds and trying again")
sleep(30) sleep(30)
if info[0] or info[1]: # ccenabled or creditdata
if info[0]: # ccenabled
if not isdir("out/"+str(vid).strip()): if not isdir("out/"+str(vid).strip()):
mkdir("out/"+str(vid).strip()) mkdir("out/"+str(vid).strip())
if info[1]:
open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))


if info[0]: if info[0]:
for langcode in langs: for langcode in langs:
@@ -153,17 +137,14 @@ def threadrunner():


jobs.put(("complete", None, "video:"+vid)) jobs.put(("complete", None, "video:"+vid))


for videodisc in info[2]:
for videodisc in info[1]:
jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video)) jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
for channeldisc in info[3]:
for channeldisc in info[2]:
jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel)) jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
for mixdisc in info[4]:
for mixdisc in info[3]:
jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist)) jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
for playldisc in info[5]:
for playldisc in info[4]:
jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist)) jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
#jobs.put(("complete", None, "video:"+vid))
#pass


elif task == "subtitles": elif task == "subtitles":
subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions) subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions)
@@ -178,7 +159,7 @@ def threadrunner():
jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video)) jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))


#channel created playlists #channel created playlists
y = main(desit.split(":", 1)[1])
y = process_channel(desit.split(":", 1)[1])
for itemyv in y["playlists"]: for itemyv in y["playlists"]:
jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist)) jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist))
for itemyv in y["channels"]: for itemyv in y["channels"]:
@@ -200,17 +181,14 @@ def threadrunner():
elif task == "mixplaylist": elif task == "mixplaylist":
try: try:
wptext = mysession.get("https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1]).text wptext = mysession.get("https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1]).text
#chanl = set()
#channel handling not needed here because we will get it from the video #channel handling not needed here because we will get it from the video
for line in wptext.splitlines(): for line in wptext.splitlines():
if line.strip().startswith('window["ytInitialData"] = '): if line.strip().startswith('window["ytInitialData"] = '):
initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1]) initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
for itemyvp in initdata["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]["contents"]: for itemyvp in initdata["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]["contents"]:
jobs.put(("submitdiscovery", itemyvp["playlistPanelVideoRenderer"]["videoId"], tracker.ItemType.Video)) jobs.put(("submitdiscovery", itemyvp["playlistPanelVideoRenderer"]["videoId"], tracker.ItemType.Video))
#chanl.add(itemyvp["playlistPanelVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
#for itemn in chanl:
# jobs.put(("submitdiscovery", itemn, tracker.ItemType.Channel))

jobs.put(("complete", None, "mixplaylist:"+args)) jobs.put(("complete", None, "mixplaylist:"+args))
except: except:
print("Mix Playlist error, ignoring but not marking as complete...", "https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1]) print("Mix Playlist error, ignoring but not marking as complete...", "https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1])
@@ -241,10 +219,17 @@ def threadrunner():
print("Waiting 5 minutes...") print("Waiting 5 minutes...")
sleep(300) sleep(300)


if targetloc.startswith("rsync"):
system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
elif targetloc.startswith("http"):
system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
while True:
if targetloc.startswith("rsync"):
exitinfo = system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
elif targetloc.startswith("http"):
exitinfo = system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)

if exitinfo == 0: # note that on Unix this isn't necessarily the exit code but it's still 0 upon successful exit
break
else:
print("Error in sending data to target, waiting 30 seconds and trying again.")
sleep(30)




size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip") size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip")
@@ -264,60 +249,44 @@ def threadrunner():
# get a new task from tracker # get a new task from tracker
collect() #cleanup collect() #cleanup


#Protection Mechanism Disarmed
"""
#check that the account has community contributions enabled every 50th item
validationtimes += 1
if not validationtimes % 50:
enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
if not enres:
print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
shouldgetjob = False
gkiller.kill_now = True #exit the script
del enres
"""

if shouldgetjob:
desit = tracker.request_item_from_tracker()
print("New task:", desit)

if desit:
if desit.split(":", 1)[0] == "video":
needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
jobs.put(("discovery", desit.split(":", 1)[1], None))
elif desit.split(":", 1)[0] == "channel":
jobs.put(("channel", None, desit.split(":", 1)[1]))
elif desit.split(":", 1)[0] == "playlist":
jobs.put(("playlist", None, desit.split(":", 1)[1]))
elif desit.split(":", 1)[0] == "mixplaylist":
jobs.put(("mixplaylist", None, desit.split(":", 1)[1]))
else:
print("Ignoring item for now", desit)
desit = tracker.request_item_from_tracker()
print("New task:", desit)

if desit:
if desit.split(":", 1)[0] == "video":
needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
jobs.put(("discovery", desit.split(":", 1)[1], None))
elif desit.split(":", 1)[0] == "channel":
jobs.put(("channel", None, desit.split(":", 1)[1]))
elif desit.split(":", 1)[0] == "playlist":
jobs.put(("playlist", None, desit.split(":", 1)[1]))
elif desit.split(":", 1)[0] == "mixplaylist":
jobs.put(("mixplaylist", None, desit.split(":", 1)[1]))
else: else:
print("Ignoring item for now", desit) print("Ignoring item for now", desit)
else: else:
break
print("Ignoring item for now", desit)
else: else:
break break
@@ -340,8 +309,4 @@ for x in threads:
threads.remove(x) threads.remove(x)
del x del x


if not shouldgetjob:
print("PROTECTION MECHANISM #3 WAS SOMEHOW TRIGERRED")
print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")

print("Exiting...")
print("Exiting...")

+ 63
- 20
youtube_channel.py View File

@@ -1,25 +1,39 @@
from requests import session from requests import session
from youtube_util import getinitialdata, fullyexpand

# TODO: Rate limit detection, HTTP3?
from youtube_util import getinitialdata, fullyexpand, getapikey, getlver
from time import sleep


mysession = session() mysession = session()
#extract latest version automatically #extract latest version automatically
try:
lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
except:
lver = "2.20201002.02.01"
homepage = mysession.get("https://www.youtube.com/").text

API_KEY = getapikey(homepage)

params = (
('key', API_KEY),
)


#print(lver)
mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"})
API_VERSION = getlver(getinitialdata(homepage))


def main(channelid: str):
continuationheaders = {"x-youtube-client-name": "1", "x-youtube-client-version": API_VERSION, "Accept-Language": "en-US"}

del homepage

def process_channel(channelid: str):
playlists = set() playlists = set()
shelfres = set() shelfres = set()
channellist = set() channellist = set()


# PLAYLISTS # PLAYLISTS
initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/playlists").text)
data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EglwbGF5bGlzdHM%3D"}
while True:
initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data)
if initdata.status_code == 200:
initdata = initdata.json()
break
else:
print("Non-200 API status code, waiting 30 seconds before retrying...")
sleep(30)



CHANNELS_ID = 0 CHANNELS_ID = 0
PLAYLISTS_ID = 0 PLAYLISTS_ID = 0
@@ -42,7 +56,7 @@ def main(channelid: str):
if "shelfRenderer" in itemint.keys(): if "shelfRenderer" in itemint.keys():
shelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"]) shelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"])
elif "gridRenderer" in itemint.keys(): elif "gridRenderer" in itemint.keys():
playlistsint = fullyexpand(itemint["gridRenderer"])["items"]
playlistsint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"]


for playlist in playlistsint: for playlist in playlistsint:
playlists.add(playlist["gridPlaylistRenderer"]["playlistId"]) playlists.add(playlist["gridPlaylistRenderer"]["playlistId"])
@@ -50,8 +64,18 @@ def main(channelid: str):
channellist.add(playlist["gridPlaylistRenderer"]["shortBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]) channellist.add(playlist["gridPlaylistRenderer"]["shortBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])


for item in shelfres: for item in shelfres:
shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"]
while True:
shelfintp = mysession.get("https://www.youtube.com/"+str(item))
if not """</div><div id="content" class=" content-alignment" role="main"><p class='largeText'>Sorry for the interruption. We have been receiving a large volume of requests from your network.</p>

<p>To continue with your YouTube experience, please fill out the form below.</p>""" in shelfintp.text and shelfintp.status_code == 200:
break
else:
print("Non-200 status code, waiting 30 seconds before retrying...")
sleep(30)

shelfiteminitdata = getinitialdata(shelfintp.text)
playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]


for playlist in playlistsint: for playlist in playlistsint:
playlists.add(playlist["gridPlaylistRenderer"]["playlistId"]) playlists.add(playlist["gridPlaylistRenderer"]["playlistId"])
@@ -61,7 +85,16 @@ def main(channelid: str):
# CHANNELS # CHANNELS
cshelfres = set() cshelfres = set()


initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/channels").text)
# CHANNELS
data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EghjaGFubmVscw%3D%3D"}
while True:
initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data)
if initdata.status_code == 200:
initdata = initdata.json()
break
else:
print("Non-200 API status code, waiting 30 seconds before retrying...")
sleep(30)


shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"] shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"]


@@ -70,14 +103,24 @@ def main(channelid: str):
if "shelfRenderer" in itemint.keys(): if "shelfRenderer" in itemint.keys():
cshelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"]) cshelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"])
elif "gridRenderer" in itemint.keys(): elif "gridRenderer" in itemint.keys():
chanlistint = fullyexpand(itemint["gridRenderer"])["items"]
chanlistint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"]


for channel in chanlistint: for channel in chanlistint:
channellist.add(channel["gridChannelRenderer"]["channelId"]) channellist.add(channel["gridChannelRenderer"]["channelId"])


for item in cshelfres: for item in cshelfres:
shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"]
while True:
shelfintc = mysession.get("https://www.youtube.com/"+str(item))
if not """</div><div id="content" class=" content-alignment" role="main"><p class='largeText'>Sorry for the interruption. We have been receiving a large volume of requests from your network.</p>

<p>To continue with your YouTube experience, please fill out the form below.</p>""" in shelfintc.text and shelfintc.status_code == 200:
break
else:
print("Non-200 status code, waiting 30 seconds before retrying...")
sleep(30)

shelfiteminitdata = getinitialdata(shelfintc.text)
chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]


for channel in chanlistint: for channel in chanlistint:
channellist.add(channel["gridChannelRenderer"]["channelId"]) channellist.add(channel["gridChannelRenderer"]["channelId"])
@@ -89,7 +132,7 @@ if __name__ == "__main__":
chanl = argv chanl = argv
chanl.pop(0) chanl.pop(0)
for channel in chanl: for channel in chanl:
print(main(channel))
print(process_channel(channel))


# SAMPLES: # SAMPLES:
# UCqj7Cz7revf5maW9g5pgNcg lots of playlists # UCqj7Cz7revf5maW9g5pgNcg lots of playlists
@@ -103,4 +146,4 @@ if __name__ == "__main__":


# UCJOh5FKisc0hUlEeWFBlD-w no subscriptions, plenty of featured channels # UCJOh5FKisc0hUlEeWFBlD-w no subscriptions, plenty of featured channels


# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels
# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels

+ 20
- 10
youtube_util.py View File

@@ -1,6 +1,8 @@
from requests import session
from json import loads from json import loads
from urllib.parse import unquote from urllib.parse import unquote
from time import sleep

import requests


def getinitialdata(html: str): def getinitialdata(html: str):
for line in html.splitlines(): for line in html.splitlines():
@@ -8,19 +10,27 @@ def getinitialdata(html: str):
return loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1]) return loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
return {} return {}


mysession = session()
def getapikey(html: str):
    """Extract the InnerTube API key embedded in a YouTube page.

    Looks for the first occurrence of ``"INNERTUBE_API_KEY":"`` in *html*
    and returns the text between it and the next double quote.

    Returns an empty string when the marker is not present, rather than
    handing back an arbitrary slice of the page as if it were a key.
    """
    marker = '"INNERTUBE_API_KEY":"'
    _, found, tail = html.partition(marker)
    if not found:
        # Marker missing (e.g. an error or rate-limit page was served).
        return ""
    # The key runs up to the closing quote of the JSON string value.
    return tail.split('"', 1)[0]

#extract latest version automatically #extract latest version automatically
try:
lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
except:
lver = "2.20201002.02.01"
mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"})
def getlver(initialdata: dict):
    """Return the client version string found in parsed initial data.

    Reads ``initialdata["responseContext"]["serviceTrackingParams"][2]
    ["params"][2]["value"]``. If any step of that lookup fails (missing
    key, list too short, or a value of the wrong type), the hard-coded
    fallback version ``"2.20201016.02.00"`` is returned instead.
    """
    try:
        return initialdata["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures fall back; KeyboardInterrupt/SystemExit
        # are no longer swallowed as they were by the bare ``except``.
        return "2.20201016.02.00"


def fullyexpand(inputdict: dict):
def fullyexpand(inputdict: dict, mysession: requests.session, continuationheaders: dict):
lastrequestj = inputdict lastrequestj = inputdict
while "continuations" in lastrequestj.keys(): while "continuations" in lastrequestj.keys():
lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]))
while True:
lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]), headers=continuationheaders)
if lastrequest.status_code == 200:
break
else:
print("Non-200 API status code, waiting 30 seconds before retrying...")
sleep(30)
lastrequestj = lastrequest.json()[1]["response"]["continuationContents"]["gridContinuation"] lastrequestj = lastrequest.json()[1]["response"]["continuationContents"]["gridContinuation"]
inputdict["items"].extend(lastrequestj["items"]) inputdict["items"].extend(lastrequestj["items"])


return inputdict
return inputdict

Loading…
Cancel
Save