Browse Source

Merge pull request #11 from Data-Horde/disable-credits

20201017.1
master
tech234a 3 years ago
committed by GitHub
parent
commit
ca9acb37ca
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 142 additions and 147 deletions
  1. +1
    -24
      discovery.py
  2. +1
    -1
      tracker.py
  3. +57
    -92
      worker.py
  4. +63
    -20
      youtube_channel.py
  5. +20
    -10
      youtube_util.py

+ 1
- 24
discovery.py View File

@@ -123,34 +123,11 @@ def getmetadata(mysession, vid, ccenabledonly=False):
except BaseException as e:
print(e)
print("Exception in discovery, continuing anyway")
creditdata = {}
if not ccenabledonly:
try:
mdinfo = initdata["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
for item in mdinfo:
if item["metadataRowRenderer"]["title"]["simpleText"].startswith("Caption author"): #the request to /watch needs to be in English for this to work
try:
desl = langcodes[item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]]
except KeyError as e:
#print(e)
print("Language code conversion error, using language name")
desl = item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]
creditdata[desl] = []
for itemint in item["metadataRowRenderer"]["contents"]:
creditdata[desl].append({"name": itemint["runs"][0]["text"], "channel": itemint["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]})
except KeyError as e:
#print("Video does not have credits")
pass
#raise
#print(e)
if initplay and (initdata or ccenabledonly):
break
return ccenabled, creditdata, recvids, recchans, recmixes, recplayl
return ccenabled, recvids, recchans, recmixes, recplayl
if __name__ == "__main__":
from sys import argv


+ 1
- 1
tracker.py View File

@@ -9,7 +9,7 @@ from os.path import isfile
from json import loads

# https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
VERSION = "20201002.01"
VERSION = "20201017.01"

TRACKER_ID = "ext-yt-communitycontribs"
TRACKER_HOST = "trackerproxy.meo.ws"


+ 57
- 92
worker.py View File

@@ -3,9 +3,9 @@ import requests
from time import sleep
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from json import dumps, loads
from json import loads

from youtube_channel import main
from youtube_channel import process_channel

import signal

@@ -87,7 +87,6 @@ open("cookies.txt", "w").write("""# HTTP Cookie File
del cookies

validationtimes = 0
shouldgetjob = True

#Graceful Shutdown
class GracefulKiller:
@@ -102,20 +101,8 @@ class GracefulKiller:

gkiller = GracefulKiller()

#REMOVED PANIC MECHANISM!
"""
enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
if not enres:
print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
shouldgetjob = False
gkiller.kill_now = True #exit the script

del enres
"""

#microtasks
def threadrunner():
global shouldgetjob
global validationtimes
jobs = Queue()
ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
@@ -124,7 +111,6 @@ def threadrunner():
task, vid, args = jobs.get()
if task == "submitdiscovery":
tracker.add_item_to_tracker(args, vid)
#pass
elif task == "discovery":
while True:
@@ -135,11 +121,9 @@ def threadrunner():
print(e)
print("Error in retrieving information, waiting 30 seconds and trying again")
sleep(30)
if info[0] or info[1]: # ccenabled or creditdata
if info[0]: # ccenabled
if not isdir("out/"+str(vid).strip()):
mkdir("out/"+str(vid).strip())
if info[1]:
open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))

if info[0]:
for langcode in langs:
@@ -153,17 +137,14 @@ def threadrunner():

jobs.put(("complete", None, "video:"+vid))

for videodisc in info[2]:
for videodisc in info[1]:
jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
for channeldisc in info[3]:
for channeldisc in info[2]:
jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
for mixdisc in info[4]:
for mixdisc in info[3]:
jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
for playldisc in info[5]:
for playldisc in info[4]:
jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
#jobs.put(("complete", None, "video:"+vid))
#pass

elif task == "subtitles":
subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions)
@@ -178,7 +159,7 @@ def threadrunner():
jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))

#channel created playlists
y = main(desit.split(":", 1)[1])
y = process_channel(desit.split(":", 1)[1])
for itemyv in y["playlists"]:
jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist))
for itemyv in y["channels"]:
@@ -200,17 +181,14 @@ def threadrunner():
elif task == "mixplaylist":
try:
wptext = mysession.get("https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1]).text
#chanl = set()
#channel handling not needed here because we will get it from the video
for line in wptext.splitlines():
if line.strip().startswith('window["ytInitialData"] = '):
initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
for itemyvp in initdata["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]["contents"]:
jobs.put(("submitdiscovery", itemyvp["playlistPanelVideoRenderer"]["videoId"], tracker.ItemType.Video))
#chanl.add(itemyvp["playlistPanelVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
#for itemn in chanl:
# jobs.put(("submitdiscovery", itemn, tracker.ItemType.Channel))

jobs.put(("complete", None, "mixplaylist:"+args))
except:
print("Mix Playlist error, ignoring but not marking as complete...", "https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1])
@@ -241,10 +219,17 @@ def threadrunner():
print("Waiting 5 minutes...")
sleep(300)

if targetloc.startswith("rsync"):
system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
elif targetloc.startswith("http"):
system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
while True:
if targetloc.startswith("rsync"):
exitinfo = system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
elif targetloc.startswith("http"):
exitinfo = system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)

if exitinfo == 0: # note that on Unix this isn't necessarily the exit code but it's still 0 upon successful exit
break
else:
print("Error in sending data to target, waiting 30 seconds and trying again.")
sleep(30)


size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip")
@@ -264,60 +249,44 @@ def threadrunner():
# get a new task from tracker
collect() #cleanup

#Protection Mechanism Disarmed
"""
#check that the account has community contributions enabled every 50th item
validationtimes += 1
if not validationtimes % 50:
enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
if not enres:
print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
shouldgetjob = False
gkiller.kill_now = True #exit the script
del enres
"""

if shouldgetjob:
desit = tracker.request_item_from_tracker()
print("New task:", desit)

if desit:
if desit.split(":", 1)[0] == "video":
needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
jobs.put(("discovery", desit.split(":", 1)[1], None))
elif desit.split(":", 1)[0] == "channel":
jobs.put(("channel", None, desit.split(":", 1)[1]))
elif desit.split(":", 1)[0] == "playlist":
jobs.put(("playlist", None, desit.split(":", 1)[1]))
elif desit.split(":", 1)[0] == "mixplaylist":
jobs.put(("mixplaylist", None, desit.split(":", 1)[1]))
else:
print("Ignoring item for now", desit)
desit = tracker.request_item_from_tracker()
print("New task:", desit)

if desit:
if desit.split(":", 1)[0] == "video":
needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
jobs.put(("discovery", desit.split(":", 1)[1], None))
elif desit.split(":", 1)[0] == "channel":
jobs.put(("channel", None, desit.split(":", 1)[1]))
elif desit.split(":", 1)[0] == "playlist":
jobs.put(("playlist", None, desit.split(":", 1)[1]))
elif desit.split(":", 1)[0] == "mixplaylist":
jobs.put(("mixplaylist", None, desit.split(":", 1)[1]))
else:
print("Ignoring item for now", desit)
else:
break
print("Ignoring item for now", desit)
else:
break
@@ -340,8 +309,4 @@ for x in threads:
threads.remove(x)
del x

if not shouldgetjob:
print("PROTECTION MECHANISM #3 WAS SOMEHOW TRIGERRED")
print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")

print("Exiting...")
print("Exiting...")

+ 63
- 20
youtube_channel.py View File

@@ -1,25 +1,39 @@
from requests import session
from youtube_util import getinitialdata, fullyexpand

# TODO: Rate limit detection, HTTP3?
from youtube_util import getinitialdata, fullyexpand, getapikey, getlver
from time import sleep

mysession = session()
#extract latest version automatically
try:
lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
except:
lver = "2.20201002.02.01"
homepage = mysession.get("https://www.youtube.com/").text

API_KEY = getapikey(homepage)

params = (
('key', API_KEY),
)

#print(lver)
mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"})
API_VERSION = getlver(getinitialdata(homepage))

def main(channelid: str):
continuationheaders = {"x-youtube-client-name": "1", "x-youtube-client-version": API_VERSION, "Accept-Language": "en-US"}

del homepage

def process_channel(channelid: str):
playlists = set()
shelfres = set()
channellist = set()

# PLAYLISTS
initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/playlists").text)
data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EglwbGF5bGlzdHM%3D"}
while True:
initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data)
if initdata.status_code == 200:
initdata = initdata.json()
break
else:
print("Non-200 API status code, waiting 30 seconds before retrying...")
sleep(30)


CHANNELS_ID = 0
PLAYLISTS_ID = 0
@@ -42,7 +56,7 @@ def main(channelid: str):
if "shelfRenderer" in itemint.keys():
shelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"])
elif "gridRenderer" in itemint.keys():
playlistsint = fullyexpand(itemint["gridRenderer"])["items"]
playlistsint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"]

for playlist in playlistsint:
playlists.add(playlist["gridPlaylistRenderer"]["playlistId"])
@@ -50,8 +64,18 @@ def main(channelid: str):
channellist.add(playlist["gridPlaylistRenderer"]["shortBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])

for item in shelfres:
shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"]
while True:
shelfintp = mysession.get("https://www.youtube.com/"+str(item))
if not """</div><div id="content" class=" content-alignment" role="main"><p class='largeText'>Sorry for the interruption. We have been receiving a large volume of requests from your network.</p>

<p>To continue with your YouTube experience, please fill out the form below.</p>""" in shelfintp.text and shelfintp.status_code == 200:
break
else:
print("Non-200 status code, waiting 30 seconds before retrying...")
sleep(30)

shelfiteminitdata = getinitialdata(shelfintp.text)
playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]

for playlist in playlistsint:
playlists.add(playlist["gridPlaylistRenderer"]["playlistId"])
@@ -61,7 +85,16 @@ def main(channelid: str):
# CHANNELS
cshelfres = set()

initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/channels").text)
# PLAYLISTS
data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EghjaGFubmVscw%3D%3D"}
while True:
initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data)
if initdata.status_code == 200:
initdata = initdata.json()
break
else:
print("Non-200 API status code, waiting 30 seconds before retrying...")
sleep(30)

shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"]

@@ -70,14 +103,24 @@ def main(channelid: str):
if "shelfRenderer" in itemint.keys():
cshelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"])
elif "gridRenderer" in itemint.keys():
chanlistint = fullyexpand(itemint["gridRenderer"])["items"]
chanlistint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"]

for channel in chanlistint:
channellist.add(channel["gridChannelRenderer"]["channelId"])

for item in cshelfres:
shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"]
while True:
shelfintc = mysession.get("https://www.youtube.com/"+str(item))
if not """</div><div id="content" class=" content-alignment" role="main"><p class='largeText'>Sorry for the interruption. We have been receiving a large volume of requests from your network.</p>

<p>To continue with your YouTube experience, please fill out the form below.</p>""" in shelfintc.text and shelfintc.status_code == 200:
break
else:
print("Non-200 status code, waiting 30 seconds before retrying...")
sleep(30)

shelfiteminitdata = getinitialdata(shelfintc.text)
chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]

for channel in chanlistint:
channellist.add(channel["gridChannelRenderer"]["channelId"])
@@ -89,7 +132,7 @@ if __name__ == "__main__":
chanl = argv
chanl.pop(0)
for channel in chanl:
print(main(channel))
print(process_channel(channel))

# SAMPLES:
# UCqj7Cz7revf5maW9g5pgNcg lots of playlists
@@ -103,4 +146,4 @@ if __name__ == "__main__":

# UCJOh5FKisc0hUlEeWFBlD-w no subscriptions, plenty of featured channels

# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels
# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels

+ 20
- 10
youtube_util.py View File

@@ -1,6 +1,8 @@
from requests import session
from json import loads
from urllib.parse import unquote
from time import sleep

import requests

def getinitialdata(html: str):
for line in html.splitlines():
@@ -8,19 +10,27 @@ def getinitialdata(html: str):
return loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
return {}

mysession = session()
def getapikey(html: str):
return html.split('"INNERTUBE_API_KEY":"', 1)[-1].split('"', 1)[0]

#extract latest version automatically
try:
lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
except:
lver = "2.20201002.02.01"
mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"})
def getlver(initialdata: dict):
try:
return initialdata["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
except:
return "2.20201016.02.00"

def fullyexpand(inputdict: dict):
def fullyexpand(inputdict: dict, mysession: requests.session, continuationheaders: dict):
lastrequestj = inputdict
while "continuations" in lastrequestj.keys():
lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]))
while True:
lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]), headers=continuationheaders)
if lastrequest.status_code == 200:
break
else:
print("Non-200 API status code, waiting 30 seconds before retrying...")
sleep(30)
lastrequestj = lastrequest.json()[1]["response"]["continuationContents"]["gridContinuation"]
inputdict["items"].extend(lastrequestj["items"])

return inputdict
return inputdict

Loading…
Cancel
Save