From 2db5e71e876028da011e2d9dfa1cf15f7bf5a98a Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Tue, 13 Oct 2020 21:11:53 -0400
Subject: [PATCH 01/11] Disable credit retrieval
---
discovery.py | 25 +------------------------
tracker.py | 2 +-
worker.py | 12 +++++-------
3 files changed, 7 insertions(+), 32 deletions(-)
diff --git a/discovery.py b/discovery.py
index c6ad4a5..bf3c24c 100644
--- a/discovery.py
+++ b/discovery.py
@@ -123,34 +123,11 @@ def getmetadata(mysession, vid, ccenabledonly=False):
except BaseException as e:
print(e)
print("Exception in discovery, continuing anyway")
-
- creditdata = {}
-
- if not ccenabledonly:
- try:
- mdinfo = initdata["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
- for item in mdinfo:
- if item["metadataRowRenderer"]["title"]["simpleText"].startswith("Caption author"): #the request to /watch needs to be in English for this to work
- try:
- desl = langcodes[item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]]
- except KeyError as e:
- #print(e)
- print("Language code conversion error, using language name")
- desl = item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]
- creditdata[desl] = []
- for itemint in item["metadataRowRenderer"]["contents"]:
- creditdata[desl].append({"name": itemint["runs"][0]["text"], "channel": itemint["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]})
-
- except KeyError as e:
- #print("Video does not have credits")
- pass
- #raise
- #print(e)
if initplay and (initdata or ccenabledonly):
break
- return ccenabled, creditdata, recvids, recchans, recmixes, recplayl
+ return ccenabled, recvids, recchans, recmixes, recplayl
if __name__ == "__main__":
from sys import argv
diff --git a/tracker.py b/tracker.py
index f3782bf..63bd35a 100644
--- a/tracker.py
+++ b/tracker.py
@@ -9,7 +9,7 @@ from os.path import isfile
from json import loads
# https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
-VERSION = "20201002.01"
+VERSION = "20201014.01"
TRACKER_ID = "ext-yt-communitycontribs"
TRACKER_HOST = "trackerproxy.meo.ws"
diff --git a/worker.py b/worker.py
index bf9738c..a614c05 100644
--- a/worker.py
+++ b/worker.py
@@ -135,11 +135,9 @@ def threadrunner():
print(e)
print("Error in retrieving information, waiting 30 seconds and trying again")
sleep(30)
- if info[0] or info[1]: # ccenabled or creditdata
+ if info[0]: # ccenabled or creditdata
if not isdir("out/"+str(vid).strip()):
mkdir("out/"+str(vid).strip())
- if info[1]:
- open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))
if info[0]:
for langcode in langs:
@@ -153,13 +151,13 @@ def threadrunner():
jobs.put(("complete", None, "video:"+vid))
- for videodisc in info[2]:
+ for videodisc in info[1]:
jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
- for channeldisc in info[3]:
+ for channeldisc in info[2]:
jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
- for mixdisc in info[4]:
+ for mixdisc in info[3]:
jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
- for playldisc in info[5]:
+ for playldisc in info[4]:
jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
#jobs.put(("complete", None, "video:"+vid))
From 76af3fd211064e99da3568cd1deaa98d6ee846b1 Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Tue, 13 Oct 2020 21:22:29 -0400
Subject: [PATCH 02/11] Fix comment
---
worker.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/worker.py b/worker.py
index a614c05..7b59b3b 100644
--- a/worker.py
+++ b/worker.py
@@ -135,7 +135,7 @@ def threadrunner():
print(e)
print("Error in retrieving information, waiting 30 seconds and trying again")
sleep(30)
- if info[0]: # ccenabled or creditdata
+ if info[0]: # ccenabled
if not isdir("out/"+str(vid).strip()):
mkdir("out/"+str(vid).strip())
From f7b92768dbeb68f42e5753436354f2e310a676eb Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Sat, 17 Oct 2020 00:25:27 -0400
Subject: [PATCH 03/11] Cleanup
---
worker.py | 118 ++++++++++++++++++------------------------------------
1 file changed, 39 insertions(+), 79 deletions(-)
diff --git a/worker.py b/worker.py
index 7b59b3b..4685541 100644
--- a/worker.py
+++ b/worker.py
@@ -3,7 +3,7 @@ import requests
from time import sleep
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
-from json import dumps, loads
+from json import loads
from youtube_channel import main
@@ -87,7 +87,6 @@ open("cookies.txt", "w").write("""# HTTP Cookie File
del cookies
validationtimes = 0
-shouldgetjob = True
#Graceful Shutdown
class GracefulKiller:
@@ -102,20 +101,8 @@ class GracefulKiller:
gkiller = GracefulKiller()
-#REMOVED PANIC MECHANISM!
-"""
-enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
-if not enres:
- print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
- shouldgetjob = False
- gkiller.kill_now = True #exit the script
-
-del enres
-"""
-
#microtasks
def threadrunner():
- global shouldgetjob
global validationtimes
jobs = Queue()
ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
@@ -124,7 +111,6 @@ def threadrunner():
task, vid, args = jobs.get()
if task == "submitdiscovery":
tracker.add_item_to_tracker(args, vid)
- #pass
elif task == "discovery":
while True:
@@ -159,9 +145,6 @@ def threadrunner():
jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
for playldisc in info[4]:
jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
-
- #jobs.put(("complete", None, "video:"+vid))
- #pass
elif task == "subtitles":
subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions)
@@ -198,17 +181,14 @@ def threadrunner():
elif task == "mixplaylist":
try:
wptext = mysession.get("https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1]).text
- #chanl = set()
+
#channel handling not needed here because we will get it from the video
for line in wptext.splitlines():
if line.strip().startswith('window["ytInitialData"] = '):
initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
for itemyvp in initdata["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]["contents"]:
jobs.put(("submitdiscovery", itemyvp["playlistPanelVideoRenderer"]["videoId"], tracker.ItemType.Video))
- #chanl.add(itemyvp["playlistPanelVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
-
- #for itemn in chanl:
- # jobs.put(("submitdiscovery", itemn, tracker.ItemType.Channel))
+
jobs.put(("complete", None, "mixplaylist:"+args))
except:
print("Mix Playlist error, ignoring but not marking as complete...", "https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1])
@@ -262,60 +242,44 @@ def threadrunner():
# get a new task from tracker
collect() #cleanup
- #Protection Mechanism Disarmed
- """
- #check that the account has community contributions enabled every 50th item
- validationtimes += 1
- if not validationtimes % 50:
- enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
- if not enres:
- print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
- shouldgetjob = False
- gkiller.kill_now = True #exit the script
- del enres
- """
-
- if shouldgetjob:
- desit = tracker.request_item_from_tracker()
- print("New task:", desit)
-
- if desit:
- if desit.split(":", 1)[0] == "video":
- needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
- 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
- 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
- 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
- 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
- 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
- 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
- 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
- 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
- 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
- 'xh': None, 'yi': None, 'yo': None, 'zu': None}
- needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
- 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
- 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
- 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
- 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
- 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
- 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
- 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
- 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
- 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
- 'xh': None, 'yi': None, 'yo': None, 'zu': None}
- jobs.put(("discovery", desit.split(":", 1)[1], None))
- elif desit.split(":", 1)[0] == "channel":
- jobs.put(("channel", None, desit.split(":", 1)[1]))
- elif desit.split(":", 1)[0] == "playlist":
- jobs.put(("playlist", None, desit.split(":", 1)[1]))
- elif desit.split(":", 1)[0] == "mixplaylist":
- jobs.put(("mixplaylist", None, desit.split(":", 1)[1]))
- else:
- print("Ignoring item for now", desit)
+ desit = tracker.request_item_from_tracker()
+ print("New task:", desit)
+
+ if desit:
+ if desit.split(":", 1)[0] == "video":
+ needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
+ 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
+ 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
+ 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
+ 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
+ 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
+ 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
+ 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
+ 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
+ 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
+ 'xh': None, 'yi': None, 'yo': None, 'zu': None}
+ needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
+ 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
+ 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
+ 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
+ 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
+ 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
+ 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
+ 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
+ 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
+ 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
+ 'xh': None, 'yi': None, 'yo': None, 'zu': None}
+ jobs.put(("discovery", desit.split(":", 1)[1], None))
+ elif desit.split(":", 1)[0] == "channel":
+ jobs.put(("channel", None, desit.split(":", 1)[1]))
+ elif desit.split(":", 1)[0] == "playlist":
+ jobs.put(("playlist", None, desit.split(":", 1)[1]))
+ elif desit.split(":", 1)[0] == "mixplaylist":
+ jobs.put(("mixplaylist", None, desit.split(":", 1)[1]))
else:
print("Ignoring item for now", desit)
else:
- break
+ print("Ignoring item for now", desit)
else:
break
@@ -338,8 +302,4 @@ for x in threads:
threads.remove(x)
del x
-if not shouldgetjob:
- print("PROTECTION MECHANISM #3 WAS SOMEHOW TRIGERRED")
- print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
-
-print("Exiting...")
+print("Exiting...")
\ No newline at end of file
From 80d125ace717a238e193f94778905ca6afdd1a44 Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Sat, 17 Oct 2020 00:29:57 -0400
Subject: [PATCH 04/11] Detect failures when transferring to target
---
worker.py | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/worker.py b/worker.py
index 4685541..bdf4372 100644
--- a/worker.py
+++ b/worker.py
@@ -219,10 +219,17 @@ def threadrunner():
print("Waiting 5 minutes...")
sleep(300)
- if targetloc.startswith("rsync"):
- system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
- elif targetloc.startswith("http"):
- system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
+ while True:
+ if targetloc.startswith("rsync"):
+ exitinfo = system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
+ elif targetloc.startswith("http"):
+ exitinfo = system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
+
+ if exitinfo == 0: # note that on Unix this isn't necessarily the exit code but it's still 0 upon successful exit
+ break
+ else:
+ print("Error in sending data to target, waiting 30 seconds and trying again.")
+ sleep(30)
size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip")
From 3e43461c4376b03247d03e5023958d219a9c3b65 Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Sat, 17 Oct 2020 00:34:04 -0400
Subject: [PATCH 05/11] Update youtube-util
---
worker.py | 4 ++--
youtube_channel.py | 44 ++++++++++++++++++++++++++------------------
youtube_util.py | 23 +++++++++++++----------
3 files changed, 41 insertions(+), 30 deletions(-)
diff --git a/worker.py b/worker.py
index bdf4372..40904bb 100644
--- a/worker.py
+++ b/worker.py
@@ -5,7 +5,7 @@ from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from json import loads
-from youtube_channel import main
+from youtube_channel import process_channel
import signal
@@ -159,7 +159,7 @@ def threadrunner():
jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
#channel created playlists
- y = main(desit.split(":", 1)[1])
+ y = process_channel(desit.split(":", 1)[1])
for itemyv in y["playlists"]:
jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist))
for itemyv in y["channels"]:
diff --git a/youtube_channel.py b/youtube_channel.py
index 57ffbbe..37db418 100644
--- a/youtube_channel.py
+++ b/youtube_channel.py
@@ -1,25 +1,31 @@
from requests import session
-from youtube_util import getinitialdata, fullyexpand
-
-# TODO: Rate limit detection, HTTP3?
+from youtube_util import getinitialdata, fullyexpand, getapikey, getlver
mysession = session()
#extract latest version automatically
-try:
- lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
-except:
- lver = "2.20201002.02.01"
+homepage = mysession.get("https://www.youtube.com/").text
+
+API_KEY = getapikey(homepage)
+
+params = (
+ ('key', API_KEY),
+)
-#print(lver)
-mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"})
+API_VERSION = getlver(getinitialdata(homepage))
-def main(channelid: str):
+continuationheaders = {"x-youtube-client-name": "1", "x-youtube-client-version": API_VERSION, "Accept-Language": "en-US"}
+
+del homepage
+
+def process_channel(channelid: str):
playlists = set()
shelfres = set()
channellist = set()
# PLAYLISTS
- initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/playlists").text)
+ data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EglwbGF5bGlzdHM%3D"}
+ initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json()
+
CHANNELS_ID = 0
PLAYLISTS_ID = 0
@@ -42,7 +48,7 @@ def main(channelid: str):
if "shelfRenderer" in itemint.keys():
shelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"])
elif "gridRenderer" in itemint.keys():
- playlistsint = fullyexpand(itemint["gridRenderer"])["items"]
+ playlistsint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"]
for playlist in playlistsint:
playlists.add(playlist["gridPlaylistRenderer"]["playlistId"])
@@ -51,7 +57,7 @@ def main(channelid: str):
for item in shelfres:
shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
- playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"]
+ playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]
for playlist in playlistsint:
playlists.add(playlist["gridPlaylistRenderer"]["playlistId"])
@@ -61,7 +67,9 @@ def main(channelid: str):
# CHANNELS
cshelfres = set()
- initdata = getinitialdata(mysession.get("https://www.youtube.com/channel/"+str(channelid)+"/channels").text)
+ # PLAYLISTS
+ data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EghjaGFubmVscw%3D%3D"}
+ initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json()
shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"]
@@ -70,14 +78,14 @@ def main(channelid: str):
if "shelfRenderer" in itemint.keys():
cshelfres.add(itemint["shelfRenderer"]["title"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"])
elif "gridRenderer" in itemint.keys():
- chanlistint = fullyexpand(itemint["gridRenderer"])["items"]
+ chanlistint = fullyexpand(itemint["gridRenderer"], mysession, continuationheaders)["items"]
for channel in chanlistint:
channellist.add(channel["gridChannelRenderer"]["channelId"])
for item in cshelfres:
shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
- chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"])["items"]
+ chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]
for channel in chanlistint:
channellist.add(channel["gridChannelRenderer"]["channelId"])
@@ -89,7 +97,7 @@ if __name__ == "__main__":
chanl = argv
chanl.pop(0)
for channel in chanl:
- print(main(channel))
+ print(process_channel(channel))
# SAMPLES:
# UCqj7Cz7revf5maW9g5pgNcg lots of playlists
@@ -103,4 +111,4 @@ if __name__ == "__main__":
# UCJOh5FKisc0hUlEeWFBlD-w no subscriptions, plenty of featured channels
-# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels
+# UC7fjJERoGTs_eOKk-nn7RMw fair number of featured channels
\ No newline at end of file
diff --git a/youtube_util.py b/youtube_util.py
index 54fe2cf..4a73996 100644
--- a/youtube_util.py
+++ b/youtube_util.py
@@ -1,26 +1,29 @@
-from requests import session
from json import loads
from urllib.parse import unquote
+import requests
+
def getinitialdata(html: str):
for line in html.splitlines():
if line.strip().startswith('window["ytInitialData"] = '):
return loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
return {}
-mysession = session()
+def getapikey(html: str):
+ return html.split('"INNERTUBE_API_KEY":"', 1)[-1].split('"', 1)[0]
+
#extract latest version automatically
-try:
- lver = getinitialdata(mysession.get("https://www.youtube.com/").text)["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
-except:
- lver = "2.20201002.02.01"
-mysession.headers.update({"x-youtube-client-name": "1", "x-youtube-client-version": lver, "Accept-Language": "en-US"})
+def getlver(initialdata: dict):
+ try:
+ return initialdata["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
+ except:
+ return "2.20201002.02.01"
-def fullyexpand(inputdict: dict):
+def fullyexpand(inputdict: dict, mysession: requests.session, continuationheaders: dict):
lastrequestj = inputdict
while "continuations" in lastrequestj.keys():
- lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]))
+ lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]), headers=continuationheaders)
lastrequestj = lastrequest.json()[1]["response"]["continuationContents"]["gridContinuation"]
inputdict["items"].extend(lastrequestj["items"])
- return inputdict
+ return inputdict
\ No newline at end of file
From 6ce4345d10152bf3d5fefdf557145a57968d1a40 Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Sat, 17 Oct 2020 00:53:25 -0400
Subject: [PATCH 06/11] Basic rate limit detection for youtube-util
---
youtube_channel.py | 23 +++++++++++++++++++----
youtube_util.py | 9 ++++++++-
2 files changed, 27 insertions(+), 5 deletions(-)
diff --git a/youtube_channel.py b/youtube_channel.py
index 37db418..2a4b133 100644
--- a/youtube_channel.py
+++ b/youtube_channel.py
@@ -1,5 +1,6 @@
from requests import session
from youtube_util import getinitialdata, fullyexpand, getapikey, getlver
+from time import sleep
mysession = session()
#extract latest version automatically
@@ -24,7 +25,14 @@ def process_channel(channelid: str):
# PLAYLISTS
data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EglwbGF5bGlzdHM%3D"}
- initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json()
+ while True:
+ initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data)
+ if initdata.status_code == 200:
+ initdata = initdata.json()
+ break
+ else:
+ print("Non-200 API status code, waiting 30 seconds before retrying...")
+ sleep(30)
CHANNELS_ID = 0
@@ -67,10 +75,17 @@ def process_channel(channelid: str):
# CHANNELS
cshelfres = set()
- # PLAYLISTS
+ # PLAYLISTS
data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EghjaGFubmVscw%3D%3D"}
- initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json()
-
+ while True:
+ initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json()
+ if initdata.status_code == 200:
+ initdata = initdata.json()
+ break
+ else:
+ print("Non-200 API status code, waiting 30 seconds before retrying...")
+ sleep(30)
+
shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"]
for item in shelflist:
diff --git a/youtube_util.py b/youtube_util.py
index 4a73996..8bf506e 100644
--- a/youtube_util.py
+++ b/youtube_util.py
@@ -1,5 +1,6 @@
from json import loads
from urllib.parse import unquote
+from time import sleep
import requests
@@ -22,7 +23,13 @@ def getlver(initialdata: dict):
def fullyexpand(inputdict: dict, mysession: requests.session, continuationheaders: dict):
lastrequestj = inputdict
while "continuations" in lastrequestj.keys():
- lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]), headers=continuationheaders)
+ while True:
+ lastrequest = mysession.get("https://www.youtube.com/browse_ajax?continuation="+unquote(lastrequestj["continuations"][0]["nextContinuationData"]["continuation"]), headers=continuationheaders)
+ if lastrequest.status_code == 200:
+ break
+ else:
+ print("Non-200 API status code, waiting 30 seconds before retrying...")
+ sleep(30)
lastrequestj = lastrequest.json()[1]["response"]["continuationContents"]["gridContinuation"]
inputdict["items"].extend(lastrequestj["items"])
From 82b08c62c4a46dc5e6fef0bb60b2d3b7ace42ce0 Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Sat, 17 Oct 2020 00:54:49 -0400
Subject: [PATCH 07/11] Update default API version
---
youtube_util.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/youtube_util.py b/youtube_util.py
index 8bf506e..81d466a 100644
--- a/youtube_util.py
+++ b/youtube_util.py
@@ -18,7 +18,7 @@ def getlver(initialdata: dict):
try:
return initialdata["responseContext"]["serviceTrackingParams"][2]["params"][2]["value"]
except:
- return "2.20201002.02.01"
+ return "2.20201016.02.00"
def fullyexpand(inputdict: dict, mysession: requests.session, continuationheaders: dict):
lastrequestj = inputdict
From 7a70bc25b59624be82f354d1586091de648ddfe6 Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Sat, 17 Oct 2020 01:52:16 -0400
Subject: [PATCH 08/11] youtube_channel: validate shelf item responses and retry on rate-limit pages
---
youtube_channel.py | 26 +++++++++++++++++++++++---
1 file changed, 23 insertions(+), 3 deletions(-)
diff --git a/youtube_channel.py b/youtube_channel.py
index 2a4b133..93f6750 100644
--- a/youtube_channel.py
+++ b/youtube_channel.py
@@ -64,7 +64,17 @@ def process_channel(channelid: str):
channellist.add(playlist["gridPlaylistRenderer"]["shortBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
for item in shelfres:
- shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
+ while True:
+ shelfintp = mysession.get("https://www.youtube.com/"+str(item))
+ if not """
Sorry for the interruption. We have been receiving a large volume of requests from your network.
+
+
To continue with your YouTube experience, please fill out the form below.
""" in shelfintp.text and not shelfintp.status_code == 200:
+ break
+ else:
+ print("Non-200 status code, waiting 30 seconds before retrying...")
+ sleep(30)
+
+ shelfiteminitdata = getinitialdata(shelfintp.text)
playlistsint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][PLAYLISTS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]
for playlist in playlistsint:
@@ -85,7 +95,7 @@ def process_channel(channelid: str):
else:
print("Non-200 API status code, waiting 30 seconds before retrying...")
sleep(30)
-
+
shelflist = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"]
for item in shelflist:
@@ -99,7 +109,17 @@ def process_channel(channelid: str):
channellist.add(channel["gridChannelRenderer"]["channelId"])
for item in cshelfres:
- shelfiteminitdata = getinitialdata(mysession.get("https://www.youtube.com/"+str(item)).text)
+ while True:
+ shelfintc = mysession.get("https://www.youtube.com/"+str(item))
+ if not """
Sorry for the interruption. We have been receiving a large volume of requests from your network.
+
+
To continue with your YouTube experience, please fill out the form below.
""" in shelfintc.text and not shelfintc.status_code == 200:
+ break
+ else:
+ print("Non-200 status code, waiting 30 seconds before retrying...")
+ sleep(30)
+
+ shelfiteminitdata = getinitialdata(shelfintc.text)
chanlistint = fullyexpand(shelfiteminitdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][CHANNELS_ID]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"], mysession, continuationheaders)["items"]
for channel in chanlistint:
From a6f6567cb036b931523455028d8fae6b8269e8cb Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Sat, 17 Oct 2020 01:52:51 -0400
Subject: [PATCH 09/11] Update version
---
tracker.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tracker.py b/tracker.py
index 63bd35a..a78ec20 100644
--- a/tracker.py
+++ b/tracker.py
@@ -9,7 +9,7 @@ from os.path import isfile
from json import loads
# https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
-VERSION = "20201014.01"
+VERSION = "20201017.01"
TRACKER_ID = "ext-yt-communitycontribs"
TRACKER_HOST = "trackerproxy.meo.ws"
From 19f1d7fdc83272d6a0227cc72caea57eae832286 Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Sat, 17 Oct 2020 01:54:03 -0400
Subject: [PATCH 10/11] youtube_channel: check status code before calling .json() on channels request
---
youtube_channel.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/youtube_channel.py b/youtube_channel.py
index 93f6750..842ea91 100644
--- a/youtube_channel.py
+++ b/youtube_channel.py
@@ -88,7 +88,7 @@ def process_channel(channelid: str):
# PLAYLISTS
data = {"context":{"client":{"hl":"en","gl":"US","clientName":"WEB","clientVersion":API_VERSION}},"browseId":channelid,"params":"EghjaGFubmVscw%3D%3D"}
while True:
- initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data).json()
+ initdata = mysession.post("https://www.youtube.com/youtubei/v1/browse", params=params, json=data)
if initdata.status_code == 200:
initdata = initdata.json()
break
From 5b9784ddbf7c9256f19617f306c31567ddc72bc7 Mon Sep 17 00:00:00 2001
From: tech234a <46801700+tech234a@users.noreply.github.com>
Date: Sat, 17 Oct 2020 01:57:06 -0400
Subject: [PATCH 11/11] youtube_channel: fix inverted status-code check in shelf retry loops
---
youtube_channel.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/youtube_channel.py b/youtube_channel.py
index 842ea91..c5d2395 100644
--- a/youtube_channel.py
+++ b/youtube_channel.py
@@ -68,7 +68,7 @@ def process_channel(channelid: str):
shelfintp = mysession.get("https://www.youtube.com/"+str(item))
if not """
Sorry for the interruption. We have been receiving a large volume of requests from your network.
-
To continue with your YouTube experience, please fill out the form below.
""" in shelfintp.text and not shelfintp.status_code == 200:
+
To continue with your YouTube experience, please fill out the form below.
""" in shelfintp.text and shelfintp.status_code == 200:
break
else:
print("Non-200 status code, waiting 30 seconds before retrying...")
@@ -113,7 +113,7 @@ def process_channel(channelid: str):
shelfintc = mysession.get("https://www.youtube.com/"+str(item))
if not """
Sorry for the interruption. We have been receiving a large volume of requests from your network.
-
To continue with your YouTube experience, please fill out the form below.
""" in shelfintc.text and not shelfintc.status_code == 200:
+
To continue with your YouTube experience, please fill out the form below.
""" in shelfintc.text and shelfintc.status_code == 200:
break
else:
print("Non-200 status code, waiting 30 seconds before retrying...")