Browse Source

Update to one thread per task, fix discovery exception, prevent complete event from occurring more than once

pull/8/head
tech234a 3 years ago
parent
commit
afcdd991fa
5 changed files with 95 additions and 57 deletions
  1. +0
    -2
      README.md
  2. +30
    -28
      discovery.py
  3. +26
    -10
      export.py
  4. +1
    -1
      tracker.py
  5. +38
    -16
      worker.py

+ 0
- 2
README.md View File

@@ -8,8 +8,6 @@ Ensure that `python` 3.8.5, `zip`, `curl`, and `rsync` are installed on your sys
### Archiving Worker:
After completing the above setup steps, simply run `python3 worker.py`.
Note: there may be a problem with `rsync` when running the script on WSL. Please alert me if `rsync` stalls on WSL for you too.
### Heroku
A wrapper repo for free and easy deployment and environment configuration, as well as automatic updates every 24-27.6 hours, is available. Deploy up to 5 instances of it to a free Heroku account (total max monthly runtime 550 hours) with no need for credit card verification by clicking the button below.


+ 30
- 28
discovery.py View File

@@ -13,7 +13,7 @@ def getmetadata(mysession, vid):
wpage = mysession.get("https://www.youtube.com/watch", params=params)
if not """</div><div id="content" class=" content-alignment" role="main"><p class='largeText'>Sorry for the interruption. We have been receiving a large volume of requests from your network.</p>
<p>To continue with your YouTube experience, please fill out the form below.</p>""" in wpage.text:
<p>To continue with your YouTube experience, please fill out the form below.</p>""" in wpage.text and not wpage.status_code == 429:
break
else:
print("Captcha detected, waiting 30 seconds")
@@ -74,35 +74,37 @@ def getmetadata(mysession, vid):
initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
if "contents" in initdata.keys(): #prevent exception
try:
for recmd in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"]["results"]:
#auto is like the others
if "compactAutoplayRenderer" in recmd.keys():
recmd = recmd["compactAutoplayRenderer"]["contents"][0]
if "compactVideoRenderer" in recmd.keys():
recvids.add(recmd["compactVideoRenderer"]["videoId"])
try:
recchans.add(recmd["compactVideoRenderer"]["channelId"])
except KeyError as e:
if "results" in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"].keys():
for recmd in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"]["results"]:
#auto is like the others
if "compactAutoplayRenderer" in recmd.keys():
recmd = recmd["compactAutoplayRenderer"]["contents"][0]
if "compactVideoRenderer" in recmd.keys():
recvids.add(recmd["compactVideoRenderer"]["videoId"])
try:
recchans.add(recmd["compactVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
recchans.add(recmd["compactVideoRenderer"]["channelId"])
except KeyError as e:
print("Channel extract error")
#raise
#print("Unable to extract channel:")
#print(recmd["compactVideoRenderer"])
elif "compactPlaylistRenderer" in recmd.keys():
recplayl.add(recmd["compactPlaylistRenderer"]["playlistId"])
if "navigationEndpoint" in recmd["compactPlaylistRenderer"].keys():
recvids.add(recmd["compactPlaylistRenderer"]["navigationEndpoint"]["watchEndpoint"]["videoId"])
if "navigationEndpoint" in recmd["compactPlaylistRenderer"]["shortBylineText"].keys():
recchans.add(recmd["compactPlaylistRenderer"]["shortBylineText"]["navigationEndpoint"]["browseEndpoint"]["browseId"])
elif "compactRadioRenderer" in recmd.keys(): #mix playlist
recmixes.add(recmd["compactRadioRenderer"]["playlistId"])
# todo: find out if channels can be suggested
except:
try:
recchans.add(recmd["compactVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
except KeyError as e:
print("Channel extract error")
#raise
#print("Unable to extract channel:")
#print(recmd["compactVideoRenderer"])
elif "compactPlaylistRenderer" in recmd.keys():
recplayl.add(recmd["compactPlaylistRenderer"]["playlistId"])
if "navigationEndpoint" in recmd["compactPlaylistRenderer"].keys():
recvids.add(recmd["compactPlaylistRenderer"]["navigationEndpoint"]["watchEndpoint"]["videoId"])
if "navigationEndpoint" in recmd["compactPlaylistRenderer"]["shortBylineText"].keys():
recchans.add(recmd["compactPlaylistRenderer"]["shortBylineText"]["navigationEndpoint"]["browseEndpoint"]["browseId"])
elif "compactRadioRenderer" in recmd.keys(): #mix playlist
recmixes.add(recmd["compactRadioRenderer"]["playlistId"])
# todo: find out if channels can be suggested
except BaseException as e:
print(e)
print("Exception in discovery, continuing anyway")
creditdata = {}


+ 26
- 10
export.py View File

@@ -79,13 +79,25 @@ class MyHTMLParser(HTMLParser):
elif self.get_starttag_text() and self.get_starttag_text().startswith('<div id="original-video-title"'):
self.inittitle += data

def subprrun(jobs, mysession, langcode, vid, mode):
def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaptions):
if mode == "forceedit-metadata":
while needforcemetadata[langcode] == None: #extra logic
print("Awaiting forcemetadata")
sleep(1)
if needforcemetadata[langcode] == False:
#print("forcemetadata not needed")
return True #nothing needs to be done, otherwise, continue

if mode == "forceedit-captions":
while needforcecaptions[langcode] == None: #extra logic
print("Awaiting forcecaptions")
sleep(1)
if needforcecaptions[langcode] == False:
#print("forcecaptions not needed")
return True #nothing needs to be done, otherwise, continue

collect() #cleanup memory

if not "forceedit" in mode:
retval = 3
else:
retval = 1
vid = vid.strip()
print(langcode, vid)

@@ -158,12 +170,16 @@ def subprrun(jobs, mysession, langcode, vid, mode):

if not "forceedit" in mode:
if '&amp;forceedit=metadata&amp;tab=metadata">See latest</a>' in inttext:
jobs.put(("subtitles-forceedit-metadata", vid, langcode))
retval -= 1
print("Need forcemetadata")
needforcemetadata[langcode] = True
else:
needforcemetadata[langcode] = False

if '<li id="captions-editor-nav-captions" role="tab" data-state="published" class="published">' in inttext:
jobs.put(("subtitles-forceedit-captions", vid, langcode))
retval -= 1
print("Need forcecaptions")
needforcecaptions[langcode] = True
else:
needforcecaptions[langcode] = False

if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext or 'data-state="published"' in inttext or 'title="The video owner already provided subtitles/CC"' in inttext: #quick way of checking if this page is worth parsing
parser = MyHTMLParser()
@@ -225,7 +241,7 @@ def subprrun(jobs, mysession, langcode, vid, mode):
del vid
del pparams

return retval
return True

# if __name__ == "__main__":
# from os import environ, mkdir


+ 1
- 1
tracker.py View File

@@ -9,7 +9,7 @@ from os.path import isfile
from json import loads

# https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
VERSION = "20200924.03"
VERSION = "20200924.06"

TRACKER_ID = "ext-yt-communitycontribs"
TRACKER_HOST = "trackerproxy.meo.ws"


+ 38
- 16
worker.py View File

@@ -23,8 +23,6 @@ from export import subprrun
#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

langcnt = {}

try:
mkdir("out")
except:
@@ -100,13 +98,12 @@ class GracefulKiller:
gkiller = GracefulKiller()

#microtasks
def threadrunner(jobs: Queue):
global langcnt
def threadrunner():
jobs = Queue()
ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
while True:
if not jobs.empty():
task, vid, args = jobs.get()

if task == "submitdiscovery":
tracker.add_item_to_tracker(args, vid)
elif task == "discovery":
@@ -116,7 +113,7 @@ def threadrunner(jobs: Queue):
break
except BaseException as e:
print(e)
print("Error in retrieving information, waiting 30 seconds")
print("Error in retrieving information, waiting 30 seconds and trying again")
sleep(30)
if info[0] or info[1]: # ccenabled or creditdata
if not isdir("out/"+str(vid).strip()):
@@ -125,11 +122,16 @@ def threadrunner(jobs: Queue):
open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))

if info[0]:
langcnt[vid] = 0
for langcode in langs:
jobs.put(("subtitles", vid, langcode))
else:
jobs.put(("complete", None, "video:"+vid))

for langcode in langs:
jobs.put(("subtitles-forceedit-metadata", vid, langcode))

for langcode in langs:
jobs.put(("subtitles-forceedit-captions", vid, langcode))

jobs.put(("complete", None, "video:"+vid))

for videodisc in info[2]:
jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
@@ -141,14 +143,11 @@ def threadrunner(jobs: Queue):
jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))

elif task == "subtitles":
retval = subprrun(jobs, mysession, args, vid, "default")
langcnt[vid] += retval
if langcnt[vid] >= 585:
jobs.put(("complete", None, "video:"+vid))
subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions)
elif task == "subtitles-forceedit-captions":
subprrun(jobs, mysession, args, vid, "forceedit-captions")
subprrun(mysession, args, vid, "forceedit-captions", needforcemetadata, needforcecaptions)
elif task == "subtitles-forceedit-metadata":
subprrun(jobs, mysession, args, vid, "forceedit-metadata")
subprrun(mysession, args, vid, "forceedit-metadata", needforcemetadata, needforcecaptions)
elif task == "channel":
try:
y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
@@ -216,8 +215,31 @@ def threadrunner(jobs: Queue):
collect() #cleanup
desit = tracker.request_item_from_tracker()
print("New task:", desit)

if desit:
if desit.split(":", 1)[0] == "video":
needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
jobs.put(("discovery", desit.split(":", 1)[1], None))
elif desit.split(":", 1)[0] == "channel":
jobs.put(("channel", None, desit.split(":", 1)[1]))
@@ -238,7 +260,7 @@ if HEROKU:
THREADCNT = 20
#now create the rest of the threads
for i in range(THREADCNT):
runthread = Thread(target=threadrunner, args=(jobs,))
runthread = Thread(target=threadrunner)
runthread.start()
threads.append(runthread)
del runthread


Loading…
Cancel
Save