From 9d3b4e98562e4aa2aabff88370929ac4e9986138 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Wed, 23 Sep 2020 18:47:38 -0400 Subject: [PATCH 01/13] WIP microtasks --- worker.py | 144 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 51 deletions(-) diff --git a/worker.py b/worker.py index a134c95..08b0915 100644 --- a/worker.py +++ b/worker.py @@ -20,47 +20,16 @@ from gc import collect from discovery import getmetadata from export import subprrun -batchcontent = [] -actualitems = [] +#useful Queue example: https://stackoverflow.com/a/54658363 +jobs = Queue() + +langcnt = {} + HEROKU = False if isfile("../Procfile"): HEROKU = True -def batchfunc(): - ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False}) - - if not HEROKU: - desqsize = 51 - elif HEROKU: - desqsize = 251 - - while jobs.qsize() < desqsize: - desit = tracker.request_item_from_tracker() - if desit: - if desit.split(":", 1)[0] == "video": - jobs.put(desit.split(":", 1)[1]) - elif desit.split(":", 1)[0] == "channel": - y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False) - for itemyv in y["entries"]: - tracker.add_item_to_tracker(tracker.ItemType.Video, itemyv["id"]) - elif desit.split(":", 1)[0] == "playlist": - y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False) - for itemyvp in y["entries"]: - tracker.add_item_to_tracker(tracker.ItemType.Video, itemyvp["id"]) - else: - print("Ignoring item for now", desit) - else: - print("Ignoring item for now", desit) - - batchcontent.append(desit.split(":", 1)[1]) - actualitems.append(desit) - -def submitfunc(submitqueue): - while not submitqueue.empty(): - itype, ival = submitqueue.get() - tracker.add_item_to_tracker(itype, ival) - langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br', 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl', 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE', @@ -73,16 +42,6 @@ langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo', 'xh', 'yi', 'yo', 'zu'] -#useful Queue example: https://stackoverflow.com/a/54658363 -jobs = Queue() - -ccenabledl = [] - -recvids = set() -recchans = set() -recmixes = set() -recplayl = set() - #HSID, SSID, SID cookies required if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys(): cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]} @@ -128,6 +87,93 @@ class GracefulKiller: gkiller = GracefulKiller() +#minitasks +def threadrunner(jobs: Queue): + global langcnt + ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False}) + while not gkiller.kill_now: + if not jobs.empty(): + task, vid, args = jobs.get() + + if task == "submitdiscovery": + tracker.add_item_to_tracker(args, vid) + elif task == "discovery": + pass + elif task == "subtitles": + pass + elif task == "channel": + y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False) + for itemyv in y["entries"]: + jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video)) + jobs.put(("complete", None, "channel:"+args)) + elif task == "playlist": + y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False) + for itemyvp in y["entries"]: + jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video)) + jobs.put(("complete", None, "playlist:"+args)) + elif task == "complete": + size = 0 + if ":" in args: + if args.split(":", 1)[0] == "video": + if isfile("directory/"+args.split(":", 1)[1]+".zip"): + size = getsize("directory/"+args.split(":", 1)[1]+".zip") + tracker.mark_item_as_done(args, size) + else: + # get a new task from tracker + desit = tracker.request_item_from_tracker() + if desit: + if desit.split(":", 1)[0] == "video": + jobs.put(("discovery", desit.split(":", 1)[1], None)) + elif desit.split(":", 1)[0] == "channel": + jobs.put(("channel", None, desit.split(":", 1)[1])) + elif desit.split(":", 1)[0] == "playlist": + jobs.put(("playlist", None, desit.split(":", 1)[1])) + else: + print("Ignoring item for now", desit) + else: + print("Ignoring item for now", desit) + + batchcontent.append(desit.split(":", 1)[1]) + actualitems.append(desit) + + + + +batchcontent = [] +actualitems = [] + + +def batchfunc(): + + + if not HEROKU: + desqsize = 51 + elif HEROKU: + desqsize = 251 + + while jobs.qsize() < desqsize: + + +def submitfunc(submitqueue): + while not submitqueue.empty(): + itype, ival = submitqueue.get() + tracker.add_item_to_tracker(itype, ival) + + + + + +ccenabledl = [] + +recvids = set() +recchans = set() +recmixes = set() +recplayl = set() + + + + + def prrun(): while not jobs.empty(): global recvids @@ -314,11 +360,7 @@ while not gkiller.kill_now: # Report the batch as complete for itemb in actualitems: - size = 0 - if ":" in itemb: - if itemb.split(":", 1)[0] == "video": - if isfile("directory/"+itemb.split(":", 1)[1]+".zip"): - size = getsize("directory/"+itemb.split(":", 1)[1]+".zip") + tracker.mark_item_as_done(itemb, size) # clear the output directories From fb4b423da25103a0f8fd7f40d779d989b4e3cb06 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Wed, 23 Sep 2020 20:26:38 -0400 Subject: [PATCH 02/13] More WIP --- export.py | 374 +++++++++++++++++++++++++++--------------------------- worker.py | 45 +++++-- 2 files changed, 216 insertions(+), 203 deletions(-) diff --git a/export.py b/export.py index 309f010..76053dd 100644 --- a/export.py +++ b/export.py @@ -79,206 +79,202 @@ class MyHTMLParser(HTMLParser): elif self.get_starttag_text() and self.get_starttag_text().startswith('