
Improve threading, try to improve memory efficiency, use Queue

Branch: pull/3/head
tech234a, 3 years ago
commit 438b5555d8
2 changed files with 132 additions and 98 deletions
  1. export.py: +30 -60
  2. worker.py: +102 -38
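
The change both files share is the same pattern: instead of starting one Thread subclass per work item, the items are pushed onto a queue.Queue and a fixed pool of plain Thread workers drains it. A minimal sketch of that pattern, separate from this commit (work_items and process_item are placeholders, not names from the repo):

from queue import Queue
from threading import Thread

jobs = Queue()
for item in work_items:              # e.g. video IDs, or (langcode, vid) tuples
    jobs.put(item)

def worker():
    # same shape as prrun/subprrun in this commit: drain the queue until it is empty
    while not jobs.empty():
        item = jobs.get()
        process_item(item)           # placeholder for the real per-item work
        jobs.task_done()

threads = [Thread(target=worker) for _ in range(50)]   # worker.py starts 50 of these, and 5 for subtitles
for t in threads:
    t.start()
for t in threads:
    t.join()

One caveat with this shape: empty() followed by a blocking get() can hang if another worker takes the last item in between, so get() with a timeout or a sentinel item is the usual guard; here all items are enqueued before the workers start, which limits the exposure.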

export.py (+30, -60)

@@ -25,26 +25,17 @@ def timedelta_to_sbv_timestamp(timedelta_timestamp):
return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)


import requests
from bs4 import BeautifulSoup
from datetime import timedelta
import threading

from os import mkdir

from json import loads, dumps


#HSID, SSID, SID cookies required
cookies = loads(open("config.json").read())
mysession = requests.session()

mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
from json import dumps

class subtitlethread(threading.Thread):
def run(self):
langcode, vid = self.getName().split(";", 1)
import requests

def subprrun(jobs, headers):
while not jobs.empty():
langcode, vid = jobs.get()
print(langcode, vid)
pparams = (
("v", vid),
("lang", langcode),
@@ -55,24 +46,32 @@ class subtitlethread(threading.Thread):
("o", "U")
)

page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
page = requests.get("https://www.youtube.com/timedtext_editor", headers=headers, params=pparams)

assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."

soup = BeautifulSoup(page.text, features="html5lib")
del page

divs = soup.find_all("div", class_="timed-event-line")

outtext = ""
for item in divs:
myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
while divs:
item = divs.pop(0)
text = item.find("textarea").text
startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
endms = int(item.find("input", class_="event-end-time")["data-end-ms"])

outtext += timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n\n"

open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8").write(outtext[:-1])
myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
del item
del text
del startms
del endms
if divs:
myfs.write("\n")
del divs
del myfs

if soup.find("li", id="captions-editor-nav-metadata")["data-state"] != "locked":
metadata = {}
@@ -84,42 +83,13 @@ class subtitlethread(threading.Thread):
metadata["description"] = soup.find("textarea", id="metadata-description").text

open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
del metadata

del soup
del langcode
del vid
del pparams

jobs.task_done()

def getsubs(vid):
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']

threads = []
for langcode in langs:
runthread = subtitlethread(name = langcode+";"+vid)
runthread.start()
threads.append(runthread)

for x in threads:
x.join()

return True

if __name__ == "__main__":
from sys import argv
vidl = argv
vidl.pop(0)
for video in vidl:
try:
mkdir("out")
except:
pass
try:
mkdir("out/"+video)
except:
pass
getsubs(video)
return True
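
For memory, the new export path opens the .sbv file up front and writes each cue as soon as it is parsed (deleting intermediates as it goes), rather than building one large string and writing it at the end. A standalone sketch of that write loop, not part of the commit, using made-up cues in place of the parsed HTML and assuming the timedelta_to_sbv_timestamp helper defined above:

from datetime import timedelta

cues = [(0, 1500, "first line"), (1500, 4000, "second line")]   # hypothetical (start_ms, end_ms, text)

with open("example.sbv", "w", encoding="utf-8") as myfs:
    while cues:
        startms, endms, text = cues.pop(0)
        myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + ","
                   + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
        if cues:
            myfs.write("\n")    # blank line between cues, none after the last

which writes:

0:00:00.000,0:00:01.500
first line

0:00:01.500,0:00:04.000
second line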

worker.py (+102, -38)

@@ -1,25 +1,57 @@
from threading import Thread
import requests
from time import sleep
from os import mkdir
from os.path import isdir
from json import dumps
import threading
from json import dumps, loads

from shutil import make_archive, rmtree

from queue import Queue

from discovery import getmetadata
from export import getsubs
from export import subprrun

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"

class batchthread(threading.Thread):
def run(self):
item = self.getName()
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']

#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []

recvids = set()
recchans = set()
recmixes = set()
recplayl = set()

#HSID, SSID, SID cookies required
cookies = loads(open("config.json").read())

headers = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}
del cookies

def prrun():
while not jobs.empty():
global recvids
global recchans
global recmixes
global recplayl
global ccenabledl

item = jobs.get()

print("Video ID:", str(item).strip())
while True:
@@ -29,7 +61,7 @@ class batchthread(threading.Thread):
except BaseException as e:
print(e)
print("Error in retrieving information, waiting 30 seconds")
raise
#raise
sleep(30)

# Add any discovered videos
@@ -46,19 +78,10 @@ class batchthread(threading.Thread):
open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))

if info[0]: #ccenabled
while True:
gsres = False
try:
gsres = getsubs(str(item).strip())
except BaseException as e:
print(e)
if gsres:
break
else:
print("Error in retrieving subtitles, waiting 30 seconds")
sleep(30)
ccenabledl.append(item)
jobs.task_done()

return True
return True


# Get a worker ID
@@ -81,11 +104,6 @@ while True:
except:
pass

recvids = set()
recchans = set()
recmixes = set()
recplayl = set()

# Get a batch ID
while True:
params = (
@@ -96,33 +114,79 @@ while True:

if batchrequest.status_code == 200:
batchinfo = batchrequest.json()
break
else:
print("Error in retrieving batch assignment, will attempt again in 10 minutes")
sleep(600)
if batchinfo["content"] != "Fail":
break
print("Error in retrieving batch assignment, will attempt again in 10 minutes")
sleep(600)

print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])

# Process the batch
batchcontent = requests.get(batchinfo["content"]).text.split("\n")

threads = []
while batchcontent:
while len(threads) <= 50 and batchcontent:
item = batchcontent.pop(0)
runthread = batchthread(name = item)
runthread.start()
threads.append(runthread)
jobs.put(batchcontent.pop(0))

threads = []

for i in range(50):
runthread = Thread(target=prrun)
runthread.start()
threads.append(runthread)
del runthread

for x in threads:
x.join()
threads.remove(x)
del x

for x in threads:
x.join()
threads.remove(x)
open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
#clear
recvids.clear()
recchans.clear()
recmixes.clear()
recplayl.clear()


subtjobs = Queue()
while ccenabledl:
langcontent = langs.copy()
intvid = ccenabledl.pop(0)

while langcontent:
subtjobs.put((langcontent.pop(0), intvid))
del intvid
del langcontent

subthreads = []

for r in range(5):
subrunthread = Thread(target=subprrun, args=(subtjobs,headers))
subrunthread.start()
subthreads.append(subrunthread)
del subrunthread

for xa in subthreads:
xa.join()
subthreads.remove(xa)
del xa
# while True:
# gsres = False
# try:
# gsres = getsubs(str(item).strip())
# except BaseException as e:
# print(e)
# if gsres:
# break
# else:
# print("Error in retrieving subtitles, waiting 30 seconds")
# sleep(30)

#https://stackoverflow.com/a/11968881

# TODO: put the data somewhere...
# TODO: put the discoveries somewhere...
open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))

make_archive("out", "zip", "out") #check this


