
Improve threading, try to improve memory efficiency, use Queue

Branch: pull/3/head
tech234a, 3 years ago
commit 438b5555d8
2 changed files with 132 additions and 98 deletions
  1. export.py: +30 -60
  2. worker.py: +102 -38
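
The change both files share is the same pattern: instead of starting one Thread subclass per work item, the items are pushed onto a queue.Queue and a fixed pool of plain Thread workers drains it. A minimal sketch of that pattern, separate from this commit (work_items and process_item are placeholders, not names from the repo):

from queue import Queue
from threading import Thread

jobs = Queue()
for item in work_items:              # e.g. video IDs, or (langcode, vid) tuples
    jobs.put(item)

def worker():
    # same shape as prrun/subprrun in this commit: drain the queue until it is empty
    while not jobs.empty():
        item = jobs.get()
        process_item(item)           # placeholder for the real per-item work
        jobs.task_done()

threads = [Thread(target=worker) for _ in range(50)]   # worker.py starts 50 of these, and 5 for subtitles
for t in threads:
    t.start()
for t in threads:
    t.join()

One caveat with this shape: empty() followed by a blocking get() can hang if another worker takes the last item in between, so get() with a timeout or a sentinel item is the usual guard; here all items are enqueued before the workers start, which limits the exposure.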

export.py (+30, -60)

@@ -25,26 +25,17 @@ def timedelta_to_sbv_timestamp(timedelta_timestamp):
return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)


import requests
from bs4 import BeautifulSoup
from datetime import timedelta
import threading

from os import mkdir

from json import loads, dumps


#HSID, SSID, SID cookies required
cookies = loads(open("config.json").read())
mysession = requests.session()

mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
from json import dumps

class subtitlethread(threading.Thread):
def run(self):
langcode, vid = self.getName().split(";", 1)
import requests

def subprrun(jobs, headers):
while not jobs.empty():
langcode, vid = jobs.get()
print(langcode, vid)
pparams = (
("v", vid),
("lang", langcode),
@@ -55,24 +46,32 @@ class subtitlethread(threading.Thread):
("o", "U")
)

page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
page = requests.get("https://www.youtube.com/timedtext_editor", headers=headers, params=pparams)

assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."

soup = BeautifulSoup(page.text, features="html5lib")
del page

divs = soup.find_all("div", class_="timed-event-line")

outtext = ""
for item in divs:
myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
while divs:
item = divs.pop(0)
text = item.find("textarea").text
startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
endms = int(item.find("input", class_="event-end-time")["data-end-ms"])

outtext += timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n\n"

open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8").write(outtext[:-1])
myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
del item
del text
del startms
del endms
if divs:
myfs.write("\n")
del divs
del myfs

if soup.find("li", id="captions-editor-nav-metadata")["data-state"] != "locked":
metadata = {}
@@ -84,42 +83,13 @@ class subtitlethread(threading.Thread):
metadata["description"] = soup.find("textarea", id="metadata-description").text

open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
del metadata

del soup
del langcode
del vid
del pparams

jobs.task_done()

def getsubs(vid):
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']

threads = []
for langcode in langs:
runthread = subtitlethread(name = langcode+";"+vid)
runthread.start()
threads.append(runthread)

for x in threads:
x.join()

return True

if __name__ == "__main__":
from sys import argv
vidl = argv
vidl.pop(0)
for video in vidl:
try:
mkdir("out")
except:
pass
try:
mkdir("out/"+video)
except:
pass
getsubs(video)
return True
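
For memory, the new export path opens the .sbv file up front and writes each cue as soon as it is parsed (deleting intermediates as it goes), rather than building one large string and writing it at the end. A standalone sketch of that write loop, not part of the commit, using made-up cues in place of the parsed HTML and assuming the timedelta_to_sbv_timestamp helper defined above:

from datetime import timedelta

cues = [(0, 1500, "first line"), (1500, 4000, "second line")]   # hypothetical (start_ms, end_ms, text)

with open("example.sbv", "w", encoding="utf-8") as myfs:
    while cues:
        startms, endms, text = cues.pop(0)
        myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + ","
                   + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
        if cues:
            myfs.write("\n")    # blank line between cues, none after the last

which writes:

0:00:00.000,0:00:01.500
first line

0:00:01.500,0:00:04.000
second line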

worker.py (+102, -38)

@@ -1,25 +1,57 @@
from threading import Thread
import requests
from time import sleep
from os import mkdir
from os.path import isdir
from json import dumps
import threading
from json import dumps, loads

from shutil import make_archive, rmtree

from queue import Queue

from discovery import getmetadata
from export import getsubs
from export import subprrun

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"

class batchthread(threading.Thread):
def run(self):
item = self.getName()
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']

#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []

recvids = set()
recchans = set()
recmixes = set()
recplayl = set()

#HSID, SSID, SID cookies required
cookies = loads(open("config.json").read())

headers = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}
del cookies

def prrun():
while not jobs.empty():
global recvids
global recchans
global recmixes
global recplayl
global ccenabledl

item = jobs.get()

print("Video ID:", str(item).strip())
while True:
@@ -29,7 +61,7 @@ class batchthread(threading.Thread):
except BaseException as e:
print(e)
print("Error in retrieving information, waiting 30 seconds")
raise
#raise
sleep(30)

# Add any discovered videos
@@ -46,19 +78,10 @@ class batchthread(threading.Thread):
open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))

if info[0]: #ccenabled
while True:
gsres = False
try:
gsres = getsubs(str(item).strip())
except BaseException as e:
print(e)
if gsres:
break
else:
print("Error in retrieving subtitles, waiting 30 seconds")
sleep(30)
ccenabledl.append(item)
jobs.task_done()

return True
return True


# Get a worker ID
@@ -81,11 +104,6 @@ while True:
except:
pass

recvids = set()
recchans = set()
recmixes = set()
recplayl = set()

# Get a batch ID
while True:
params = (
@@ -96,33 +114,79 @@ while True:

if batchrequest.status_code == 200:
batchinfo = batchrequest.json()
break
else:
print("Error in retrieving batch assignment, will attempt again in 10 minutes")
sleep(600)
if batchinfo["content"] != "Fail":
break
print("Error in retrieving batch assignment, will attempt again in 10 minutes")
sleep(600)

print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])

# Process the batch
batchcontent = requests.get(batchinfo["content"]).text.split("\n")

threads = []
while batchcontent:
while len(threads) <= 50 and batchcontent:
item = batchcontent.pop(0)
runthread = batchthread(name = item)
runthread.start()
threads.append(runthread)
jobs.put(batchcontent.pop(0))

threads = []

for i in range(50):
runthread = Thread(target=prrun)
runthread.start()
threads.append(runthread)
del runthread

for x in threads:
x.join()
threads.remove(x)
del x

for x in threads:
x.join()
threads.remove(x)
open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
#clear
recvids.clear()
recchans.clear()
recmixes.clear()
recplayl.clear()


subtjobs = Queue()
while ccenabledl:
langcontent = langs.copy()
intvid = ccenabledl.pop(0)

while langcontent:
subtjobs.put((langcontent.pop(0), intvid))
del intvid
del langcontent

subthreads = []

for r in range(5):
subrunthread = Thread(target=subprrun, args=(subtjobs,headers))
subrunthread.start()
subthreads.append(subrunthread)
del subrunthread

for xa in subthreads:
xa.join()
subthreads.remove(xa)
del xa
# while True:
# gsres = False
# try:
# gsres = getsubs(str(item).strip())
# except BaseException as e:
# print(e)
# if gsres:
# break
# else:
# print("Error in retrieving subtitles, waiting 30 seconds")
# sleep(30)

#https://stackoverflow.com/a/11968881

# TODO: put the data somewhere...
# TODO: put the discoveries somewhere...
open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))

make_archive("out", "zip", "out") #check this


