
Improve threading, try to improve memory efficiency, use Queue

pull/3/head
tech234a committed 3 years ago
commit 438b5555d8
2 changed files with 132 additions and 98 deletions
  1. export.py  +30 -60
  2. worker.py  +102 -38
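
This commit replaces the one-thread-per-work-item `threading.Thread` subclasses (`subtitlethread` in export.py, `batchthread` in worker.py) with fixed pools of plain `Thread` workers draining a shared `queue.Queue`, which caps concurrency (50 metadata workers, 5 subtitle workers) and stops job state being smuggled through thread names. A minimal, self-contained sketch of that pattern, with illustrative names; note the commit itself spells the exit condition as `while not jobs.empty()` before a blocking `get()`, whereas this sketch uses `get_nowait()` so a worker cannot block on a drained queue:

from queue import Empty, Queue
from threading import Thread

def process(item):
    print("processing", item)  # stand-in for the real per-item work

def worker(jobs):
    while True:
        try:
            item = jobs.get_nowait()
        except Empty:  # queue drained: let the thread finish
            break
        process(item)
        jobs.task_done()

jobs = Queue()
for item in ("a", "b", "c"):  # enqueue everything before starting workers
    jobs.put(item)

threads = [Thread(target=worker, args=(jobs,)) for _ in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()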

export.py  +30 -60

@@ -25,26 +25,17 @@ def timedelta_to_sbv_timestamp(timedelta_timestamp):
     return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
 
 
-import requests
 from bs4 import BeautifulSoup
 from datetime import timedelta
-import threading
-
-from os import mkdir
-
-from json import loads, dumps
-
-#HSID, SSID, SID cookies required
-cookies = loads(open("config.json").read())
-mysession = requests.session()
-
-mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
-
-class subtitlethread(threading.Thread):
-    def run(self):
-        langcode, vid = self.getName().split(";", 1)
+from json import dumps
+
+import requests
+
+
+def subprrun(jobs, headers):
+    while not jobs.empty():
+        langcode, vid = jobs.get()
+        print(langcode, vid)
         pparams = (
             ("v", vid),
             ("lang", langcode),
@@ -55,24 +46,32 @@ class subtitlethread(threading.Thread):
             ("o", "U")
         )
 
-        page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
+        page = requests.get("https://www.youtube.com/timedtext_editor", headers=headers, params=pparams)
 
         assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."
 
         soup = BeautifulSoup(page.text, features="html5lib")
+        del page
 
         divs = soup.find_all("div", class_="timed-event-line")
 
-        outtext = ""
-        for item in divs:
+        myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
+        while divs:
+            item = divs.pop(0)
             text = item.find("textarea").text
             startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
             endms = int(item.find("input", class_="event-end-time")["data-end-ms"])
 
-            outtext += timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n\n"
-
-        open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8").write(outtext[:-1])
+            myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
+            del item
+            del text
+            del startms
+            del endms
+            if divs:
+                myfs.write("\n")
+        del divs
+        del myfs
 
         if soup.find("li", id="captions-editor-nav-metadata")["data-state"] != "locked":
             metadata = {}
@@ -84,42 +83,13 @@ class subtitlethread(threading.Thread):
             metadata["description"] = soup.find("textarea", id="metadata-description").text
 
             open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
+            del metadata
 
-def getsubs(vid):
-    langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
-    'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
-    'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
-    'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
-    'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
-    'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
-    'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
-    'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
-    'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
-    'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
-    'xh', 'yi', 'yo', 'zu']
-
-    threads = []
-    for langcode in langs:
-        runthread = subtitlethread(name = langcode+";"+vid)
-        runthread.start()
-        threads.append(runthread)
-
-    for x in threads:
-        x.join()
-
-    return True
-
-if __name__ == "__main__":
-    from sys import argv
-    vidl = argv
-    vidl.pop(0)
-    for video in vidl:
-        try:
-            mkdir("out")
-        except:
-            pass
-        try:
-            mkdir("out/"+video)
-        except:
-            pass
-        getsubs(video)
+        del soup
+        del langcode
+        del vid
+        del pparams
+
+        jobs.task_done()
+
+    return True
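
The rewritten export.py drops its own session, thread class, and CLI entry point: `subprrun` now takes a queue of `(langcode, video_id)` tuples plus a prebuilt cookie-header dict, and it streams each caption line to the open `.sbv` file handle as it is parsed instead of accumulating the whole transcript in `outtext` first. A rough usage sketch mirroring how worker.py drives it below; the cookie values and video ID are placeholders:

from queue import Queue
from threading import Thread

from export import subprrun

# Normally built from the HSID/SSID/SID cookies in config.json.
headers = {"cookie": "HSID=...; SSID=...; SID=...", "Accept-Language": "en-US"}

subtjobs = Queue()
for langcode in ("en", "fr", "de"):          # a subset of the full langs list
    subtjobs.put((langcode, "videoidhere"))  # hypothetical video ID

# subprrun writes into out/<video_id>/, which worker.py creates beforehand.
subthreads = [Thread(target=subprrun, args=(subtjobs, headers)) for _ in range(5)]
for t in subthreads:
    t.start()
for t in subthreads:
    t.join()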

worker.py  +102 -38

@@ -1,25 +1,57 @@
+from threading import Thread
 import requests
 from time import sleep
 from os import mkdir
 from os.path import isdir
-from json import dumps
-import threading
+from json import dumps, loads
 
 from shutil import make_archive, rmtree
 
+from queue import Queue
+
 from discovery import getmetadata
-from export import getsubs
+from export import subprrun
 
 WORKER_VERSION = 1
 SERVER_BASE_URL = "http://localhost:5000"
 
-class batchthread(threading.Thread):
-    def run(self):
-        item = self.getName()
+langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
+'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
+'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
+'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
+'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
+'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
+'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
+'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
+'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
+'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
+'xh', 'yi', 'yo', 'zu']
+
+#useful Queue example: https://stackoverflow.com/a/54658363
+jobs = Queue()
+
+ccenabledl = []
+
+recvids = set()
+recchans = set()
+recmixes = set()
+recplayl = set()
+
+#HSID, SSID, SID cookies required
+cookies = loads(open("config.json").read())
+
+headers = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}
+del cookies
+
+def prrun():
+    while not jobs.empty():
         global recvids
         global recchans
         global recmixes
         global recplayl
+        global ccenabledl
 
+        item = jobs.get()
 
         print("Video ID:", str(item).strip())
         while True:
@@ -29,7 +61,7 @@ class batchthread(threading.Thread):
         except BaseException as e:
             print(e)
             print("Error in retrieving information, waiting 30 seconds")
-            raise
+            #raise
             sleep(30)
 
         # Add any discovered videos
@@ -46,19 +78,10 @@ class batchthread(threading.Thread):
         open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
 
         if info[0]: #ccenabled
-            while True:
-                gsres = False
-                try:
-                    gsres = getsubs(str(item).strip())
-                except BaseException as e:
-                    print(e)
-                if gsres:
-                    break
-                else:
-                    print("Error in retrieving subtitles, waiting 30 seconds")
-                    sleep(30)
+            ccenabledl.append(item)
+
+        jobs.task_done()
 
     return True
 
 
 # Get a worker ID
@@ -81,11 +104,6 @@ while True:
     except:
         pass
 
-    recvids = set()
-    recchans = set()
-    recmixes = set()
-    recplayl = set()
-
    # Get a batch ID
    while True:
        params = (
@@ -96,33 +114,79 @@ while True:
 
        if batchrequest.status_code == 200:
            batchinfo = batchrequest.json()
-            break
-        else:
-            print("Error in retrieving batch assignment, will attempt again in 10 minutes")
-            sleep(600)
+            if batchinfo["content"] != "Fail":
+                break
+        print("Error in retrieving batch assignment, will attempt again in 10 minutes")
+        sleep(600)
 
    print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
 
    # Process the batch
    batchcontent = requests.get(batchinfo["content"]).text.split("\n")
 
-    threads = []
    while batchcontent:
-        while len(threads) <= 50 and batchcontent:
-            item = batchcontent.pop(0)
-            runthread = batchthread(name = item)
-            runthread.start()
-            threads.append(runthread)
-
-        for x in threads:
-            x.join()
-            threads.remove(x)
+        jobs.put(batchcontent.pop(0))
+
+    threads = []
+
+    for i in range(50):
+        runthread = Thread(target=prrun)
+        runthread.start()
+        threads.append(runthread)
+        del runthread
+
+    for x in threads:
+        x.join()
+        threads.remove(x)
+        del x
+
+    open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
+    #clear
+    recvids.clear()
+    recchans.clear()
+    recmixes.clear()
+    recplayl.clear()
+
+    subtjobs = Queue()
+    while ccenabledl:
+        langcontent = langs.copy()
+        intvid = ccenabledl.pop(0)
+
+        while langcontent:
+            subtjobs.put((langcontent.pop(0), intvid))
+        del intvid
+        del langcontent
+
+    subthreads = []
+
+    for r in range(5):
+        subrunthread = Thread(target=subprrun, args=(subtjobs,headers))
+        subrunthread.start()
+        subthreads.append(subrunthread)
+        del subrunthread
+
+    for xa in subthreads:
+        xa.join()
+        subthreads.remove(xa)
+        del xa
+    # while True:
+    #     gsres = False
+    #     try:
+    #         gsres = getsubs(str(item).strip())
+    #     except BaseException as e:
+    #         print(e)
+    #     if gsres:
+    #         break
+    #     else:
+    #         print("Error in retrieving subtitles, waiting 30 seconds")
+    #         sleep(30)
 
    #https://stackoverflow.com/a/11968881
 
    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...
-    open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
 
    make_archive("out", "zip", "out") #check this
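
One design note on the shutdown logic: `empty()` and a blocking `get()` are not atomic, so with many workers a thread can see a non-empty queue, lose the last item to a sibling, and then block in `get()` forever. worker.py largely sidesteps this by enqueuing every job before starting any thread, but a sentinel-based shutdown is the more robust variant of the same pattern. A sketch with illustrative names, not code from this repo:

from queue import Queue
from threading import Thread

NUM_WORKERS = 5
SENTINEL = None  # one per worker marks the end of the stream

def worker(jobs):
    while True:
        item = jobs.get()
        if item is SENTINEL:  # clean shutdown, no empty()/get() race
            jobs.task_done()
            break
        print("processing", item)  # stand-in for the real per-item work
        jobs.task_done()

jobs = Queue()
for item in range(20):
    jobs.put(item)
for _ in range(NUM_WORKERS):
    jobs.put(SENTINEL)

threads = [Thread(target=worker, args=(jobs,)) for _ in range(NUM_WORKERS)]
for t in threads:
    t.start()
jobs.join()  # returns once every put() has been matched by task_done()
for t in threads:
    t.join()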



