Ver a proveniência

Remove empty dirs, don't create meaningless files, comment cleanup

pull/3/head
tech234a há 3 anos
ascendente
cometimento
9c0629e99f
2 ficheiros alterados com 20 adições e 28 eliminações
  1. +11
    -15
      export.py
  2. +9
    -13
      worker.py

+ 11
- 15
export.py Ver ficheiro

@@ -38,7 +38,6 @@ class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.captions = []
self.captiontext = True
self.title = ""
self.description = ""


@@ -60,25 +59,13 @@ class MyHTMLParser(HTMLParser):
self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""})
elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time") and not ' data-segment-id="" ' in self.get_starttag_text():
self.captions[len(self.captions)-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms"))
# elif tag == "textarea" and self.check_attr(attrs, "class", "yt-uix-form-input-textarea event-text goog-textarea"):
# if len(self.captions):
# self.datatarget = len(self.captions)-1
# else:
# self.datatarget = 0
elif tag == "input" and self.check_attr(attrs, "id", "metadata-title"):
self.title = self.get_attr(attrs, "value")
# elif tag == "textarea" and self.check_attr(attrs, "id", "metadata-description"):
# self.datatarget = "description"

# def handle_endtag(self, tag):
# if tag == "textarea":
# self.datatarget = None


def handle_data(self, data):
if self.get_starttag_text() and self.get_starttag_text().startswith("<textarea "):
if 'name="serve_text"' in self.get_starttag_text() and not 'data-segment-id=""' in self.get_starttag_text():
self.captions[len(self.captions)-1]["text"] += data
self.captiontext = True
elif 'id="metadata-description"' in self.get_starttag_text():
self.description += data


@@ -105,7 +92,12 @@ def subprrun(jobs, headers):
parser.feed(page.text)
del page


if parser.captiontext:
captiontext = False
for item in parser.captions:
if item["text"][:-9]:
captiontext = True

if captiontext:
myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
captions = parser.captions
captions.pop(0) #get rid of the fake one
@@ -121,9 +113,13 @@ def subprrun(jobs, headers):
myfs.close()
del myfs


if parser.title or parser.description:
del captiontext

if parser.title or parser.description[:-16]:
metadata = {}
metadata["title"] = parser.title
if metadata["title"] == False:
metadata["title"] = ""
metadata["description"] = parser.description[:-16]
open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
del metadata


+ 9
- 13
worker.py Ver ficheiro

@@ -1,7 +1,7 @@
from threading import Thread
import requests
from time import sleep
from os import mkdir
from os import mkdir, rmdir, listdir
from os.path import isdir
from json import dumps, loads


@@ -172,18 +172,14 @@ while True:
subthreads.remove(xa)
del xa


sleep(1)
# while True:
# gsres = False
# try:
# gsres = getsubs(str(item).strip())
# except BaseException as e:
# print(e)
# if gsres:
# break
# else:
# print("Error in retrieving subtitles, waiting 30 seconds")
# sleep(30)
sleep(1) #wait a second to hopefully allow the other threads to finish

for fol in listdir("out"): #remove extra folders
try:
if isdir("out/"+fol):
rmdir("out/"+fol)
except:
pass


#https://stackoverflow.com/a/11968881




Carregando…
Cancelar
Guardar