瀏覽代碼

Significant optimization

pull/3/head
tech234a 3 年之前
父節點
當前提交
7fd09ed2ae
共有 1 個檔案被更改,包括 38 行新增33 行删除
  1. +38
    -33
      export.py

+ 38
- 33
export.py 查看文件

@@ -88,41 +88,46 @@ def subprrun(jobs, headers):


assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information." assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."


parser = MyHTMLParser()
parser.feed(page.text)
inttext = page.text
del page del page


captiontext = False
for item in parser.captions:
if item["text"][:-9]:
captiontext = True

if captiontext:
myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
captions = parser.captions
captions.pop(0) #get rid of the fake one
while captions:
item = captions.pop(0)

myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=item["startTime"])) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=item["endTime"])) + "\n" + item["text"][:-9] + "\n")
del item
if captions:
myfs.write("\n")
del captions
myfs.close()
del myfs

del captiontext

if parser.title or parser.description[:-16]:
metadata = {}
metadata["title"] = parser.title
if metadata["title"] == False:
metadata["title"] = ""
metadata["description"] = parser.description[:-16]
open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
del metadata
if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext: #quick way of checking if this page is worth parsing
parser = MyHTMLParser()
parser.feed(inttext)

captiontext = False
for item in parser.captions:
if item["text"][:-9]:
captiontext = True

if captiontext:
myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
captions = parser.captions
captions.pop(0) #get rid of the fake one
while captions:
item = captions.pop(0)

myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=item["startTime"])) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=item["endTime"])) + "\n" + item["text"][:-9] + "\n")
del item
if captions:
myfs.write("\n")
del captions
myfs.close()
del myfs

del captiontext

if parser.title or parser.description[:-16]:
metadata = {}
metadata["title"] = parser.title
if metadata["title"] == False:
metadata["title"] = ""
metadata["description"] = parser.description[:-16]
open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
del metadata

del inttext


del langcode del langcode
del vid del vid


Loading…
取消
儲存