
Merge pull request #3 from Data-Horde/html.parse

Html.parse
pull/4/head
tech234a 3 years ago
committed by GitHub
parent
commit 663d3a7d89
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 89 additions and 49 deletions:

  1. export.py  +78 -36
  2. worker.py  +11 -13

export.py  +78 -36  View File

@@ -25,16 +25,54 @@ def timedelta_to_sbv_timestamp(timedelta_timestamp):
     return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)


-from bs4 import BeautifulSoup
 from datetime import timedelta

 from json import dumps

 import requests

+# https://docs.python.org/3/library/html.parser.html
+from html.parser import HTMLParser
+
+class MyHTMLParser(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.captions = []
+        self.title = ""
+        self.description = ""
+
+    def check_attr(self, attrs, attr, value):
+        for item in attrs:
+            if item[0] == attr and item[1] == value:
+                return True
+        return False
+
+    def get_attr(self, attrs, attr):
+        for item in attrs:
+            if item[0] == attr:
+                return item[1]
+        return False
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time") and not ' data-segment-id="" ' in self.get_starttag_text():
+            self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""})
+        elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time") and not ' data-segment-id="" ' in self.get_starttag_text():
+            self.captions[len(self.captions)-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms"))
+        elif tag == "input" and self.check_attr(attrs, "id", "metadata-title"):
+            self.title = self.get_attr(attrs, "value")
+
+    def handle_data(self, data):
+        if self.get_starttag_text() and self.get_starttag_text().startswith("<textarea "):
+            if 'name="serve_text"' in self.get_starttag_text() and not 'data-segment-id=""' in self.get_starttag_text():
+                self.captions[len(self.captions)-1]["text"] += data
+            elif 'id="metadata-description"' in self.get_starttag_text():
+                self.description += data
+
 def subprrun(jobs, headers):
     while not jobs.empty():
         langcode, vid = jobs.get()
+        vid = vid.strip()
         print(langcode, vid)
         pparams = (
             ("v", vid),
@@ -50,43 +88,47 @@ def subprrun(jobs, headers):

         assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."

-        soup = BeautifulSoup(page.text, features="html5lib")
+        inttext = page.text
         del page

-        divs = soup.find_all("div", class_="timed-event-line")
-
-        myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
-        while divs:
-            item = divs.pop(0)
-            text = item.find("textarea").text
-            startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
-            endms = int(item.find("input", class_="event-end-time")["data-end-ms"])
-
-            myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
-            del item
-            del text
-            del startms
-            del endms
-            if divs:
-                myfs.write("\n")
-        del divs
-        myfs.close()
-        del myfs
-
-        if soup.find("li", id="captions-editor-nav-metadata")["data-state"] != "locked":
-            metadata = {}
-
-            try:
-                metadata["title"] = soup.find("input", id="metadata-title")["value"]
-            except KeyError:
-                metadata["title"] = ""
-            metadata["description"] = soup.find("textarea", id="metadata-description").text
-
-            open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
-            del metadata
-
-        del soup
+        if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext: #quick way of checking if this page is worth parsing
+            parser = MyHTMLParser()
+            parser.feed(inttext)
+
+            captiontext = False
+            for item in parser.captions:
+                if item["text"][:-9]:
+                    captiontext = True
+
+            if captiontext:
+                myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
+                captions = parser.captions
+                captions.pop(0) #get rid of the fake one
+                while captions:
+                    item = captions.pop(0)
+
+                    myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=item["startTime"])) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=item["endTime"])) + "\n" + item["text"][:-9] + "\n")
+                    del item
+                    if captions:
+                        myfs.write("\n")
+                del captions
+                myfs.close()
+                del myfs
+
+            del captiontext
+
+            if parser.title or parser.description[:-16]:
+                metadata = {}
+                metadata["title"] = parser.title
+                if metadata["title"] == False:
+                    metadata["title"] = ""
+                metadata["description"] = parser.description[:-16]
+                open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
+                del metadata
+
+        del inttext

         del langcode
         del vid
         del pparams
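
(Not part of the commit: for reference, the write loop above emits standard SBV caption blocks. A worked example of the "H:MM:SS.mmm" timestamps produced by timedelta_to_sbv_timestamp, again assuming export.py is importable; the millisecond values are hypothetical.)

from datetime import timedelta
from export import timedelta_to_sbv_timestamp  # assumption: export.py is importable

print(timedelta_to_sbv_timestamp(timedelta(milliseconds=83000)))  # 0:01:23.000
print(timedelta_to_sbv_timestamp(timedelta(milliseconds=85500)))  # 0:01:25.500

# The resulting block in the .sbv file:
# 0:01:23.000,0:01:25.500
# Hello, world!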


worker.py  +11 -13  View File

@@ -1,7 +1,7 @@
 from threading import Thread
 import requests
 from time import sleep
-from os import mkdir
+from os import mkdir, rmdir, listdir
 from os.path import isdir
 from json import dumps, loads

@@ -168,20 +168,18 @@ while True:
     del subrunthread

 for xa in subthreads:
-    xa.join()
+    xa.join() #bug (occurred once: the script ended before the last thread finished)
     subthreads.remove(xa)
     del xa
-# while True:
-#     gsres = False
-#     try:
-#         gsres = getsubs(str(item).strip())
-#     except BaseException as e:
-#         print(e)
-#     if gsres:
-#         break
-#     else:
-#         print("Error in retrieving subtitles, waiting 30 seconds")
-#         sleep(30)
+
+sleep(1) #wait a second to hopefully allow the other threads to finish
+
+for fol in listdir("out"): #remove extra folders
+    try:
+        if isdir("out/"+fol):
+            rmdir("out/"+fol)
+    except:
+        pass

 #https://stackoverflow.com/a/11968881
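
(Not part of the commit: os.rmdir only removes empty directories, which is what makes the try/except sweep above a safe filter: folders that actually received subtitle or metadata files raise OSError and are kept. A minimal sketch of the same pattern, with hypothetical paths.)

from os import makedirs, rmdir, listdir
from os.path import isdir

makedirs("out/empty_vid", exist_ok=True)  # a job that produced no files

for fol in listdir("out"):  # same sweep as in worker.py
    try:
        if isdir("out/" + fol):
            rmdir("out/" + fol)  # raises OSError if the folder is non-empty
    except OSError:
        pass  # non-empty folders (real output) survive

print(listdir("out"))  # 'empty_vid' is gone; populated folders remain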



