@@ -25,16 +25,54 @@ def timedelta_to_sbv_timestamp(timedelta_timestamp):
return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
from bs4 import BeautifulSoup
from datetime import timedelta
from datetime import timedelta
from json import dumps
from json import dumps
import requests
import requests
# https://docs.python.org/3/library/html.parser.html
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Collect caption segments and metadata from a caption-editor HTML page.

    After ``feed()`` the results are available as:

    * ``captions`` -- list of dicts with ``"startTime"`` and ``"endTime"``
      (integers taken from the ``data-start-ms`` / ``data-end-ms``
      attributes) and ``"text"``.
    * ``title`` -- ``value`` of the ``metadata-title`` input; always a
      ``str`` (empty when the attribute is missing or has no value).
    * ``description`` -- text collected for the ``metadata-description``
      textarea.

    NOTE: ``handle_data`` decides where text belongs by looking at the most
    recently opened start tag, so character data that follows a closing
    ``</textarea>`` (up to the next tag) is appended to that textarea's text
    as well. Callers are expected to trim that trailing text themselves.
    """

    START_TIME_CLASS = "yt-uix-form-input-text event-time-field event-start-time"
    END_TIME_CLASS = "yt-uix-form-input-text event-time-field event-end-time"

    def __init__(self):
        super().__init__()
        self.captions = []
        self.title = ""
        self.description = ""

    def check_attr(self, attrs, attr, value):
        """Return True if *attrs* has a pair whose name is *attr* and whose value is *value*."""
        return any(name == attr and val == value for name, val in attrs)

    def get_attr(self, attrs, attr):
        """Return the value of the first attribute named *attr*.

        Returns ``False`` (not ``None``) when no such attribute exists; this
        sentinel is kept for backward compatibility with existing callers.
        """
        for name, val in attrs:
            if name == attr:
                return val
        return False

    def handle_starttag(self, tag, attrs):
        """Record caption timings and the metadata title from ``<input>`` tags."""
        if tag != "input":
            return
        # Inputs carrying an empty segment id are not treated as captions.
        has_empty_segment_id = ' data-segment-id="" ' in self.get_starttag_text()
        if not has_empty_segment_id and self.check_attr(attrs, "class", self.START_TIME_CLASS):
            # A start-time field opens a new caption; its text and end time
            # are filled in by later events.
            self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""})
        elif not has_empty_segment_id and self.check_attr(attrs, "class", self.END_TIME_CLASS):
            self.captions[-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms"))
        elif self.check_attr(attrs, "id", "metadata-title"):
            # get_attr yields False for a missing attribute and the parser
            # yields None for a bare one; keep ``title`` a string either way.
            self.title = self.get_attr(attrs, "value") or ""

    def handle_data(self, data):
        """Append *data* to the current caption text or to the description.

        The target is chosen from the raw text of the most recently opened
        start tag, which must be a ``<textarea ...>``.
        """
        last_start_tag = self.get_starttag_text()
        if not last_start_tag or not last_start_tag.startswith("<textarea "):
            return
        if 'name="serve_text"' in last_start_tag and 'data-segment-id=""' not in last_start_tag:
            self.captions[-1]["text"] += data
        elif 'id="metadata-description"' in last_start_tag:
            self.description += data
def subprrun(jobs, headers):
def subprrun(jobs, headers):
while not jobs.empty():
while not jobs.empty():
langcode, vid = jobs.get()
langcode, vid = jobs.get()
vid = vid.strip()
print(langcode, vid)
print(langcode, vid)
pparams = (
pparams = (
("v", vid),
("v", vid),
@@ -50,43 +88,47 @@ def subprrun(jobs, headers):
assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."
assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."
soup = BeautifulSoup(page.text, features="html5lib")
inttext = page.text
del page
del page
divs = soup.find_all("div", class_="timed-event-line")
myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
while divs:
item = divs.pop(0)
text = item.find("textarea").text
startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
endms = int(item.find("input", class_="event-end-time")["data-end-ms"])
myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
del item
del text
del startms
del endms
if divs:
myfs.write("\n")
del divs
myfs.close()
del myfs
if soup.find("li", id="captions-editor-nav-metadata")["data-state"] != "locked":
metadata = {}
try:
metadata["title"] = soup.find("input", id="metadata-title")["value"]
except KeyError:
metadata["title"] = ""
metadata["description"] = soup.find("textarea", id="metadata-description").text
open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
del metadata
del soup
if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext: #quick way of checking if this page is worth parsing
parser = MyHTMLParser()
parser.feed(inttext)
captiontext = False
for item in parser.captions:
if item["text"][:-9]:
captiontext = True
if captiontext:
myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
captions = parser.captions
captions.pop(0) #get rid of the fake one
while captions:
item = captions.pop(0)
myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=item["startTime"])) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=item["endTime"])) + "\n" + item["text"][:-9] + "\n")
del item
if captions:
myfs.write("\n")
del captions
myfs.close()
del myfs
del captiontext
if parser.title or parser.description[:-16]:
metadata = {}
metadata["title"] = parser.title
if metadata["title"] == False:
metadata["title"] = ""
metadata["description"] = parser.description[:-16]
open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
del metadata
del inttext
del langcode
del langcode
del vid
del vid
del pparams
del pparams