diff --git a/export.py b/export.py
index 5927541..83cb543 100644
--- a/export.py
+++ b/export.py
@@ -26,12 +26,47 @@ def timedelta_to_sbv_timestamp(timedelta_timestamp):
 
 
 from bs4 import BeautifulSoup
+import html.parser
 from datetime import timedelta
 
 from json import dumps
 
 import requests
 
+# https://docs.python.org/3/library/html.parser.html
+from html.parser import HTMLParser
+
+class MyHTMLParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.captions = []
+
+    def check_attr(self, attrs, attr, value):
+        for item in attrs:
+            if item[0] == attr and item[1] == value:
+                return True
+        return False
+
+    def get_attr(self, attrs, attr):
+        for item in attrs:
+            if item[0] == attr:
+                return item[1]
+        return False
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time"):
+            self.captions.append({"startTime": self.get_attr(attrs, "data-start-ms")})
+        elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time"):
+            self.captions[len(self.captions)-1]["endTime"] = self.get_attr(attrs, "data-end-ms")
+        elif tag == "textarea" and self.check_attr(attrs, "class", "yt-uix-form-input-textarea event-text goog-textarea"):
+            pass #do this
+
+    #def handle_endtag(self, tag):
+    #    print("Encountered an end tag :", tag)
+
+    def handle_data(self, data):
+        print("Encountered some data :", data)
+
 def subprrun(jobs, headers):
     while not jobs.empty():
         langcode, vid = jobs.get()
@@ -50,8 +85,10 @@ def subprrun(jobs, headers):
 
         assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."
 
-        soup = BeautifulSoup(page.text, features="html5lib")
-        del page
+        parser = MyHTMLParser()
+        parser.feed(page.text)
+        #soup = BeautifulSoup(page.text, features="html5lib")
+        #del page
 
         divs = soup.find_all("div", class_="timed-event-line")
 
@@ -64,6 +101,8 @@ def subprrun(jobs, headers):
 
                 myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
 
+                #text.decompose()
+                item.decompose()
                 del item
                 del text
                 del startms
@@ -86,6 +125,7 @@ def subprrun(jobs, headers):
 
             open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
             del metadata
+            soup.decompose()
             del soup
         del langcode
         del vid
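
Note on the work-in-progress branches in the diff: the textarea case is still "pass #do this" and handle_data only prints debug output. Below is a minimal sketch of one way the parser could be finished, assuming the caption text is the character data between the event-text <textarea> start and end tags. The CaptionParser name, the in_text flag, and the "text" key are illustrative only and not part of the original code.

# Sketch only: one possible completion of MyHTMLParser, not the project's code.
from html.parser import HTMLParser

class CaptionParser(HTMLParser):
    def __init__(self):
        super().__init__()       # required so HTMLParser sets up its internal state
        self.captions = []       # list of {"startTime", "endTime", "text"} dicts
        self.in_text = False     # True while inside a caption <textarea>

    def check_attr(self, attrs, attr, value):
        # attrs is the list of (name, value) tuples passed to handle_starttag
        return any(name == attr and val == value for name, val in attrs)

    def get_attr(self, attrs, attr):
        for name, val in attrs:
            if name == attr:
                return val
        return None

    def handle_starttag(self, tag, attrs):
        if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time"):
            # a start-time <input> opens a new caption entry
            self.captions.append({"startTime": self.get_attr(attrs, "data-start-ms"), "text": ""})
        elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time"):
            self.captions[-1]["endTime"] = self.get_attr(attrs, "data-end-ms")
        elif tag == "textarea" and self.check_attr(attrs, "class", "yt-uix-form-input-textarea event-text goog-textarea"):
            self.in_text = True

    def handle_endtag(self, tag):
        if tag == "textarea":
            self.in_text = False

    def handle_data(self, data):
        # character data inside the caption textarea is the caption text
        if self.in_text:
            self.captions[-1]["text"] += data

With something like parser = CaptionParser(); parser.feed(page.text), parser.captions would then hold start/end times (as millisecond strings taken from data-start-ms / data-end-ms) plus the caption text, which the existing sbv-writing loop could read in place of the BeautifulSoup divs.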