diff --git a/export.py b/export.py index 5927541..612c90d 100644 --- a/export.py +++ b/export.py @@ -25,16 +25,54 @@ def timedelta_to_sbv_timestamp(timedelta_timestamp): return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs) -from bs4 import BeautifulSoup from datetime import timedelta from json import dumps import requests +# https://docs.python.org/3/library/html.parser.html +from html.parser import HTMLParser + +class MyHTMLParser(HTMLParser): + def __init__(self): + HTMLParser.__init__(self) + self.captions = [] + self.title = "" + self.description = "" + + + def check_attr(self, attrs, attr, value): + for item in attrs: + if item[0] == attr and item[1] == value: + return True + return False + + def get_attr(self, attrs, attr): + for item in attrs: + if item[0] == attr: + return item[1] + return False + + def handle_starttag(self, tag, attrs): + if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time") and not ' data-segment-id="" ' in self.get_starttag_text(): + self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""}) + elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time") and not ' data-segment-id="" ' in self.get_starttag_text(): + self.captions[len(self.captions)-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms")) + elif tag == "input" and self.check_attr(attrs, "id", "metadata-title"): + self.title = self.get_attr(attrs, "value") + + def handle_data(self, data): + if self.get_starttag_text() and self.get_starttag_text().startswith("