From 6177c024677de63bde4611989cddf1a87b90e627 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Sun, 20 Sep 2020 23:12:17 -0400 Subject: [PATCH] Further work on html.parser --- export.py | 98 ++++++++++++++++++++++++++++--------------------------- worker.py | 4 ++- 2 files changed, 53 insertions(+), 49 deletions(-) diff --git a/export.py b/export.py index 83cb543..3236050 100644 --- a/export.py +++ b/export.py @@ -25,8 +25,6 @@ def timedelta_to_sbv_timestamp(timedelta_timestamp): return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs) -from bs4 import BeautifulSoup -import html.parser from datetime import timedelta from json import dumps @@ -38,7 +36,12 @@ from html.parser import HTMLParser class MyHTMLParser(HTMLParser): def __init__(self): + HTMLParser.__init__(self) self.captions = [] + self.captiontext = True + self.title = "" + self.description = "" + def check_attr(self, attrs, attr, value): for item in attrs: @@ -53,22 +56,36 @@ class MyHTMLParser(HTMLParser): return False def handle_starttag(self, tag, attrs): - if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time"): - self.captions.append({"startTime": self.get_attr("data-start-ms")}) - elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time"): - self.captions[len(self.captions-1)]["endTime"] = self.get_attr("data-end-ms")}) - elif tag == "textarea" and self.check_attr(attrs, "class", "yt-uix-form-input-textarea event-text goog-textarea"): - pass #do this - - #def handle_endtag(self, tag): - # print("Encountered an end tag :", tag) + if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time") and not ' data-segment-id="" ' in self.get_starttag_text(): + self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""}) + elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time") and not ' data-segment-id="" ' in self.get_starttag_text(): + self.captions[len(self.captions)-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms")) + # elif tag == "textarea" and self.check_attr(attrs, "class", "yt-uix-form-input-textarea event-text goog-textarea"): + # if len(self.captions): + # self.datatarget = len(self.captions)-1 + # else: + # self.datatarget = 0 + elif tag == "input" and self.check_attr(attrs, "id", "metadata-title"): + self.title = self.get_attr(attrs, "value") + # elif tag == "textarea" and self.check_attr(attrs, "id", "metadata-description"): + # self.datatarget = "description" + + # def handle_endtag(self, tag): + # if tag == "textarea": + # self.datatarget = None def handle_data(self, data): - print("Encountered some data :", data) + if self.get_starttag_text() and self.get_starttag_text().startswith("