Procházet zdrojové kódy

WIP implementation of html.parser-based parsing (incomplete)

pull/3/head
tech234a před 3 roky
rodič
revize
f1e96f90d7
1 změnil soubory, kde provedl 41 přidání a 2 odebrání
  1. +41
    -2
      export.py

+ 41
- 2
export.py Zobrazit soubor

@@ -26,12 +26,46 @@ def timedelta_to_sbv_timestamp(timedelta_timestamp):




from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import html.parser
from datetime import timedelta from datetime import timedelta


from json import dumps from json import dumps


import requests import requests


# https://docs.python.org/3/library/html.parser.html
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Streaming HTML parser that collects caption start/end times.

    Work in progress: only the timing ``<input>`` elements are handled.
    The caption text ``<textarea>`` is recognised but not captured yet.

    Attributes:
        captions: list of dicts, one per caption, in document order. Each
            dict gets a ``"startTime"`` key from the start-time input's
            ``data-start-ms`` attribute and an ``"endTime"`` key from the
            following end-time input's ``data-end-ms`` attribute. Values are
            stored exactly as they appear in the markup (strings), or
            ``False`` when the attribute is missing.
    """

    # Exact ``class`` attribute values of the elements of interest.
    # NOTE(review): these look like YouTube caption-editor class names
    # (``yt-uix-`` prefix) -- confirm against the page being parsed.
    START_TIME_CLASS = "yt-uix-form-input-text event-time-field event-start-time"
    END_TIME_CLASS = "yt-uix-form-input-text event-time-field event-end-time"
    TEXT_CLASS = "yt-uix-form-input-textarea event-text goog-textarea"

    def __init__(self):
        # HTMLParser.__init__ sets up the internal buffer state that feed()
        # relies on; skipping it makes feed() fail with AttributeError.
        super().__init__()
        self.captions = []

    def check_attr(self, attrs, attr, value):
        """Return True if *attrs* contains the pair (*attr*, *value*).

        Args:
            attrs: list of ``(name, value)`` tuples as passed to
                ``handle_starttag``.
            attr: attribute name to look for.
            value: exact attribute value required.
        """
        for item in attrs:
            if item[0] == attr and item[1] == value:
                return True
        return False

    def get_attr(self, attrs, attr):
        """Return the value of the first attribute named *attr*.

        Returns ``False`` (not ``None``) when the attribute is absent.
        """
        for item in attrs:
            if item[0] == attr:
                return item[1]
        return False

    def handle_starttag(self, tag, attrs):
        """Record caption timing information from matching start tags."""
        if tag == "input" and self.check_attr(attrs, "class", self.START_TIME_CLASS):
            # A start-time field opens a new caption record.
            self.captions.append({"startTime": self.get_attr(attrs, "data-start-ms")})
        elif tag == "input" and self.check_attr(attrs, "class", self.END_TIME_CLASS):
            if not self.captions:
                # End time seen before any start time: keep the value in a
                # fresh record rather than raising IndexError.
                self.captions.append({})
            # The end-time field belongs to the most recently opened caption.
            self.captions[-1]["endTime"] = self.get_attr(attrs, "data-end-ms")
        elif tag == "textarea" and self.check_attr(attrs, "class", self.TEXT_CLASS):
            # TODO: capture the caption text that follows this tag.
            pass

    # TODO: a handle_endtag override will be needed once caption text is
    # collected, to know where the textarea ends.

    def handle_data(self, data):
        """Debug hook: print every text node the parser encounters."""
        print("Encountered some data :", data)

def subprrun(jobs, headers): def subprrun(jobs, headers):
while not jobs.empty(): while not jobs.empty():
langcode, vid = jobs.get() langcode, vid = jobs.get()
@@ -50,8 +84,10 @@ def subprrun(jobs, headers):


assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information." assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."


soup = BeautifulSoup(page.text, features="html5lib")
del page
parser = MyHTMLParser()
parser.feed(page.text)
#soup = BeautifulSoup(page.text, features="html5lib")
#del page


divs = soup.find_all("div", class_="timed-event-line") divs = soup.find_all("div", class_="timed-event-line")


@@ -64,6 +100,8 @@ def subprrun(jobs, headers):


myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n") myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
#text.decompose()
item.decompose()
del item del item
del text del text
del startms del startms
@@ -86,6 +124,7 @@ def subprrun(jobs, headers):
open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata)) open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
del metadata del metadata


soup.decompose()
del soup del soup
del langcode del langcode
del vid del vid


Načítá se…
Zrušit
Uložit