Sfoglia il codice sorgente

If community-updated metadata is available, retrieve it; if captions are published, retrieve the community version too

pull/5/head
tech234a 3 anni fa
parent
commit
664eb02e71
2 ha cambiato i file con 55 aggiunte e 19 eliminazioni
  1. +54
    -18
      export.py
  2. +1
    -1
      worker.py

+ 54
- 18
export.py Vedi File

@@ -74,27 +74,63 @@ class MyHTMLParser(HTMLParser):
def subprrun(jobs, mysession): def subprrun(jobs, mysession):
while not jobs.empty(): while not jobs.empty():
collect() #cleanup memory collect() #cleanup memory
langcode, vid = jobs.get()
langcode, vid, mode = jobs.get()
vid = vid.strip() vid = vid.strip()
print(langcode, vid) print(langcode, vid)
pparams = (
("v", vid),
("lang", langcode),
("action_mde_edit_form", 1),
("bl", "vmp"),
("ui", "hd"),
("tab", "captions"),
("o", "U")
)

page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)

if mode == "default":
pparams = (
("v", vid),
("lang", langcode),
("action_mde_edit_form", 1),
("bl", "vmp"),
("ui", "hd"),
("tab", "captions"),
("o", "U")
)

page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
elif mode == "forceedit-metadata":
pparams = (
("v", vid),
("lang", langcode),
("action_mde_edit_form", 1),
('forceedit', 'metadata'),
('tab', 'metadata')
)

page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
elif mode == "forceedit-captions":
pparams = (
("v", vid),
("lang", langcode),
("action_mde_edit_form", 1),
("bl", "vmp"),
("ui", "hd"),
('forceedit', 'captions'),
("tab", "captions"),
("o", "U")
)

page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)


assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information." assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."


inttext = page.text inttext = page.text
del page del page


if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext or 'data-state="published"' in inttext or 'title="The video owner already provided subtitles/CC"' in inttext: #quick way of checking if this page is worth parsing
filestring = ""
if "forceedit" in mode:
filestring = "_community"

if not "forceedit" in mode:
if '&amp;forceedit=metadata&amp;tab=metadata">See latest</a>' in inttext:
jobs.put((langcode, vid, "forceedit-metadata"))

if '<li id="captions-editor-nav-captions" role="tab" data-state="published" class="published">' in inttext:
jobs.put((langcode, vid, "forceedit-captions"))

if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext or 'data-state="published"' in inttext or 'title="The video owner already provided subtitles/CC"' in inttext or "forceedit" in mode: #quick way of checking if this page is worth parsing
parser = MyHTMLParser() parser = MyHTMLParser()
parser.feed(inttext) parser.feed(inttext)


@@ -103,8 +139,8 @@ def subprrun(jobs, mysession):
if item["text"][:-9]: if item["text"][:-9]:
captiontext = True captiontext = True


if captiontext:
myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
if captiontext and (mode == "default" or mode == "forceedit-captions"):
myfs = open("out/"+vid+"/"+vid+"_"+langcode+filestring+".sbv", "w", encoding="utf-8")
captions = parser.captions captions = parser.captions
captions.pop(0) #get rid of the fake one captions.pop(0) #get rid of the fake one
while captions: while captions:
@@ -121,13 +157,13 @@ def subprrun(jobs, mysession):


del captiontext del captiontext


if parser.title or parser.description[:-16]:
if parser.title or parser.description[:-16] and (mode == "default" or mode == "forceedit-metadata"):
metadata = {} metadata = {}
metadata["title"] = parser.title metadata["title"] = parser.title
if metadata["title"] == False: if metadata["title"] == False:
metadata["title"] = "" metadata["title"] = ""
metadata["description"] = parser.description[:-16] metadata["description"] = parser.description[:-16]
open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
open("out/"+vid+"/"+vid+"_"+langcode+filestring+".json", "w", encoding="utf-8").write(dumps(metadata))
del metadata del metadata


del inttext del inttext
@@ -188,7 +224,7 @@ if __name__ == "__main__":
except: except:
pass pass
for lang in langs: for lang in langs:
jobs.put((lang, video))
jobs.put((lang, video, "default"))


subthreads = [] subthreads = []




+ 1
- 1
worker.py Vedi File

@@ -169,7 +169,7 @@ while True:
intvid = ccenabledl.pop(0) intvid = ccenabledl.pop(0)


while langcontent: while langcontent:
subtjobs.put((langcontent.pop(0), intvid))
subtjobs.put((langcontent.pop(0), intvid, "default"))
del intvid del intvid
del langcontent del langcontent




Caricamento…
Annulla
Salva