Browse Source

Add files via upload

pull/3/head
tech234a 3 years ago
committed by GitHub
commit
e36d4af1ed
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 198 additions and 0 deletions
  1. +8
    -0
      README.md
  2. +5
    -0
      config.json
  3. +78
    -0
      discovery.py
  4. +3
    -0
      requirements.txt
  5. +104
    -0
      ytcc-exporter.py

+ 8
- 0
README.md View File

@@ -0,0 +1,8 @@
# YouTube Community Contributed Captions Exporter
Export YouTube community-contributed captioning drafts to SBV files.
## Setup
Install the requirements in the requirements.txt file (`pip install -r requirements.txt`). Because the captioning editor is only available to logged-in users, you must specify the values of three session cookies for any Google account (`HSID`, `SSID`, and `SID`). You can get these cookie values by opening the developer tools on any youtube.com webpage, going to the "Application" (Chrome) or "Storage" (Firefox) tab, selecting "Cookies", and copying the required values.
## Usage
Simply run `python3 ytcc-exporter.py` followed by a list of space-separated YouTube video IDs, and all community-contributed captioning drafts in all languages will be exported.

+ 5
- 0
config.json View File

@@ -0,0 +1,5 @@
{
"HSID": "",
"SSID": "",
"SID" : ""
}

+ 78
- 0
discovery.py View File

@@ -0,0 +1,78 @@
import requests
from json import loads
def getmetadata(vid):
    """Scrape a YouTube watch page for caption status and recommended content.

    :param vid: YouTube video ID to inspect.
    :returns: tuple ``(ccenabled, recvids, recchans, recmixes, recplayl)``:
        ``ccenabled`` is True/False when the player response reports whether
        community caption contribution is available, ``None`` when the page
        had no ``"captions"`` section, and ``False`` (with a message printed)
        when the video is unavailable; the remaining four are sets of
        recommended video IDs, channel IDs, mix-playlist IDs and playlist IDs
        discovered on the page.
    """
    params = (
        ("v", vid),
    )
    wpage = requests.get("https://www.youtube.com/watch", params=params)
    wptext = wpage.text
    initplay = None
    initdata = None
    # Initialize before the loop: the original left this unbound (raising
    # UnboundLocalError at the return) whenever the player response carried
    # no "captions" key.
    ccenabled = None
    recvids = set()
    recchans = set()
    recmixes = set()
    recplayl = set()
    for line in wptext.splitlines():
        if line.strip().startswith('window["ytInitialPlayerResponse"] = '):
            # [:-1] drops the trailing semicolon so the JSON parses.
            initplay = loads(line.split('window["ytInitialPlayerResponse"] = ', 1)[1].strip()[:-1])
            if initplay["playabilityStatus"]["status"] == "ERROR":
                print(vid, "unavailable")
                return False, recvids, recchans, recmixes, recplayl
            if "endscreen" in initplay.keys():
                # NOTE(review): iterate the element list under "elements" —
                # the original iterated the endscreenRenderer dict itself,
                # which yields its key strings, not the elements; confirm
                # against a live player response.
                for el in initplay["endscreen"]["endscreenRenderer"]["elements"]:
                    elint = el["endscreenElementRenderer"]
                    if elint["style"] == "VIDEO":
                        recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
                    elif elint["style"] == "CHANNEL":
                        recchans.add(elint["endpoint"]["browseEndpoint"]["browseId"])
                    elif elint["style"] == "PLAYLIST":
                        recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
                        # Fixed typo: original read "watchEndpint", which
                        # raised KeyError on every PLAYLIST endscreen element.
                        recplayl.add(elint["endpoint"]["watchEndpoint"]["playlistId"])
            if "captions" in initplay.keys():
                ccenabled = "contribute" in initplay["captions"]["playerCaptionsRenderer"]
            # Always record the uploader's channel for crawling.
            recchans.add(initplay["videoDetails"]["channelId"])
        elif line.strip().startswith('window["ytInitialData"] = '):
            initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
            if "contents" in initdata.keys():  # prevent exception on pages without recommendations
                for recmd in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"]["results"]:
                    # The autoplay entry wraps an ordinary renderer; unwrap it
                    # so the branches below handle it like the others.
                    if "compactAutoplayRenderer" in recmd.keys():
                        recmd = recmd["compactAutoplayRenderer"]["contents"][0]
                    if "compactVideoRenderer" in recmd.keys():
                        recvids.add(recmd["compactVideoRenderer"]["videoId"])
                        recchans.add(recmd["compactVideoRenderer"]["channelId"])
                    elif "compactPlaylistRenderer" in recmd.keys():
                        recplayl.add(recmd["compactPlaylistRenderer"]["playlistId"])
                        recvids.add(recmd["compactPlaylistRenderer"]["navigationEndpoint"]["watchEndpoint"]["videoId"])
                        recchans.add(recmd["compactPlaylistRenderer"]["shortBylineText"]["navigationEndpoint"]["browseEndpoint"]["browseId"])
                    elif "compactRadioRenderer" in recmd.keys():  # mix playlist
                        recmixes.add(recmd["compactRadioRenderer"]["playlistId"])
                    # TODO: find out if channels can be suggested
        if initplay and initdata:
            # Both scripts found; no need to scan the rest of the page.
            break
    return ccenabled, recvids, recchans, recmixes, recplayl
if __name__ == "__main__":
    # CLI entry point: every command-line argument is treated as a video ID.
    from sys import argv
    for video in argv[1:]:
        print(getmetadata(video))

+ 3
- 0
requirements.txt View File

@@ -0,0 +1,3 @@
requests
beautifulsoup4
html5lib

+ 104
- 0
ytcc-exporter.py View File

@@ -0,0 +1,104 @@
# This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
# This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
def timedelta_to_sbv_timestamp(timedelta_timestamp):
    r"""
    Convert a :py:class:`~datetime.timedelta` to an SBV timestamp.
    .. doctest::
        >>> import datetime
        >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
        >>> timedelta_to_sbv_timestamp(delta)
        '1:23:04.000'
    :param datetime.timedelta timedelta_timestamp: A datetime to convert to an
                                                   SBV timestamp
    :returns: The timestamp in SBV format (``H:MM:SS.mmm``)
    :rtype: str
    """
    # Fixed docstring doctest: the format string below emits '1:23:04.000'
    # (unpadded hours, '.' millisecond separator per the SBV format), not the
    # SRT-style '01:23:04,000' the original doctest claimed.
    SECONDS_IN_HOUR = 3600
    SECONDS_IN_MINUTE = 60
    HOURS_IN_DAY = 24
    MICROSECONDS_IN_MILLISECOND = 1000
    # timedelta normalizes to (days, seconds, microseconds); fold days back
    # into the hour count so long videos keep a correct timestamp.
    hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
    hrs += timedelta_timestamp.days * HOURS_IN_DAY
    mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
    msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
    return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
import requests
from bs4 import BeautifulSoup
from datetime import timedelta
from json import loads
# HSID, SSID, SID authentication cookies required (see README.md for how to
# obtain them); loaded once at import time from config.json.
# Use a context manager so the config file handle is closed promptly — the
# original `open(...).read()` leaked it.
with open("config.json") as configfile:
    cookies = loads(configfile.read())
headers = {
    "cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"],
}
def getsubs(vid, lang="all"):
    """Export community-contributed caption drafts for a video to SBV files.

    :param vid: YouTube video ID.
    :param lang: caption language code to export, or "all" (default) to
        discover every language listed on the video's timedtext page and
        export each one.
    Side effect: writes one ``<vid>_<langcode>.sbv`` file per exported
    language to the current directory.
    """
    if lang == "all":
        # Discover which caption languages the video offers.
        lparams = (
            ("v", vid),
            ("ref", "player"),
            ("o", "U"),
        )
        langpage = requests.get("https://www.youtube.com/timedtext_video", params=lparams, headers=headers)
        # A redirect to accounts.google.com means the configured cookies are
        # missing or stale.
        assert not "accounts.google.com" in langpage.url, "Please supply authentication cookie information in config.json. See README.md for more information."
        langs = []
        langsoup = BeautifulSoup(langpage.text, features="html5lib")
        if "create_channel" in langpage.url:
            # Redirect to channel creation implies the video ID doesn't exist.
            print(vid, "not found.")
        elif langsoup.find_all("div", {"class": "not-accepting-caption-submissions"}):
            print(vid, "has disabled community-contributed captioning.")
            langs = []
        else:
            langdivs = langsoup.find("ul", class_="yt-uix-languagepicker-language-list").find_all("li", class_="yt-uix-languagepicker-menu-item")
            for item in langdivs:
                langs.append(item["data-value"])
            print(vid, "has the following languages available", ", ".join(langs)+".")
    else:
        langs = [lang]
    for langcode in langs:
        # Fetch the caption editor page for this language; the draft events
        # are embedded in the editor's HTML.
        pparams = (
            ("v", vid),
            ("lang", langcode),
            ("action_mde_edit_form", 1),
            ("bl", "vmp"),
            ("ui", "hd"),
            ("tab", "captions"),
            ("o", "U")
        )
        page = requests.get("https://www.youtube.com/timedtext_editor", params=pparams, headers=headers)
        soup = BeautifulSoup(page.text, features="html5lib")
        divs = soup.find_all("div", class_="timed-event-line")
        outtext = ""
        for item in divs:
            text = item.find("textarea").text
            startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
            endms = int(item.find("input", class_="event-end-time")["data-end-ms"])
            # SBV cue: "start,end" line, caption text, blank separator line.
            outtext += timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n\n"
        # [:-1] trims the final trailing newline. Use a context manager so the
        # output handle is closed/flushed even on error — the original
        # `open(...).write(...)` leaked the file object.
        with open(vid+"_"+langcode+".sbv", "w", encoding="utf-8") as outfile:
            outfile.write(outtext[:-1])
if __name__ == "__main__":
    # CLI entry point: export caption drafts for every video ID argument.
    from sys import argv
    for video in argv[1:]:
        getsubs(video)

Loading…
Cancel
Save