From e36d4af1edc0583cd99fd2a2c6ad67cedfb52de1 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Wed, 16 Sep 2020 18:27:00 -0400 Subject: [PATCH] Add files via upload --- README.md | 8 ++++ config.json | 5 +++ discovery.py | 78 +++++++++++++++++++++++++++++++++++ requirements.txt | 3 ++ ytcc-exporter.py | 104 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 198 insertions(+) create mode 100644 README.md create mode 100644 config.json create mode 100644 discovery.py create mode 100644 requirements.txt create mode 100644 ytcc-exporter.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..a9b2d98 --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +# YouTube Community Contributed Captions Exporter +Export YouTube community-contributed captioning drafts to SBV files. + +## Setup +Install the requirements in the requirements.txt file (`pip install -r requirements.txt`). Because the captioning editor is only available to logged-in users, you must specify the values of three session cookies for any Google account (`HSID`, `SSID`, and `SID`) in the `config.json` file. You can get these cookie values by opening the developer tools on any youtube.com webpage, going to the "Application" (Chrome) or "Storage" (Firefox) tab, selecting "Cookies", and copying the required values. + +## Usage +Simply run `python3 ytcc-exporter.py` followed by a list of space-separated YouTube video IDs, and all community-contributed captioning drafts in all languages will be exported. 
def getmetadata(vid):
    """Fetch a YouTube watch page and collect caption status and recommendations.

    Requests ``https://www.youtube.com/watch?v=<vid>`` and decodes the two JSON
    blobs assigned on the page to ``window["ytInitialPlayerResponse"]`` and
    ``window["ytInitialData"]``. Relies on the module-level ``requests`` and
    ``loads`` imports.

    :param str vid: YouTube video ID to look up.
    :returns: A 5-tuple ``(ccenabled, recvids, recchans, recmixes, recplayl)``.
        ``ccenabled`` is True when the player's ``playerCaptionsRenderer``
        contains a ``"contribute"`` key and False otherwise (including when
        the playability status is ``"ERROR"`` or no caption data is present).
        The remaining four items are sets of video IDs, channel IDs, mix
        playlist IDs and playlist IDs gathered from the end screen and the
        sidebar results.
    :rtype: tuple
    """
    params = (
        ("v", vid),
    )
    wpage = requests.get("https://www.youtube.com/watch", params=params)

    wptext = wpage.text

    initplay = None
    initdata = None

    # Default to False so the final return never reads an unbound name when
    # the page has no player response or the response has no "captions" key.
    ccenabled = False

    recvids = set()
    recchans = set()
    recmixes = set()
    recplayl = set()

    for line in wptext.splitlines():
        stripped = line.strip()

        if stripped.startswith('window["ytInitialPlayerResponse"] = '):
            # Keep what follows the assignment and drop the last character
            # (presumably the ";" that ends the JavaScript statement).
            initplay = loads(line.split('window["ytInitialPlayerResponse"] = ', 1)[1].strip()[:-1])

            if initplay["playabilityStatus"]["status"] == "ERROR":
                print(vid, "unavailable")
                return False, recvids, recchans, recmixes, recplayl

            if "endscreen" in initplay:
                renderer = initplay["endscreen"]["endscreenRenderer"]

                # NOTE(review): the renderer is expected to be a JSON object
                # whose "elements" list holds the cards - confirm against a
                # live response. Iterating the object itself would only yield
                # its string keys. A bare list is still accepted as-is.
                if isinstance(renderer, dict):
                    elements = renderer.get("elements", [])
                else:
                    elements = renderer

                for el in elements:
                    elint = el["endscreenElementRenderer"]

                    if elint["style"] == "VIDEO":
                        recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])

                    elif elint["style"] == "CHANNEL":
                        recchans.add(elint["endpoint"]["browseEndpoint"]["browseId"])

                    elif elint["style"] == "PLAYLIST":
                        # Both IDs live under "watchEndpoint"; the original
                        # "watchEndpint" key was a typo that raised KeyError.
                        recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
                        recplayl.add(elint["endpoint"]["watchEndpoint"]["playlistId"])

            if "captions" in initplay:
                ccenabled = "contribute" in initplay["captions"]["playerCaptionsRenderer"]

            recchans.add(initplay["videoDetails"]["channelId"])

        elif stripped.startswith('window["ytInitialData"] = '):
            initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])

            if "contents" in initdata:  # prevent exception
                for recmd in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"]["results"]:
                    # The autoplay entry wraps an ordinary renderer; unwrap it
                    # so it is handled like the others.
                    if "compactAutoplayRenderer" in recmd:
                        recmd = recmd["compactAutoplayRenderer"]["contents"][0]

                    if "compactVideoRenderer" in recmd:
                        recvids.add(recmd["compactVideoRenderer"]["videoId"])
                        recchans.add(recmd["compactVideoRenderer"]["channelId"])

                    elif "compactPlaylistRenderer" in recmd:
                        recplayl.add(recmd["compactPlaylistRenderer"]["playlistId"])
                        recvids.add(recmd["compactPlaylistRenderer"]["navigationEndpoint"]["watchEndpoint"]["videoId"])
                        recchans.add(recmd["compactPlaylistRenderer"]["shortBylineText"]["navigationEndpoint"]["browseEndpoint"]["browseId"])

                    elif "compactRadioRenderer" in recmd:  # mix playlist
                        recmixes.add(recmd["compactRadioRenderer"]["playlistId"])
                    # todo: find out if channels can be suggested

        # Both blobs have been read; the rest of the page is not needed.
        if initplay and initdata:
            break

    return ccenabled, recvids, recchans, recmixes, recplayl
# This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
def timedelta_to_sbv_timestamp(timedelta_timestamp):
    r"""
    Convert a :py:class:`~datetime.timedelta` to an SBV timestamp.

    The result has the form ``H:MM:SS.mmm``: the hour field is not
    zero-padded and absorbs whole days, and sub-millisecond precision is
    truncated.

    .. doctest::

        >>> import datetime
        >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
        >>> timedelta_to_sbv_timestamp(delta)
        '1:23:04.000'

    :param datetime.timedelta timedelta_timestamp: A timedelta to convert to an
                                                   SBV timestamp
    :returns: The timestamp in SBV format
    :rtype: str
    """

    SECONDS_IN_HOUR = 3600
    SECONDS_IN_MINUTE = 60
    HOURS_IN_DAY = 24
    MICROSECONDS_IN_MILLISECOND = 1000

    hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
    hrs += timedelta_timestamp.days * HOURS_IN_DAY
    mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
    msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
    return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)


def getsubs(vid, lang="all"):
    """Export community-contributed caption drafts of a video to SBV files.

    For every selected language the ``timedtext_editor`` page is fetched, each
    ``timed-event-line`` element is turned into an SBV cue, and the result is
    written to ``<vid>_<langcode>.sbv`` in the current working directory.
    Requests are sent with the module-level ``headers`` (session cookies).

    :param str vid: YouTube video ID.
    :param str lang: A single language code to export, or ``"all"`` (the
        default) to export every language listed by the language picker on
        the ``timedtext_video`` page. With ``"all"``, nothing is exported when
        the video is not found or has community captioning disabled.
    :raises AssertionError: With ``lang="all"``, when the language-list
        request ends up on an ``accounts.google.com`` URL, i.e. the cookies
        in config.json were not accepted.
    :returns: None
    """
    if lang == "all":
        lparams = (
            ("v", vid),
            ("ref", "player"),
            ("o", "U"),
        )

        langpage = requests.get("https://www.youtube.com/timedtext_video", params=lparams, headers=headers)

        # Raised explicitly rather than with an ``assert`` statement so the
        # check still runs under ``python -O``; the exception type is kept.
        if "accounts.google.com" in langpage.url:
            raise AssertionError("Please supply authentication cookie information in config.json. See README.md for more information.")

        langs = []
        langsoup = BeautifulSoup(langpage.text, features="html5lib")

        if "create_channel" in langpage.url:
            print(vid, "not found.")
        elif langsoup.find_all("div", {"class": "not-accepting-caption-submissions"}):
            print(vid, "has disabled community-contributed captioning.")
        else:
            langdivs = langsoup.find("ul", class_="yt-uix-languagepicker-language-list").find_all("li", class_="yt-uix-languagepicker-menu-item")

            langs = [item["data-value"] for item in langdivs]

            print(vid, "has the following languages available", ", ".join(langs)+".")
    else:
        langs = [lang]

    for langcode in langs:
        pparams = (
            ("v", vid),
            ("lang", langcode),
            ("action_mde_edit_form", 1),
            ("bl", "vmp"),
            ("ui", "hd"),
            ("tab", "captions"),
            ("o", "U")
        )

        page = requests.get("https://www.youtube.com/timedtext_editor", params=pparams, headers=headers)

        soup = BeautifulSoup(page.text, features="html5lib")

        divs = soup.find_all("div", class_="timed-event-line")

        # One "start,end\ntext\n\n" chunk per caption line, joined once at the
        # end instead of growing a string inside the loop.
        cues = []

        for item in divs:
            text = item.find("textarea").text
            startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
            endms = int(item.find("input", class_="event-end-time")["data-end-ms"])

            cues.append(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n\n")

        outtext = "".join(cues)

        # The context manager closes the file handle deterministically.
        # [:-1] trims one of the two newlines that end the last cue.
        with open(vid+"_"+langcode+".sbv", "w", encoding="utf-8") as outfile:
            outfile.write(outtext[:-1])