Browse Source

Add files via upload

pull/3/head
tech234a 3 years ago
committed by GitHub
commit
e36d4af1ed
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 198 additions and 0 deletions
  1. +8
    -0
      README.md
  2. +5
    -0
      config.json
  3. +78
    -0
      discovery.py
  4. +3
    -0
      requirements.txt
  5. +104
    -0
      ytcc-exporter.py

+ 8
- 0
README.md View File

@@ -0,0 +1,8 @@
# YouTube Community Contributed Captions Exporter
Export YouTube community-contributed captioning drafts to SBV files.
## Setup
Install the requirements in the requirements.txt file (`pip install -r requirements.txt`). Because the captioning editor is only available to logged-in users, you must specify the values of three session cookies for any Google account (`HSID`, `SSID`, and `SID`). You can get these cookie values by opening the developer tools on any youtube.com webpage, going to the "Application" (Chrome) or "Storage" (Firefox) tab, selecting "Cookies", and copying the required values.
## Usage
Simply run `python3 ytcc-exporter.py` followed by a list of space-separated YouTube video IDs, and all community-contributed captioning drafts in all languages will be exported.

+ 5
- 0
config.json View File

@@ -0,0 +1,5 @@
{
"HSID": "",
"SSID": "",
"SID" : ""
}

+ 78
- 0
discovery.py View File

@@ -0,0 +1,78 @@
import requests
from json import loads
def getmetadata(vid):
    """Scrape a YouTube watch page for caption status and recommended content.

    :param vid: YouTube video ID to inspect.
    :returns: tuple ``(ccenabled, recvids, recchans, recmixes, recplayl)``:
        ``ccenabled`` is True/False when the player response reports whether
        community caption contribution is available, ``None`` when the page
        had no ``"captions"`` section, and ``False`` (with a message printed)
        when the video is unavailable; the remaining four are sets of
        recommended video IDs, channel IDs, mix-playlist IDs and playlist IDs
        discovered on the page.
    """
    params = (
        ("v", vid),
    )
    wpage = requests.get("https://www.youtube.com/watch", params=params)
    wptext = wpage.text
    initplay = None
    initdata = None
    # Initialize before the loop: the original left this unbound (raising
    # UnboundLocalError at the return) whenever the player response carried
    # no "captions" key.
    ccenabled = None
    recvids = set()
    recchans = set()
    recmixes = set()
    recplayl = set()
    for line in wptext.splitlines():
        if line.strip().startswith('window["ytInitialPlayerResponse"] = '):
            # [:-1] drops the trailing semicolon so the JSON parses.
            initplay = loads(line.split('window["ytInitialPlayerResponse"] = ', 1)[1].strip()[:-1])
            if initplay["playabilityStatus"]["status"] == "ERROR":
                print(vid, "unavailable")
                return False, recvids, recchans, recmixes, recplayl
            if "endscreen" in initplay.keys():
                # NOTE(review): iterate the element list under "elements" —
                # the original iterated the endscreenRenderer dict itself,
                # which yields its key strings, not the elements; confirm
                # against a live player response.
                for el in initplay["endscreen"]["endscreenRenderer"]["elements"]:
                    elint = el["endscreenElementRenderer"]
                    if elint["style"] == "VIDEO":
                        recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
                    elif elint["style"] == "CHANNEL":
                        recchans.add(elint["endpoint"]["browseEndpoint"]["browseId"])
                    elif elint["style"] == "PLAYLIST":
                        recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
                        # Fixed typo: original read "watchEndpint", which
                        # raised KeyError on every PLAYLIST endscreen element.
                        recplayl.add(elint["endpoint"]["watchEndpoint"]["playlistId"])
            if "captions" in initplay.keys():
                ccenabled = "contribute" in initplay["captions"]["playerCaptionsRenderer"]
            # Always record the uploader's channel for crawling.
            recchans.add(initplay["videoDetails"]["channelId"])
        elif line.strip().startswith('window["ytInitialData"] = '):
            initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
            if "contents" in initdata.keys():  # prevent exception on pages without recommendations
                for recmd in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"]["results"]:
                    # The autoplay entry wraps an ordinary renderer; unwrap it
                    # so the branches below handle it like the others.
                    if "compactAutoplayRenderer" in recmd.keys():
                        recmd = recmd["compactAutoplayRenderer"]["contents"][0]
                    if "compactVideoRenderer" in recmd.keys():
                        recvids.add(recmd["compactVideoRenderer"]["videoId"])
                        recchans.add(recmd["compactVideoRenderer"]["channelId"])
                    elif "compactPlaylistRenderer" in recmd.keys():
                        recplayl.add(recmd["compactPlaylistRenderer"]["playlistId"])
                        recvids.add(recmd["compactPlaylistRenderer"]["navigationEndpoint"]["watchEndpoint"]["videoId"])
                        recchans.add(recmd["compactPlaylistRenderer"]["shortBylineText"]["navigationEndpoint"]["browseEndpoint"]["browseId"])
                    elif "compactRadioRenderer" in recmd.keys():  # mix playlist
                        recmixes.add(recmd["compactRadioRenderer"]["playlistId"])
                    # TODO: find out if channels can be suggested
        if initplay and initdata:
            # Both scripts found; no need to scan the rest of the page.
            break
    return ccenabled, recvids, recchans, recmixes, recplayl
if __name__ == "__main__":
    # CLI entry point: every command-line argument is treated as a video ID.
    from sys import argv
    for video in argv[1:]:
        print(getmetadata(video))

+ 3
- 0
requirements.txt View File

@@ -0,0 +1,3 @@
requests
beautifulsoup4
html5lib

+ 104
- 0
ytcc-exporter.py View File

@@ -0,0 +1,104 @@
# This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
# This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
def timedelta_to_sbv_timestamp(timedelta_timestamp):
    r"""
    Convert a :py:class:`~datetime.timedelta` to an SBV timestamp.
    .. doctest::
        >>> import datetime
        >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
        >>> timedelta_to_sbv_timestamp(delta)
        '1:23:04.000'
    :param datetime.timedelta timedelta_timestamp: A datetime to convert to an
                                                   SBV timestamp
    :returns: The timestamp in SBV format (``H:MM:SS.mmm``)
    :rtype: str
    """
    # Fixed docstring doctest: the format string below emits '1:23:04.000'
    # (unpadded hours, '.' millisecond separator per the SBV format), not the
    # SRT-style '01:23:04,000' the original doctest claimed.
    SECONDS_IN_HOUR = 3600
    SECONDS_IN_MINUTE = 60
    HOURS_IN_DAY = 24
    MICROSECONDS_IN_MILLISECOND = 1000
    # timedelta normalizes to (days, seconds, microseconds); fold days back
    # into the hour count so long videos keep a correct timestamp.
    hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
    hrs += timedelta_timestamp.days * HOURS_IN_DAY
    mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
    msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
    return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
import requests
from bs4 import BeautifulSoup
from datetime import timedelta
from json import loads
# HSID, SSID, SID authentication cookies required (see README.md for how to
# obtain them); loaded once at import time from config.json.
# Use a context manager so the config file handle is closed promptly — the
# original `open(...).read()` leaked it.
with open("config.json") as configfile:
    cookies = loads(configfile.read())
headers = {
    "cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"],
}
def getsubs(vid, lang="all"):
    """Export community-contributed caption drafts for a video to SBV files.

    :param vid: YouTube video ID.
    :param lang: caption language code to export, or "all" (default) to
        discover every language listed on the video's timedtext page and
        export each one.
    Side effect: writes one ``<vid>_<langcode>.sbv`` file per exported
    language to the current directory.
    """
    if lang == "all":
        # Discover which caption languages the video offers.
        lparams = (
            ("v", vid),
            ("ref", "player"),
            ("o", "U"),
        )
        langpage = requests.get("https://www.youtube.com/timedtext_video", params=lparams, headers=headers)
        # A redirect to accounts.google.com means the configured cookies are
        # missing or stale.
        assert not "accounts.google.com" in langpage.url, "Please supply authentication cookie information in config.json. See README.md for more information."
        langs = []
        langsoup = BeautifulSoup(langpage.text, features="html5lib")
        if "create_channel" in langpage.url:
            # Redirect to channel creation implies the video ID doesn't exist.
            print(vid, "not found.")
        elif langsoup.find_all("div", {"class": "not-accepting-caption-submissions"}):
            print(vid, "has disabled community-contributed captioning.")
            langs = []
        else:
            langdivs = langsoup.find("ul", class_="yt-uix-languagepicker-language-list").find_all("li", class_="yt-uix-languagepicker-menu-item")
            for item in langdivs:
                langs.append(item["data-value"])
            print(vid, "has the following languages available", ", ".join(langs)+".")
    else:
        langs = [lang]
    for langcode in langs:
        # Fetch the caption editor page for this language; the draft events
        # are embedded in the editor's HTML.
        pparams = (
            ("v", vid),
            ("lang", langcode),
            ("action_mde_edit_form", 1),
            ("bl", "vmp"),
            ("ui", "hd"),
            ("tab", "captions"),
            ("o", "U")
        )
        page = requests.get("https://www.youtube.com/timedtext_editor", params=pparams, headers=headers)
        soup = BeautifulSoup(page.text, features="html5lib")
        divs = soup.find_all("div", class_="timed-event-line")
        outtext = ""
        for item in divs:
            text = item.find("textarea").text
            startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
            endms = int(item.find("input", class_="event-end-time")["data-end-ms"])
            # SBV cue: "start,end" line, caption text, blank separator line.
            outtext += timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n\n"
        # [:-1] trims the final trailing newline. Use a context manager so the
        # output handle is closed/flushed even on error — the original
        # `open(...).write(...)` leaked the file object.
        with open(vid+"_"+langcode+".sbv", "w", encoding="utf-8") as outfile:
            outfile.write(outtext[:-1])
if __name__ == "__main__":
    # CLI entry point: export caption drafts for every video ID argument.
    from sys import argv
    for video in argv[1:]:
        getsubs(video)

Loading…
Cancel
Save