archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 

163 Zeilen
14 KiB

  1. from time import sleep
  2. from typing import Dict
  3. from json import loads
  4. langcodes = {"Afar": "aa", "Abkhazian": "ab", "Afrikaans": "af", "Akan": "ak", "all": "all", "Amharic": "am", "Aragonese": "an", "Arabic": "ar", "Aramaic": "arc", "Algerian Arabic": "arq", "Assamese": "as", "American Sign Language": "ase", "Asturian": "ast", "Avaric": "av", "Aymara": "ay", "Azerbaijani": "az", "Bashkir": "ba", "Belarusian": "be", "Bulgarian": "bg", "Bihari": "bh", "Bislama": "bi", "Bangla": "bn", "Tibetan": "bo", "Breton": "br", "Bosnian": "bs", "Catalan": "ca", "Cebuano": "ceb", "Choctaw": "cho", "Cherokee": "chr", "Corsican": "co", "Czech": "cs", "Church Slavic": "cu", "Welsh": "cy", "Danish": "da", "Danish (Denmark)": "da-DK", "German": "de", "German (Austria)": "de-AT", "German (Switzerland)": "de-CH", "German (Germany)": "de-DE", "Divehi": "dv", "Dzongkha": "dz", "Ewe": "ee", "Greek": "el", "English": "en", "English (United Arab Emirates)": "en-AE", "English (Canada)": "en-CA", "English (United Kingdom)": "en-GB", "English (Ireland)": "en-IE", "English (India)": "en-IN", "English (United States)": "en-US", "Esperanto": "eo", "Spanish": "es", "Spanish (Latin America)": "es-419", "Spanish (Argentina)": "es-AR", "Spanish (Chile)": "es-CL", "Spanish (Colombia)": "es-CO", "Spanish (Costa Rica)": "es-CR", "Spanish (Spain)": "es-ES", "Spanish (Mexico)": "es-MX", "Spanish (Nicaragua)": "es-NI", "Spanish (United States)": "es-US", "Estonian": "et", "Basque": "eu", "Persian": "fa", "Persian (Afghanistan)": "fa-AF", "Persian (Iran)": "fa-IR", "Fulah": "ff", "Finnish": "fi", "Filipino": "fil", "Fijian": "fj", "Faroese": "fo", "French": "fr", "French (Belgium)": "fr-BE", "French (Canada)": "fr-CA", "French (Switzerland)": "fr-CH", "French (France)": "fr-FR", "Western Frisian": "fy", "Irish": "ga", "Scottish Gaelic": "gd", "Galician": "gl", "Guarani": "gn", "Swiss German": "gsw", "Gujarati": "gu", "Hausa": "ha", "Hakka Chinese": "hak", "Hakka Chinese (Taiwan)": "hak-TW", "Hindi": "hi-Latn", "Hmong": "hmn", "Croatian": "hr", "Haitian Creole": "ht", "Hungarian": "hu", "Armenian": "hy", "Interlingua": "ia", "Indonesian": "id", "Interlingue": "ie", "Igbo": "ig", "Sichuan Yi": "ii", "Inupiaq": "ik", "Icelandic": "is", "Italian": "it", "Italian (Italy)": "it-IT", "Inuktitut": "iu", "Hebrew": "iw", "Japanese": "ja", "Javanese": "jv", "Georgian": "ka", "Kazakh": "kk", "Kalaallisut": "kl", "Khmer": "km", "Kannada": "kn", "Korean": "ko", "Korean (South Korea)": "ko-KR", "Kanuri": "kr", "Kashmiri": "ks", "Kurdish": "ku", "Kyrgyz": "ky", "Latin": "la", "Luxembourgish": "lb", "Lingala": "ln", "Lao": "lo", "Lithuanian": "lt", "Mizo": "lus", "Latvian": "lv", "Masai": "mas", "Malagasy": "mg", "Maori": "mi", "Miscellaneous languages": "mis", "Macedonian": "mk", "Malayalam": "ml", "Mongolian": "mn", "Manipuri": "mni", "Moldavian": "mo", "Marathi": "mr", "Malay": "ms", "Maltese": "mt", "Burmese": "my", "Nauru": "na", "Min Nan Chinese": "nan", "Min Nan Chinese (Taiwan)": "nan-TW", "Nepali": "ne", "Dutch": "nl", "Dutch (Belgium)": "nl-BE", "Dutch (Netherlands)": "nl-NL", "Norwegian Nynorsk": "nn", "Norwegian": "no", "not": "not", "Navajo": "nv", "Occitan": "oc", "Oromo": "om", "Odia": "or", "Punjabi": "pa", "Polish": "pl", "Polish (Poland)": "pl-PL", "Pashto": "ps", "Portuguese": "pt", "Portuguese (Brazil)": "pt-BR", "Portuguese (Portugal)": "pt-PT", "Quechua": "qu", "Romansh": "rm", "Rundi": "rn", "Romanian": "ro", "Romanian (Moldova)": "ro-MD", "Russian": "ru-Latn", "Russian (Russia)": "ru-RU", "Kinyarwanda": "rw", "Sanskrit": "sa", "Sardinian": "sc", "Sicilian": "scn", "Scots": "sco", "Sindhi": "sd", "Sherdukpen": "sdp", "Northern Sami": "se", "Sango": "sg", "Serbo-Croatian": "sh", "Sinhala": "si", "Slovak": "sk", "Slovenian": "sl", "Samoan": "sm", "Shona": "sn", "Somali": "so", "Albanian": "sq", "Serbian": "sr", "Serbian (Cyrillic)": "sr-Cyrl", "Serbian (Latin)": "sr-Latn", "Swati": "ss", "Southern Sotho": "st", "Sundanese": "su", "Swedish": "sv", "Swahili": "sw", "Tamil": "ta", "Telugu": "te", "Tajik": "tg", "Thai": "th", "Tigrinya": "ti", "Turkmen": "tk", "Tagalog": "tl", "Klingon": "tlh", "Tswana": "tn", "Tongan": "to", "Turkish": "tr", "Turkish (Turkey)": "tr-TR", "Tsonga": "ts", "Tatar": "tt", "Twi": "tw", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz", "Vietnamese": "vi", "Volap\\xFCk": "vo", "Wolof": "wo", "Xhosa": "xh", "Yiddish": "yi", "Yoruba": "yo", "Cantonese": "yue", "Cantonese (Hong Kong)": "yue-HK", "Chinese": "zh", "Chinese (China)": "zh-CN", "Chinese (Hong Kong)": "zh-HK", "Chinese (Simplified)": "zh-Hans", "Chinese (Simplified, China)": "zh-Hans-CN", "Chinese (Simplified, Singapore)": "zh-Hans-SG", "Chinese (Traditional)": "zh-Hant", "Chinese (Traditional, Hong Kong)": "zh-Hant-HK", "Chinese (Traditional, Taiwan)": "zh-Hant-TW", "Chinese (Singapore)": "zh-SG", "Chinese (Taiwan)": "zh-TW", "Zulu": "zu", "Hiri Motu": "ho", "Tok Pisin": "tpi", "Voro": "vor"}
  5. def getmetadata(mysession, vid, ccenabledonly=False):
  6. params = (
  7. ("v", vid),
  8. )
  9. while True:
  10. wpage = mysession.get("https://www.youtube.com/watch", params=params)
  11. if not """</div><div id="content" class=" content-alignment" role="main"><p class='largeText'>Sorry for the interruption. We have been receiving a large volume of requests from your network.</p>
  12. <p>To continue with your YouTube experience, please fill out the form below.</p>""" in wpage.text and not wpage.status_code == 429 and 'window["ytInitialPlayerResponse"] = ' in wpage.text and 'window["ytInitialData"] = ' in wpage.text:
  13. break
  14. else:
  15. print("Captcha detected, waiting 30 seconds")
  16. sleep(30)
  17. wptext = wpage.text
  18. initplay = None
  19. initdata = None
  20. recvids = set()
  21. recchans = set()
  22. recmixes = set()
  23. recplayl = set()
  24. ccenabled = False #default values
  25. creditdata = {}
  26. for line in wptext.splitlines():
  27. if line.strip().startswith('window["ytInitialPlayerResponse"] = '):
  28. initplay = loads(line.split('window["ytInitialPlayerResponse"] = ', 1)[1].strip()[:-1])
  29. if not ccenabledonly:
  30. if initplay["playabilityStatus"]["status"] == "ERROR":
  31. print(vid, "unavailable")
  32. return False, {}, recvids, recchans, recmixes, recplayl
  33. if "endscreen" in initplay.keys():
  34. if "endscreenRenderer" in initplay["endscreen"].keys():
  35. for el in initplay["endscreen"]["endscreenRenderer"]:
  36. if type(el) == Dict:
  37. elint = el["endscreenElementRenderer"]
  38. if "endscreenElementRenderer" in el.keys():
  39. if elint["style"] == "VIDEO":
  40. recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
  41. elif elint["style"] == "CHANNEL":
  42. try:
  43. recchans.add(elint["endpoint"]["browseEndpoint"]["browseId"])
  44. except:
  45. print("Channel endscreen error")
  46. raise
  47. elif elint["style"] == "PLAYLIST":
  48. recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
  49. recplayl.add(elint["endpoint"]["watchEndpint"]["playlistId"])
  50. if "captions" in initplay.keys():
  51. ccenabled = "contribute" in initplay["captions"]["playerCaptionsRenderer"]
  52. else:
  53. ccenabled = False # if captions information is not present, community contributions are not enabled
  54. if not ccenabledonly:
  55. if "videoDetails" in initplay.keys():
  56. if "channelId" in initplay["videoDetails"].keys():
  57. recchans.add(initplay["videoDetails"]["channelId"])
  58. elif line.strip().startswith('window["ytInitialData"] = '):
  59. if not ccenabledonly:
  60. initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
  61. if "contents" in initdata.keys(): #prevent exception
  62. try:
  63. if "results" in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"].keys():
  64. for recmd in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"]["results"]:
  65. #auto is like the others
  66. if "compactAutoplayRenderer" in recmd.keys():
  67. recmd = recmd["compactAutoplayRenderer"]["contents"][0]
  68. if "compactVideoRenderer" in recmd.keys():
  69. recvids.add(recmd["compactVideoRenderer"]["videoId"])
  70. try:
  71. recchans.add(recmd["compactVideoRenderer"]["channelId"])
  72. except KeyError as e:
  73. try:
  74. recchans.add(recmd["compactVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
  75. except KeyError as e:
  76. print("Channel extract error")
  77. #raise
  78. #print("Unable to extract channel:")
  79. #print(recmd["compactVideoRenderer"])
  80. elif "compactPlaylistRenderer" in recmd.keys():
  81. recplayl.add(recmd["compactPlaylistRenderer"]["playlistId"])
  82. if "navigationEndpoint" in recmd["compactPlaylistRenderer"].keys():
  83. recvids.add(recmd["compactPlaylistRenderer"]["navigationEndpoint"]["watchEndpoint"]["videoId"])
  84. if "navigationEndpoint" in recmd["compactPlaylistRenderer"]["shortBylineText"].keys():
  85. recchans.add(recmd["compactPlaylistRenderer"]["shortBylineText"]["navigationEndpoint"]["browseEndpoint"]["browseId"])
  86. elif "compactRadioRenderer" in recmd.keys(): #mix playlist
  87. recmixes.add(recmd["compactRadioRenderer"]["playlistId"])
  88. # todo: find out if channels can be suggested
  89. except BaseException as e:
  90. print(e)
  91. print("Exception in discovery, continuing anyway")
  92. creditdata = {}
  93. if not ccenabledonly:
  94. try:
  95. mdinfo = initdata["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
  96. for item in mdinfo:
  97. if item["metadataRowRenderer"]["title"]["simpleText"].startswith("Caption author"): #the request to /watch needs to be in English for this to work
  98. try:
  99. desl = langcodes[item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]]
  100. except KeyError as e:
  101. #print(e)
  102. print("Language code conversion error, using language name")
  103. desl = item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]
  104. creditdata[desl] = []
  105. for itemint in item["metadataRowRenderer"]["contents"]:
  106. creditdata[desl].append({"name": itemint["runs"][0]["text"], "channel": itemint["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]})
  107. except KeyError as e:
  108. #print("Video does not have credits")
  109. pass
  110. #raise
  111. #print(e)
  112. if initplay and (initdata or ccenabledonly):
  113. break
  114. return ccenabled, creditdata, recvids, recchans, recmixes, recplayl
  115. if __name__ == "__main__":
  116. from sys import argv
  117. from requests import session
  118. from os.path import isfile
  119. from os import environ
  120. mysession = session()
  121. #HSID, SSID, SID cookies required
  122. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  123. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  124. elif isfile("config.json"):
  125. cookies = loads(open("config.json").read())
  126. else:
  127. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  128. assert False
  129. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  130. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  131. assert False
  132. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  133. vidl = argv
  134. vidl.pop(0)
  135. for video in vidl:
  136. print(getmetadata(mysession, video))