archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 

108 lignes
9.6 KiB

  1. import requests
  2. from json import loads
  3. langcodes = {"Afar": "aa", "Abkhazian": "ab", "Afrikaans": "af", "Akan": "ak", "all": "all", "Amharic": "am", "Aragonese": "an", "Arabic": "ar", "Aramaic": "arc", "Algerian Arabic": "arq", "Assamese": "as", "American Sign Language": "ase", "Asturian": "ast", "Avaric": "av", "Aymara": "ay", "Azerbaijani": "az", "Bashkir": "ba", "Belarusian": "be", "Bulgarian": "bg", "Bihari": "bh", "Bislama": "bi", "Bangla": "bn", "Tibetan": "bo", "Breton": "br", "Bosnian": "bs", "Catalan": "ca", "Cebuano": "ceb", "Choctaw": "cho", "Cherokee": "chr", "Corsican": "co", "Czech": "cs", "Church Slavic": "cu", "Welsh": "cy", "Danish": "da", "Danish (Denmark)": "da-DK", "German": "de", "German (Austria)": "de-AT", "German (Switzerland)": "de-CH", "German (Germany)": "de-DE", "Divehi": "dv", "Dzongkha": "dz", "Ewe": "ee", "Greek": "el", "English": "en", "English (United Arab Emirates)": "en-AE", "English (Canada)": "en-CA", "English (United Kingdom)": "en-GB", "English (Ireland)": "en-IE", "English (India)": "en-IN", "English (United States)": "en-US", "Esperanto": "eo", "Spanish": "es", "Spanish (Latin America)": "es-419", "Spanish (Argentina)": "es-AR", "Spanish (Chile)": "es-CL", "Spanish (Colombia)": "es-CO", "Spanish (Costa Rica)": "es-CR", "Spanish (Spain)": "es-ES", "Spanish (Mexico)": "es-MX", "Spanish (Nicaragua)": "es-NI", "Spanish (United States)": "es-US", "Estonian": "et", "Basque": "eu", "Persian": "fa", "Persian (Afghanistan)": "fa-AF", "Persian (Iran)": "fa-IR", "Fulah": "ff", "Finnish": "fi", "Filipino": "fil", "Fijian": "fj", "Faroese": "fo", "French": "fr", "French (Belgium)": "fr-BE", "French (Canada)": "fr-CA", "French (Switzerland)": "fr-CH", "French (France)": "fr-FR", "Western Frisian": "fy", "Irish": "ga", "Scottish Gaelic": "gd", "Galician": "gl", "Guarani": "gn", "Swiss German": "gsw", "Gujarati": "gu", "Hausa": "ha", "Hakka Chinese": "hak", "Hakka Chinese (Taiwan)": "hak-TW", "Hindi": "hi-Latn", "Hmong": "hmn", "Croatian": "hr", "Haitian Creole": "ht", "Hungarian": "hu", "Armenian": "hy", "Interlingua": "ia", "Indonesian": "id", "Interlingue": "ie", "Igbo": "ig", "Sichuan Yi": "ii", "Inupiaq": "ik", "Icelandic": "is", "Italian": "it", "Italian (Italy)": "it-IT", "Inuktitut": "iu", "Hebrew": "iw", "Japanese": "ja", "Javanese": "jv", "Georgian": "ka", "Kazakh": "kk", "Kalaallisut": "kl", "Khmer": "km", "Kannada": "kn", "Korean": "ko", "Korean (South Korea)": "ko-KR", "Kanuri": "kr", "Kashmiri": "ks", "Kurdish": "ku", "Kyrgyz": "ky", "Latin": "la", "Luxembourgish": "lb", "Lingala": "ln", "Lao": "lo", "Lithuanian": "lt", "Mizo": "lus", "Latvian": "lv", "Masai": "mas", "Malagasy": "mg", "Maori": "mi", "Miscellaneous languages": "mis", "Macedonian": "mk", "Malayalam": "ml", "Mongolian": "mn", "Manipuri": "mni", "Moldavian": "mo", "Marathi": "mr", "Malay": "ms", "Maltese": "mt", "Burmese": "my", "Nauru": "na", "Min Nan Chinese": "nan", "Min Nan Chinese (Taiwan)": "nan-TW", "Nepali": "ne", "Dutch": "nl", "Dutch (Belgium)": "nl-BE", "Dutch (Netherlands)": "nl-NL", "Norwegian Nynorsk": "nn", "Norwegian": "no", "not": "not", "Navajo": "nv", "Occitan": "oc", "Oromo": "om", "Odia": "or", "Punjabi": "pa", "Polish": "pl", "Polish (Poland)": "pl-PL", "Pashto": "ps", "Portuguese": "pt", "Portuguese (Brazil)": "pt-BR", "Portuguese (Portugal)": "pt-PT", "Quechua": "qu", "Romansh": "rm", "Rundi": "rn", "Romanian": "ro", "Romanian (Moldova)": "ro-MD", "Russian": "ru-Latn", "Russian (Russia)": "ru-RU", "Kinyarwanda": "rw", "Sanskrit": "sa", "Sardinian": "sc", "Sicilian": "scn", "Scots": "sco", "Sindhi": "sd", "Sherdukpen": "sdp", "Northern Sami": "se", "Sango": "sg", "Serbo-Croatian": "sh", "Sinhala": "si", "Slovak": "sk", "Slovenian": "sl", "Samoan": "sm", "Shona": "sn", "Somali": "so", "Albanian": "sq", "Serbian": "sr", "Serbian (Cyrillic)": "sr-Cyrl", "Serbian (Latin)": "sr-Latn", "Swati": "ss", "Southern Sotho": "st", "Sundanese": "su", "Swedish": "sv", "Swahili": "sw", "Tamil": "ta", "Telugu": "te", "Tajik": "tg", "Thai": "th", "Tigrinya": "ti", "Turkmen": "tk", "Tagalog": "tl", "Klingon": "tlh", "Tswana": "tn", "Tongan": "to", "Turkish": "tr", "Turkish (Turkey)": "tr-TR", "Tsonga": "ts", "Tatar": "tt", "Twi": "tw", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz", "Vietnamese": "vi", "Volap\\xFCk": "vo", "Wolof": "wo", "Xhosa": "xh", "Yiddish": "yi", "Yoruba": "yo", "Cantonese": "yue", "Cantonese (Hong Kong)": "yue-HK", "Chinese": "zh", "Chinese (China)": "zh-CN", "Chinese (Hong Kong)": "zh-HK", "Chinese (Simplified)": "zh-Hans", "Chinese (Simplified, China)": "zh-Hans-CN", "Chinese (Simplified, Singapore)": "zh-Hans-SG", "Chinese (Traditional)": "zh-Hant", "Chinese (Traditional, Hong Kong)": "zh-Hant-HK", "Chinese (Traditional, Taiwan)": "zh-Hant-TW", "Chinese (Singapore)": "zh-SG", "Chinese (Taiwan)": "zh-TW", "Zulu": "zu", "Hiri Motu": "ho", "Tok Pisin": "tpi", "Voro": "vor"}
  4. def getmetadata(vid):
  5. params = (
  6. ("v", vid),
  7. )
  8. headers = {
  9. "Accept-Language": "en-US",
  10. }
  11. wpage = requests.get("https://www.youtube.com/watch", headers=headers, params=params)
  12. wptext = wpage.text
  13. initplay = None
  14. initdata = None
  15. recvids = set()
  16. recchans = set()
  17. recmixes = set()
  18. recplayl = set()
  19. for line in wptext.splitlines():
  20. if line.strip().startswith('window["ytInitialPlayerResponse"] = '):
  21. initplay = loads(line.split('window["ytInitialPlayerResponse"] = ', 1)[1].strip()[:-1])
  22. if initplay["playabilityStatus"]["status"] == "ERROR":
  23. print(vid, "unavailable")
  24. return False, {}, recvids, recchans, recmixes, recplayl
  25. if "endscreen" in initplay.keys():
  26. for el in initplay["endscreen"]["endscreenRenderer"]:
  27. elint = el["endscreenElementRenderer"]
  28. if elint["style"] == "VIDEO":
  29. recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
  30. elif elint["style"] == "CHANNEL":
  31. recchans.add(elint["endpoint"]["browseEndpoint"]["browseId"])
  32. elif elint["style"] == "PLAYLIST":
  33. recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
  34. recplayl.add(elint["endpoint"]["watchEndpint"]["playlistId"])
  35. if "captions" in initplay.keys():
  36. ccenabled = "contribute" in initplay["captions"]["playerCaptionsRenderer"]
  37. else:
  38. ccenabled = False # if captions information is not present, community contributions are not enabled
  39. recchans.add(initplay["videoDetails"]["channelId"])
  40. elif line.strip().startswith('window["ytInitialData"] = '):
  41. initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
  42. if "contents" in initdata.keys(): #prevent exception
  43. for recmd in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"]["results"]:
  44. #auto is like the others
  45. if "compactAutoplayRenderer" in recmd.keys():
  46. recmd = recmd["compactAutoplayRenderer"]["contents"][0]
  47. if "compactVideoRenderer" in recmd.keys():
  48. recvids.add(recmd["compactVideoRenderer"]["videoId"])
  49. try:
  50. recchans.add(recmd["compactVideoRenderer"]["channelId"])
  51. except KeyError as e:
  52. print("Unable to extract channel:")
  53. print(recmd["compactVideoRenderer"])
  54. elif "compactPlaylistRenderer" in recmd.keys():
  55. recplayl.add(recmd["compactPlaylistRenderer"]["playlistId"])
  56. recvids.add(recmd["compactPlaylistRenderer"]["navigationEndpoint"]["watchEndpoint"]["videoId"])
  57. recchans.add(recmd["compactPlaylistRenderer"]["shortBylineText"]["navigationEndpoint"]["browseEndpoint"]["browseId"])
  58. elif "compactRadioRenderer" in recmd.keys(): #mix playlist
  59. recmixes.add(recmd["compactRadioRenderer"]["playlistId"])
  60. # todo: find out if channels can be suggested
  61. creditdata = {}
  62. try:
  63. mdinfo = initdata["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
  64. for item in mdinfo:
  65. if item["metadataRowRenderer"]["title"]["simpleText"].startswith("Caption author"): #the request to /watch needs to be in English for this to work
  66. try:
  67. desl = langcodes[item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]]
  68. except KeyError as e:
  69. print(e)
  70. desl = item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]
  71. creditdata[desl] = []
  72. for itemint in item["metadataRowRenderer"]["contents"]:
  73. creditdata[desl].append({"name": itemint["runs"][0]["text"], "channel": itemint["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]})
  74. except KeyError as e:
  75. print(e)
  76. if initplay and initdata:
  77. break
  78. return ccenabled, creditdata, recvids, recchans, recmixes, recplayl
  79. if __name__ == "__main__":
  80. from sys import argv
  81. vidl = argv
  82. vidl.pop(0)
  83. for video in vidl:
  84. print(getmetadata(video))