archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 

157 linhas
13 KiB

  1. from time import sleep
  2. from typing import Dict
  3. from json import loads
  4. from switchable_request import get
  5. backend = "requests"
  6. failcnt = 0
  7. langcodes = {"Afar": "aa", "Abkhazian": "ab", "Afrikaans": "af", "Akan": "ak", "all": "all", "Amharic": "am", "Aragonese": "an", "Arabic": "ar", "Aramaic": "arc", "Algerian Arabic": "arq", "Assamese": "as", "American Sign Language": "ase", "Asturian": "ast", "Avaric": "av", "Aymara": "ay", "Azerbaijani": "az", "Bashkir": "ba", "Belarusian": "be", "Bulgarian": "bg", "Bihari": "bh", "Bislama": "bi", "Bangla": "bn", "Tibetan": "bo", "Breton": "br", "Bosnian": "bs", "Catalan": "ca", "Cebuano": "ceb", "Choctaw": "cho", "Cherokee": "chr", "Corsican": "co", "Czech": "cs", "Church Slavic": "cu", "Welsh": "cy", "Danish": "da", "Danish (Denmark)": "da-DK", "German": "de", "German (Austria)": "de-AT", "German (Switzerland)": "de-CH", "German (Germany)": "de-DE", "Divehi": "dv", "Dzongkha": "dz", "Ewe": "ee", "Greek": "el", "English": "en", "English (United Arab Emirates)": "en-AE", "English (Canada)": "en-CA", "English (United Kingdom)": "en-GB", "English (Ireland)": "en-IE", "English (India)": "en-IN", "English (United States)": "en-US", "Esperanto": "eo", "Spanish": "es", "Spanish (Latin America)": "es-419", "Spanish (Argentina)": "es-AR", "Spanish (Chile)": "es-CL", "Spanish (Colombia)": "es-CO", "Spanish (Costa Rica)": "es-CR", "Spanish (Spain)": "es-ES", "Spanish (Mexico)": "es-MX", "Spanish (Nicaragua)": "es-NI", "Spanish (United States)": "es-US", "Estonian": "et", "Basque": "eu", "Persian": "fa", "Persian (Afghanistan)": "fa-AF", "Persian (Iran)": "fa-IR", "Fulah": "ff", "Finnish": "fi", "Filipino": "fil", "Fijian": "fj", "Faroese": "fo", "French": "fr", "French (Belgium)": "fr-BE", "French (Canada)": "fr-CA", "French (Switzerland)": "fr-CH", "French (France)": "fr-FR", "Western Frisian": "fy", "Irish": "ga", "Scottish Gaelic": "gd", "Galician": "gl", "Guarani": "gn", "Swiss German": "gsw", "Gujarati": "gu", "Hausa": "ha", "Hakka Chinese": "hak", "Hakka Chinese (Taiwan)": "hak-TW", "Hindi": "hi-Latn", "Hmong": "hmn", "Croatian": "hr", "Haitian Creole": "ht", 
"Hungarian": "hu", "Armenian": "hy", "Interlingua": "ia", "Indonesian": "id", "Interlingue": "ie", "Igbo": "ig", "Sichuan Yi": "ii", "Inupiaq": "ik", "Icelandic": "is", "Italian": "it", "Italian (Italy)": "it-IT", "Inuktitut": "iu", "Hebrew": "iw", "Japanese": "ja", "Javanese": "jv", "Georgian": "ka", "Kazakh": "kk", "Kalaallisut": "kl", "Khmer": "km", "Kannada": "kn", "Korean": "ko", "Korean (South Korea)": "ko-KR", "Kanuri": "kr", "Kashmiri": "ks", "Kurdish": "ku", "Kyrgyz": "ky", "Latin": "la", "Luxembourgish": "lb", "Lingala": "ln", "Lao": "lo", "Lithuanian": "lt", "Mizo": "lus", "Latvian": "lv", "Masai": "mas", "Malagasy": "mg", "Maori": "mi", "Miscellaneous languages": "mis", "Macedonian": "mk", "Malayalam": "ml", "Mongolian": "mn", "Manipuri": "mni", "Moldavian": "mo", "Marathi": "mr", "Malay": "ms", "Maltese": "mt", "Burmese": "my", "Nauru": "na", "Min Nan Chinese": "nan", "Min Nan Chinese (Taiwan)": "nan-TW", "Nepali": "ne", "Dutch": "nl", "Dutch (Belgium)": "nl-BE", "Dutch (Netherlands)": "nl-NL", "Norwegian Nynorsk": "nn", "Norwegian": "no", "not": "not", "Navajo": "nv", "Occitan": "oc", "Oromo": "om", "Odia": "or", "Punjabi": "pa", "Polish": "pl", "Polish (Poland)": "pl-PL", "Pashto": "ps", "Portuguese": "pt", "Portuguese (Brazil)": "pt-BR", "Portuguese (Portugal)": "pt-PT", "Quechua": "qu", "Romansh": "rm", "Rundi": "rn", "Romanian": "ro", "Romanian (Moldova)": "ro-MD", "Russian": "ru-Latn", "Russian (Russia)": "ru-RU", "Kinyarwanda": "rw", "Sanskrit": "sa", "Sardinian": "sc", "Sicilian": "scn", "Scots": "sco", "Sindhi": "sd", "Sherdukpen": "sdp", "Northern Sami": "se", "Sango": "sg", "Serbo-Croatian": "sh", "Sinhala": "si", "Slovak": "sk", "Slovenian": "sl", "Samoan": "sm", "Shona": "sn", "Somali": "so", "Albanian": "sq", "Serbian": "sr", "Serbian (Cyrillic)": "sr-Cyrl", "Serbian (Latin)": "sr-Latn", "Swati": "ss", "Southern Sotho": "st", "Sundanese": "su", "Swedish": "sv", "Swahili": "sw", "Tamil": "ta", "Telugu": "te", "Tajik": "tg", "Thai": "th", 
"Tigrinya": "ti", "Turkmen": "tk", "Tagalog": "tl", "Klingon": "tlh", "Tswana": "tn", "Tongan": "to", "Turkish": "tr", "Turkish (Turkey)": "tr-TR", "Tsonga": "ts", "Tatar": "tt", "Twi": "tw", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz", "Vietnamese": "vi", "Volap\\xFCk": "vo", "Wolof": "wo", "Xhosa": "xh", "Yiddish": "yi", "Yoruba": "yo", "Cantonese": "yue", "Cantonese (Hong Kong)": "yue-HK", "Chinese": "zh", "Chinese (China)": "zh-CN", "Chinese (Hong Kong)": "zh-HK", "Chinese (Simplified)": "zh-Hans", "Chinese (Simplified, China)": "zh-Hans-CN", "Chinese (Simplified, Singapore)": "zh-Hans-SG", "Chinese (Traditional)": "zh-Hant", "Chinese (Traditional, Hong Kong)": "zh-Hant-HK", "Chinese (Traditional, Taiwan)": "zh-Hant-TW", "Chinese (Singapore)": "zh-SG", "Chinese (Taiwan)": "zh-TW", "Zulu": "zu", "Hiri Motu": "ho", "Tok Pisin": "tpi", "Voro": "vor"}
  8. def getmetadata(mysession, vid, allheaders):
  9. global backend
  10. global failcnt
  11. params = (
  12. ("v", vid),
  13. )
  14. while True:
  15. wpage = get("https://www.youtube.com/watch", params=params, mysession=mysession, backend=backend, http3headers=allheaders)
  16. if not """</div><div id="content" class=" content-alignment" role="main"><p class='largeText'>Sorry for the interruption. We have been receiving a large volume of requests from your network.</p>
  17. <p>To continue with your YouTube experience, please fill out the form below.</p>""" in wpage.text and not wpage.status_code == 429 and 'window["ytInitialPlayerResponse"] = ' in wpage.text and 'window["ytInitialData"] = ' in wpage.text:
  18. break
  19. else:
  20. if backend == "requests" and failcnt > 30:
  21. backend = "http3"
  22. print("Captcha detected, switching discovery to HTTP3/QUIC")
  23. elif backend == "http3" and failcnt < 30:
  24. failcnt += 1
  25. print("Captcha detected, waiting 30 seconds... ", 30-failcnt, "attempts left until switching discovery to HTTP3/QUIC.")
  26. sleep(30)
  27. else:
  28. print("Captcha detected, waiting 30 seconds")
  29. sleep(30)
  30. wptext = wpage.text
  31. initplay = None
  32. initdata = None
  33. recvids = set()
  34. recchans = set()
  35. recmixes = set()
  36. recplayl = set()
  37. ccenabled = False #default values
  38. creditdata = {}
  39. for line in wptext.splitlines():
  40. if line.strip().startswith('window["ytInitialPlayerResponse"] = '):
  41. initplay = loads(line.split('window["ytInitialPlayerResponse"] = ', 1)[1].strip()[:-1])
  42. if initplay["playabilityStatus"]["status"] == "ERROR":
  43. print(vid, "unavailable")
  44. return False, {}, recvids, recchans, recmixes, recplayl
  45. if "endscreen" in initplay.keys():
  46. if "endscreenRenderer" in initplay["endscreen"].keys():
  47. for el in initplay["endscreen"]["endscreenRenderer"]:
  48. if type(el) == Dict:
  49. elint = el["endscreenElementRenderer"]
  50. if "endscreenElementRenderer" in el.keys():
  51. if elint["style"] == "VIDEO":
  52. recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
  53. elif elint["style"] == "CHANNEL":
  54. try:
  55. recchans.add(elint["endpoint"]["browseEndpoint"]["browseId"])
  56. except:
  57. print("Channel endscreen error")
  58. raise
  59. elif elint["style"] == "PLAYLIST":
  60. recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
  61. recplayl.add(elint["endpoint"]["watchEndpint"]["playlistId"])
  62. if "captions" in initplay.keys():
  63. ccenabled = "contribute" in initplay["captions"]["playerCaptionsRenderer"]
  64. else:
  65. ccenabled = False # if captions information is not present, community contributions are not enabled
  66. if "videoDetails" in initplay.keys():
  67. if "channelId" in initplay["videoDetails"].keys():
  68. recchans.add(initplay["videoDetails"]["channelId"])
  69. elif line.strip().startswith('window["ytInitialData"] = '):
  70. initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
  71. if "contents" in initdata.keys(): #prevent exception
  72. try:
  73. if "results" in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"].keys():
  74. for recmd in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"]["results"]:
  75. #auto is like the others
  76. if "compactAutoplayRenderer" in recmd.keys():
  77. recmd = recmd["compactAutoplayRenderer"]["contents"][0]
  78. if "compactVideoRenderer" in recmd.keys():
  79. recvids.add(recmd["compactVideoRenderer"]["videoId"])
  80. try:
  81. recchans.add(recmd["compactVideoRenderer"]["channelId"])
  82. except KeyError as e:
  83. try:
  84. recchans.add(recmd["compactVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
  85. except KeyError as e:
  86. print("Channel extract error")
  87. #raise
  88. #print("Unable to extract channel:")
  89. #print(recmd["compactVideoRenderer"])
  90. elif "compactPlaylistRenderer" in recmd.keys():
  91. recplayl.add(recmd["compactPlaylistRenderer"]["playlistId"])
  92. if "navigationEndpoint" in recmd["compactPlaylistRenderer"].keys():
  93. recvids.add(recmd["compactPlaylistRenderer"]["navigationEndpoint"]["watchEndpoint"]["videoId"])
  94. if "navigationEndpoint" in recmd["compactPlaylistRenderer"]["shortBylineText"].keys():
  95. recchans.add(recmd["compactPlaylistRenderer"]["shortBylineText"]["navigationEndpoint"]["browseEndpoint"]["browseId"])
  96. elif "compactRadioRenderer" in recmd.keys(): #mix playlist
  97. recmixes.add(recmd["compactRadioRenderer"]["playlistId"])
  98. # todo: find out if channels can be suggested
  99. except BaseException as e:
  100. print(e)
  101. print("Exception in discovery, continuing anyway")
  102. creditdata = {}
  103. try:
  104. mdinfo = initdata["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
  105. for item in mdinfo:
  106. if item["metadataRowRenderer"]["title"]["simpleText"].startswith("Caption author"): #the request to /watch needs to be in English for this to work
  107. try:
  108. desl = langcodes[item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]]
  109. except KeyError as e:
  110. #print(e)
  111. print("Language code conversion error, using language name")
  112. desl = item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]
  113. creditdata[desl] = []
  114. for itemint in item["metadataRowRenderer"]["contents"]:
  115. creditdata[desl].append({"name": itemint["runs"][0]["text"], "channel": itemint["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]})
  116. except KeyError as e:
  117. #print("Video does not have credits")
  118. pass
  119. #raise
  120. #print(e)
  121. if initplay and initdata:
  122. break
  123. return ccenabled, creditdata, recvids, recchans, recmixes, recplayl
# Command-line entry point: every argument after the script name is treated as
# a video ID and the result of getmetadata() for it is printed.
if __name__ == "__main__":
    from sys import argv
    vidl = argv
    vidl.pop(0)  # drop the script name; vidl is the same list object as sys.argv
    for video in vidl:
        # NOTE(review): getmetadata is defined as
        # getmetadata(mysession, vid, allheaders), so this one-argument call
        # raises TypeError. A session object and header set must be supplied;
        # how to build them depends on switchable_request - confirm and fix.
        print(getmetadata(video))