archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits
# This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
def timedelta_to_sbv_timestamp(timedelta_timestamp):
    r"""
    Convert a :py:class:`~datetime.timedelta` to an SBV timestamp.

    .. doctest::

        >>> import datetime
        >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
        >>> timedelta_to_sbv_timestamp(delta)
        '1:23:04.000'

    :param datetime.timedelta timedelta_timestamp: A timedelta to convert to an
                                                   SBV timestamp
    :returns: The timestamp in SBV format
    :rtype: str
    """

    SECONDS_IN_HOUR = 3600
    SECONDS_IN_MINUTE = 60
    HOURS_IN_DAY = 24
    MICROSECONDS_IN_MILLISECOND = 1000

    hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
    hrs += timedelta_timestamp.days * HOURS_IN_DAY
    mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
    msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
    return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
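# Example (illustrative, not part of the original script): the input value below
# is made up, but the expected output follows from the format string above, which
# uses a period before the milliseconds and does not zero-pad the hours.
#     >>> timedelta_to_sbv_timestamp(timedelta(milliseconds=4375250))
#     '1:12:55.250'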
from datetime import timedelta
from json import dumps
from gc import collect
# import requests
from time import sleep

# https://docs.python.org/3/library/html.parser.html
from html.parser import HTMLParser
# Parses the HTML of the timedtext_editor page, collecting caption events
# (start/end times in milliseconds plus text) and the community/original
# title and description fields.
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.captions = []
        self.title = ""
        self.description = ""
        self.inittitle = ""
        self.initdescription = ""

    def check_attr(self, attrs, attr, value):
        # True if the tag carries attr with exactly this value
        for item in attrs:
            if item[0] == attr and item[1] == value:
                return True
        return False

    def get_attr(self, attrs, attr):
        # Return the value of attr, or False if the tag does not carry it
        for item in attrs:
            if item[0] == attr:
                return item[1]
        return False

    def handle_starttag(self, tag, attrs):
        if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time"):
            self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""})
        elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time"):
            self.captions[len(self.captions)-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms"))
        elif tag == "input" and self.check_attr(attrs, "id", "metadata-title"):
            self.title = self.get_attr(attrs, "value")
        elif tag == "textarea" and self.check_attr(attrs, "id", "metadata-description"):
            self.initdescription = self.get_attr(attrs, "data-original-description")

    def handle_data(self, data):
        if self.get_starttag_text() and self.get_starttag_text().startswith("<textarea "):
            if 'name="serve_text"' in self.get_starttag_text():
                self.captions[len(self.captions)-1]["text"] += data
            elif 'id="metadata-description"' in self.get_starttag_text():
                self.description += data
        elif self.get_starttag_text() and self.get_starttag_text().startswith('<div id="original-video-title"'):
            self.inittitle += data
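# Example (hedged sketch, not from the original script): a hand-written stand-in
# for a fragment of the timedtext_editor page, showing which attributes the
# handlers above key on. The [:-9] trims in subprrun() suggest real pages append
# a fixed-length artifact to the caption text, which this toy fragment omits.
#     demo = MyHTMLParser()
#     demo.feed('<input class="yt-uix-form-input-text event-time-field event-start-time" data-start-ms="1000">'
#               '<input class="yt-uix-form-input-text event-time-field event-end-time" data-end-ms="2500">'
#               '<textarea name="serve_text">Example caption line</textarea>')
#     # demo.captions == [{'startTime': 1000, 'text': 'Example caption line', 'endTime': 2500}]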
def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaptions):
    if mode == "forceedit-metadata":
        while needforcemetadata[langcode] == None: #extra logic
            print("Awaiting forcemetadata")
            sleep(1)
        if needforcemetadata[langcode] == False:
            #print("forcemetadata not needed")
            return True #nothing needs to be done, otherwise, continue

    if mode == "forceedit-captions":
        while needforcecaptions[langcode] == None: #extra logic
            print("Awaiting forcecaptions")
            sleep(1)
        if needforcecaptions[langcode] == False:
            #print("forcecaptions not needed")
            return True #nothing needs to be done, otherwise, continue

    collect() #cleanup memory

    vid = vid.strip()
    print(langcode, vid)

    # Retry until a logged-in, non-rate-limited editor page is returned
    while True:
        try:
            if mode == "default":
                pparams = (
                    ("v", vid),
                    ("lang", langcode),
                    ("action_mde_edit_form", 1),
                    ("bl", "vmp"),
                    ("ui", "hd"),
                    ("tab", "captions"),
                    ("o", "U")
                )

                page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)

            elif mode == "forceedit-metadata":
                pparams = (
                    ("v", vid),
                    ("lang", langcode),
                    ("action_mde_edit_form", 1),
                    ('forceedit', 'metadata'),
                    ('tab', 'metadata')
                )

                page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)

            elif mode == "forceedit-captions":
                pparams = (
                    ("v", vid),
                    ("lang", langcode),
                    ("action_mde_edit_form", 1),
                    ("bl", "vmp"),
                    ("ui", "hd"),
                    ('forceedit', 'captions'),
                    ("tab", "captions"),
                    ("o", "U")
                )

                page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)

            if not "accounts.google.com" in page.url and page.status_code != 429 and 'Subtitles/CC' in page.text and ('Title &amp; description' in page.text or 'Title and description' in page.text):
                break
            else:
                print("[Retrying in 30 seconds for rate limit or login failure] Please supply authentication cookie information in config.json or environment variables. See README.md for more information.")
                sleep(30)
        except:
            print("Error in request, retrying in 5 seconds...")
            sleep(5)

    inttext = page.text

    try:
        initlang = page.text.split("'metadataLanguage': \"", 1)[1].split('"', 1)[0]
    except:
        initlang = ""
    del page

    filestring = "_community_draft"
    if '<li id="captions-editor-nav-captions" role="tab" data-state="published" class="published">' in inttext:
        filestring = "_community_published"

    if mode == "forceedit-captions":
        filestring = "_community_draft"

    if 'title="The video owner already provided subtitles/CC"' in inttext:
        filestring = "_uploader_provided"

    if not "forceedit" in mode:
        # On the first (default) pass, record whether a force-edit pass is needed
        if '&amp;forceedit=metadata&amp;tab=metadata">See latest</a>' in inttext:
            print("Need forcemetadata")
            needforcemetadata[langcode] = True
        else:
            needforcemetadata[langcode] = False

        if '<li id="captions-editor-nav-captions" role="tab" data-state="published" class="published">' in inttext:
            print("Need forcecaptions")
            needforcecaptions[langcode] = True
        else:
            needforcecaptions[langcode] = False

    if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext or 'data-state="published"' in inttext or 'title="The video owner already provided subtitles/CC"' in inttext: #quick way of checking if this page is worth parsing
        parser = MyHTMLParser()
        parser.feed(inttext)

        captiontext = False
        for item in parser.captions:
            if item["text"][:-9]:
                captiontext = True

        if captiontext and (mode == "default" or mode == "forceedit-captions"):
            myfs = open("out/"+vid+"/"+vid+"_"+langcode+filestring+".sbv", "w", encoding="utf-8")
            captions = parser.captions
            captions.pop(0) #get rid of the fake one
            while captions:
                item = captions.pop(0)
                myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=item["startTime"])) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=item["endTime"])) + "\n" + item["text"][:-9] + "\n")
                del item
                if captions:
                    myfs.write("\n")
            del captions
            myfs.close()
            del myfs
        del captiontext

        if (parser.title or parser.description[:-16]) and (mode == "default" or mode == "forceedit-metadata"):
            metadata = {}
            metadata["title"] = parser.title
            if metadata["title"] == False:
                metadata["title"] = ""
            metadata["description"] = parser.description[:-16]

            filestring = "_community_draft"
            if '<li id="captions-editor-nav-metadata" role="tab" data-state="published" class="published">' in inttext:
                filestring = "_community_published"

            if mode == "forceedit-metadata":
                filestring = "_community_draft"

            open("out/"+vid+"/"+vid+"_"+langcode+filestring+".json", "w", encoding="utf-8").write(dumps(metadata))
            del metadata

        if (parser.inittitle[9:-17] or parser.initdescription) and (mode == "default" or mode == "forceedit-metadata" and initlang):
            metadata = {}
            metadata["title"] = parser.inittitle[9:-17]
            if metadata["title"] == False:
                metadata["title"] = ""
            metadata["description"] = parser.initdescription

            filestring = "_uploader_provided"
            open("out/"+vid+"/"+vid+"_"+initlang+filestring+".json", "w", encoding="utf-8").write(dumps(metadata))
            del metadata

    del inttext

    del langcode
    del vid
    del pparams

    return True
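# Example (hedged, illustrative only): a direct single-video call using the
# current subprrun() signature. "mysession" is assumed to be a requests.Session
# carrying the HSID/SSID/SID cookies described in the commented-out driver
# below; "VIDEOIDxxxx" is a placeholder, and the two dicts are the shared
# per-language state that the force-edit passes poll. Assumes out/VIDEOIDxxxx/
# already exists, as the driver below creates it.
#     needforcemetadata = {"en": None}
#     needforcecaptions = {"en": None}
#     subprrun(mysession, "en", "VIDEOIDxxxx", "default", needforcemetadata, needforcecaptions)
# Note: the commented-out __main__ block below appears to target an older
# subprrun(jobs, mysession) interface and would need updating to match the
# signature above.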
# if __name__ == "__main__":
#     from os import environ, mkdir
#     from os.path import isfile
#     from json import loads

#     #HSID, SSID, SID cookies required
#     if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
#         cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
#     elif isfile("config.json"):
#         cookies = loads(open("config.json").read())
#     else:
#         print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
#         assert False

#     if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
#         print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
#         assert False

#     mysession = requests.session()
#     mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
#     del cookies

#     from sys import argv
#     from queue import Queue
#     from threading import Thread

#     langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
#     'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
#     'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
#     'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
#     'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
#     'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
#     'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
#     'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
#     'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
#     'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
#     'xh', 'yi', 'yo', 'zu']

#     vidl = argv
#     vidl.pop(0)

#     try:
#         mkdir("out")
#     except:
#         pass

#     jobs = Queue()
#     for video in vidl:
#         try:
#             mkdir("out/"+video.strip())
#         except:
#             pass
#         for lang in langs:
#             jobs.put((lang, video, "default"))

#     subthreads = []

#     for r in range(50):
#         subrunthread = Thread(target=subprrun, args=(jobs,mysession))
#         subrunthread.start()
#         subthreads.append(subrunthread)
#         del subrunthread

#     for xa in subthreads:
#         xa.join() #bug (occurred once: the script ended before the last thread finished)
#         subthreads.remove(xa)
#         del xa