Archives community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
You can't select more than 25 topics. Topics must start with a letter or number, can include hyphens ('-'), and can be up to 35 characters long.
 
 

277 linhas
11 KiB

  1. # This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
  2. def timedelta_to_sbv_timestamp(timedelta_timestamp):
  3. r"""
  4. Convert a :py:class:`~datetime.timedelta` to an SRT timestamp.
  5. .. doctest::
  6. >>> import datetime
  7. >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
  8. >>> timedelta_to_sbv_timestamp(delta)
  9. '01:23:04,000'
  10. :param datetime.timedelta timedelta_timestamp: A datetime to convert to an
  11. SBV timestamp
  12. :returns: The timestamp in SBV format
  13. :rtype: str
  14. """
  15. SECONDS_IN_HOUR = 3600
  16. SECONDS_IN_MINUTE = 60
  17. HOURS_IN_DAY = 24
  18. MICROSECONDS_IN_MILLISECOND = 1000
  19. hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
  20. hrs += timedelta_timestamp.days * HOURS_IN_DAY
  21. mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
  22. msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
  23. return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
  24. from datetime import timedelta
  25. from json import dumps
  26. from gc import collect
  27. import requests
  28. # https://docs.python.org/3/library/html.parser.html
  29. from html.parser import HTMLParser
  30. class MyHTMLParser(HTMLParser):
  31. def __init__(self):
  32. HTMLParser.__init__(self)
  33. self.captions = []
  34. self.title = ""
  35. self.description = ""
  36. self.inittitle = ""
  37. self.initdescription = ""
  38. def check_attr(self, attrs, attr, value):
  39. for item in attrs:
  40. if item[0] == attr and item[1] == value:
  41. return True
  42. return False
  43. def get_attr(self, attrs, attr):
  44. for item in attrs:
  45. if item[0] == attr:
  46. return item[1]
  47. return False
  48. def handle_starttag(self, tag, attrs):
  49. if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time"):
  50. self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""})
  51. elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time"):
  52. self.captions[len(self.captions)-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms"))
  53. elif tag == "input" and self.check_attr(attrs, "id", "metadata-title"):
  54. self.title = self.get_attr(attrs, "value")
  55. elif tag == "textarea" and self.check_attr(attrs, "id", "metadata-description"):
  56. self.initdescription = self.get_attr(attrs, "data-original-description")
  57. def handle_data(self, data):
  58. if self.get_starttag_text() and self.get_starttag_text().startswith("<textarea "):
  59. if 'name="serve_text"' in self.get_starttag_text():
  60. self.captions[len(self.captions)-1]["text"] += data
  61. elif 'id="metadata-description"' in self.get_starttag_text():
  62. self.description += data
  63. elif self.get_starttag_text() and self.get_starttag_text().startswith('<div id="original-video-title"'):
  64. self.inittitle += data
def subprrun(jobs, mysession):
    # Worker loop: drains (langcode, video_id, mode) triples from *jobs* and
    # archives community captions/metadata for each into out/<vid>/.
    # *mysession* is an authenticated requests session.  Runs until the queue
    # is empty, then returns True.
    while not jobs.empty():
        collect() #cleanup memory
        langcode, vid, mode = jobs.get()
        vid = vid.strip()
        print(langcode, vid)
        # Fetch the timedtext editor page; the query parameters differ per mode
        # ("default" first pass, or a forced-edit view queued by a prior pass).
        if mode == "default":
            pparams = (
                ("v", vid),
                ("lang", langcode),
                ("action_mde_edit_form", 1),
                ("bl", "vmp"),
                ("ui", "hd"),
                ("tab", "captions"),
                ("o", "U")
            )
            page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
        elif mode == "forceedit-metadata":
            pparams = (
                ("v", vid),
                ("lang", langcode),
                ("action_mde_edit_form", 1),
                ('forceedit', 'metadata'),
                ('tab', 'metadata')
            )
            page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
        elif mode == "forceedit-captions":
            pparams = (
                ("v", vid),
                ("lang", langcode),
                ("action_mde_edit_form", 1),
                ("bl", "vmp"),
                ("ui", "hd"),
                ('forceedit', 'captions'),
                ("tab", "captions"),
                ("o", "U")
            )
            page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
        # A redirect to accounts.google.com means the auth cookies were rejected.
        assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json or environment variables. See README.md for more information."
        inttext = page.text
        try:
            # The page embeds the metadata language in inline JS as
            # 'metadataLanguage': "xx"; empty string when not present.
            initlang = page.text.split("'metadataLanguage': \"", 1)[1].split('"', 1)[0]
        except:
            initlang = ""
        del page
        # Pick the output-filename suffix from the page state markers.
        filestring = "_community_draft"
        if '<li id="captions-editor-nav-captions" role="tab" data-state="published" class="published">' in inttext:
            filestring = "_community_published"
            # In forceedit-captions mode we are viewing the draft even when a
            # published version exists.
            if mode == "forceedit-captions":
                filestring = "_community_draft"
        if 'title="The video owner already provided subtitles/CC"' in inttext:
            filestring = "_uploader_provided"
        if not "forceedit" in mode:
            # First pass only: queue follow-up jobs for the editable views when
            # the page advertises newer (draft/published) community content.
            if '&amp;forceedit=metadata&amp;tab=metadata">See latest</a>' in inttext:
                jobs.put((langcode, vid, "forceedit-metadata"))
            if '<li id="captions-editor-nav-captions" role="tab" data-state="published" class="published">' in inttext:
                jobs.put((langcode, vid, "forceedit-captions"))
        if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext or 'data-state="published"' in inttext or 'title="The video owner already provided subtitles/CC"' in inttext: #quick way of checking if this page is worth parsing
            parser = MyHTMLParser()
            parser.feed(inttext)
            captiontext = False
            for item in parser.captions:
                # [:-9] strips a fixed-length trailing chunk of editor UI text
                # from each cue; a non-empty remainder means a real cue exists.
                # NOTE(review): the 9-char suffix length is an assumption baked
                # into the scrape — confirm against a live page.
                if item["text"][:-9]:
                    captiontext = True
            if captiontext and (mode == "default" or mode == "forceedit-captions"):
                # Write the cues out as an .sbv file.
                myfs = open("out/"+vid+"/"+vid+"_"+langcode+filestring+".sbv", "w", encoding="utf-8")
                captions = parser.captions
                captions.pop(0) #get rid of the fake one
                while captions:
                    item = captions.pop(0)
                    myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=item["startTime"])) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=item["endTime"])) + "\n" + item["text"][:-9] + "\n")
                    del item
                    # Blank line between cues, but not after the last one.
                    if captions:
                        myfs.write("\n")
                del captions
                myfs.close()
                del myfs
            del captiontext
            # Community-contributed title/description ([:-16] strips trailing
            # editor UI text — TODO(review): confirm the 16-char suffix).
            if (parser.title or parser.description[:-16]) and (mode == "default" or mode == "forceedit-metadata"):
                metadata = {}
                metadata["title"] = parser.title
                # get_attr() returns False for a missing attribute; normalise.
                if metadata["title"] == False:
                    metadata["title"] = ""
                metadata["description"] = parser.description[:-16]
                filestring = "_community_draft"
                if '<li id="captions-editor-nav-metadata" role="tab" data-state="published" class="published">' in inttext:
                    filestring = "_community_published"
                    if mode == "forceedit-metadata":
                        filestring = "_community_draft"
                open("out/"+vid+"/"+vid+"_"+langcode+filestring+".json", "w", encoding="utf-8").write(dumps(metadata))
                del metadata
            # Uploader-provided title/description, filed under the metadata
            # language detected from the page ([9:-17] strips surrounding
            # markup captured with the title — TODO(review): confirm offsets).
            if (parser.inittitle[9:-17] or parser.initdescription) and (mode == "default" or mode == "forceedit-metadata" and initlang):
                metadata = {}
                metadata["title"] = parser.inittitle[9:-17]
                if metadata["title"] == False:
                    metadata["title"] = ""
                metadata["description"] = parser.initdescription
                filestring = "_uploader_provided"
                open("out/"+vid+"/"+vid+"_"+initlang+filestring+".json", "w", encoding="utf-8").write(dumps(metadata))
                del metadata
        del inttext
        del langcode
        del vid
        del pparams
        jobs.task_done()
    return True
  171. if __name__ == "__main__":
  172. from os import environ, mkdir
  173. from os.path import isfile
  174. from json import loads
  175. #HSID, SSID, SID cookies required
  176. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  177. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  178. elif isfile("config.json"):
  179. cookies = loads(open("config.json").read())
  180. else:
  181. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  182. assert False
  183. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  184. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  185. assert False
  186. mysession = requests.session()
  187. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  188. del cookies
  189. from sys import argv
  190. from queue import Queue
  191. from threading import Thread
  192. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  193. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  194. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  195. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  196. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  197. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  198. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  199. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  200. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  201. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  202. 'xh', 'yi', 'yo', 'zu']
  203. vidl = argv
  204. vidl.pop(0)
  205. try:
  206. mkdir("out")
  207. except:
  208. pass
  209. jobs = Queue()
  210. for video in vidl:
  211. try:
  212. mkdir("out/"+video.strip())
  213. except:
  214. pass
  215. for lang in langs:
  216. jobs.put((lang, video, "default"))
  217. subthreads = []
  218. for r in range(50):
  219. subrunthread = Thread(target=subprrun, args=(jobs,mysession))
  220. subrunthread.start()
  221. subthreads.append(subrunthread)
  222. del subrunthread
  223. for xa in subthreads:
  224. xa.join() #bug (occurred once: the script ended before the last thread finished)
  225. subthreads.remove(xa)
  226. del xa