archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
You cannot select more than 25 topics. Topics must start with a letter or a number, may contain hyphens ("-"), and can be up to 35 characters long.
 
 

204 lines
8.2 KiB

  1. # This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
  2. def timedelta_to_sbv_timestamp(timedelta_timestamp):
  3. r"""
  4. Convert a :py:class:`~datetime.timedelta` to an SRT timestamp.
  5. .. doctest::
  6. >>> import datetime
  7. >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
  8. >>> timedelta_to_sbv_timestamp(delta)
  9. '01:23:04,000'
  10. :param datetime.timedelta timedelta_timestamp: A datetime to convert to an
  11. SBV timestamp
  12. :returns: The timestamp in SBV format
  13. :rtype: str
  14. """
  15. SECONDS_IN_HOUR = 3600
  16. SECONDS_IN_MINUTE = 60
  17. HOURS_IN_DAY = 24
  18. MICROSECONDS_IN_MILLISECOND = 1000
  19. hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
  20. hrs += timedelta_timestamp.days * HOURS_IN_DAY
  21. mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
  22. msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
  23. return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
  24. from datetime import timedelta
  25. from json import dumps
  26. from gc import collect
  27. import requests
  28. # https://docs.python.org/3/library/html.parser.html
  29. from html.parser import HTMLParser
  30. class MyHTMLParser(HTMLParser):
  31. def __init__(self):
  32. HTMLParser.__init__(self)
  33. self.captions = []
  34. self.title = ""
  35. self.description = ""
  36. def check_attr(self, attrs, attr, value):
  37. for item in attrs:
  38. if item[0] == attr and item[1] == value:
  39. return True
  40. return False
  41. def get_attr(self, attrs, attr):
  42. for item in attrs:
  43. if item[0] == attr:
  44. return item[1]
  45. return False
  46. def handle_starttag(self, tag, attrs):
  47. if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time"):
  48. self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""})
  49. elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time"):
  50. self.captions[len(self.captions)-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms"))
  51. elif tag == "input" and self.check_attr(attrs, "id", "metadata-title"):
  52. self.title = self.get_attr(attrs, "value")
  53. def handle_data(self, data):
  54. if self.get_starttag_text() and self.get_starttag_text().startswith("<textarea "):
  55. if 'name="serve_text"' in self.get_starttag_text():
  56. self.captions[len(self.captions)-1]["text"] += data
  57. elif 'id="metadata-description"' in self.get_starttag_text():
  58. self.description += data
  59. def subprrun(jobs, mysession):
  60. while not jobs.empty():
  61. collect() #cleanup memory
  62. langcode, vid = jobs.get()
  63. vid = vid.strip()
  64. print(langcode, vid)
  65. pparams = (
  66. ("v", vid),
  67. ("lang", langcode),
  68. ("action_mde_edit_form", 1),
  69. ("bl", "vmp"),
  70. ("ui", "hd"),
  71. ("tab", "captions"),
  72. ("o", "U")
  73. )
  74. page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
  75. assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."
  76. inttext = page.text
  77. del page
  78. if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext or 'data-state="published"' in inttext or 'title="The video owner already provided subtitles/CC"' in inttext: #quick way of checking if this page is worth parsing
  79. parser = MyHTMLParser()
  80. parser.feed(inttext)
  81. captiontext = False
  82. for item in parser.captions:
  83. if item["text"][:-9]:
  84. captiontext = True
  85. if captiontext:
  86. myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
  87. captions = parser.captions
  88. captions.pop(0) #get rid of the fake one
  89. while captions:
  90. item = captions.pop(0)
  91. myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=item["startTime"])) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=item["endTime"])) + "\n" + item["text"][:-9] + "\n")
  92. del item
  93. if captions:
  94. myfs.write("\n")
  95. del captions
  96. myfs.close()
  97. del myfs
  98. del captiontext
  99. if parser.title or parser.description[:-16]:
  100. metadata = {}
  101. metadata["title"] = parser.title
  102. if metadata["title"] == False:
  103. metadata["title"] = ""
  104. metadata["description"] = parser.description[:-16]
  105. open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
  106. del metadata
  107. del inttext
  108. del langcode
  109. del vid
  110. del pparams
  111. jobs.task_done()
  112. return True
  113. if __name__ == "__main__":
  114. from os import environ, mkdir
  115. from os.path import isfile
  116. from json import loads
  117. #HSID, SSID, SID cookies required
  118. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  119. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  120. elif isfile("config.json"):
  121. cookies = loads(open("config.json").read())
  122. else:
  123. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  124. assert False
  125. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  126. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  127. assert False
  128. mysession = requests.session()
  129. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  130. del cookies
  131. from sys import argv
  132. from queue import Queue
  133. from threading import Thread
  134. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  135. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  136. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  137. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  138. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  139. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  140. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  141. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  142. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  143. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  144. 'xh', 'yi', 'yo', 'zu']
  145. vidl = argv
  146. vidl.pop(0)
  147. try:
  148. mkdir("out")
  149. except:
  150. pass
  151. jobs = Queue()
  152. for video in vidl:
  153. try:
  154. mkdir("out/"+video.strip())
  155. except:
  156. pass
  157. for lang in langs:
  158. jobs.put((lang, video))
  159. subthreads = []
  160. for r in range(50):
  161. subrunthread = Thread(target=subprrun, args=(jobs,mysession))
  162. subrunthread.start()
  163. subthreads.append(subrunthread)
  164. del subrunthread
  165. for xa in subthreads:
  166. xa.join() #bug (occurred once: the script ended before the last thread finished)
  167. subthreads.remove(xa)
  168. del xa