archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 

125 lignes
5.0 KiB

  1. # This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
  2. def timedelta_to_sbv_timestamp(timedelta_timestamp):
  3. r"""
  4. Convert a :py:class:`~datetime.timedelta` to an SRT timestamp.
  5. .. doctest::
  6. >>> import datetime
  7. >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
  8. >>> timedelta_to_sbv_timestamp(delta)
  9. '01:23:04,000'
  10. :param datetime.timedelta timedelta_timestamp: A datetime to convert to an
  11. SBV timestamp
  12. :returns: The timestamp in SBV format
  13. :rtype: str
  14. """
  15. SECONDS_IN_HOUR = 3600
  16. SECONDS_IN_MINUTE = 60
  17. HOURS_IN_DAY = 24
  18. MICROSECONDS_IN_MILLISECOND = 1000
  19. hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
  20. hrs += timedelta_timestamp.days * HOURS_IN_DAY
  21. mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
  22. msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
  23. return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
  24. import requests
  25. from bs4 import BeautifulSoup
  26. from datetime import timedelta
  27. import threading
  28. from os import mkdir
  29. from json import loads, dumps
  30. #HSID, SSID, SID cookies required
  31. cookies = loads(open("config.json").read())
  32. mysession = requests.session()
  33. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  34. class subtitlethread(threading.Thread):
  35. def run(self):
  36. langcode, vid = self.getName().split(";", 1)
  37. pparams = (
  38. ("v", vid),
  39. ("lang", langcode),
  40. ("action_mde_edit_form", 1),
  41. ("bl", "vmp"),
  42. ("ui", "hd"),
  43. ("tab", "captions"),
  44. ("o", "U")
  45. )
  46. page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
  47. assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."
  48. soup = BeautifulSoup(page.text, features="html5lib")
  49. divs = soup.find_all("div", class_="timed-event-line")
  50. outtext = ""
  51. for item in divs:
  52. text = item.find("textarea").text
  53. startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
  54. endms = int(item.find("input", class_="event-end-time")["data-end-ms"])
  55. outtext += timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n\n"
  56. open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8").write(outtext[:-1])
  57. if soup.find("li", id="captions-editor-nav-metadata")["data-state"] != "locked":
  58. metadata = {}
  59. try:
  60. metadata["title"] = soup.find("input", id="metadata-title")["value"]
  61. except KeyError:
  62. metadata["title"] = ""
  63. metadata["description"] = soup.find("textarea", id="metadata-description").text
  64. open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
  65. def getsubs(vid):
  66. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  67. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  68. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  69. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  70. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  71. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  72. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  73. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  74. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  75. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  76. 'xh', 'yi', 'yo', 'zu']
  77. threads = []
  78. for langcode in langs:
  79. runthread = subtitlethread(name = langcode+";"+vid)
  80. runthread.start()
  81. threads.append(runthread)
  82. for x in threads:
  83. x.join()
  84. return True
  85. if __name__ == "__main__":
  86. from sys import argv
  87. vidl = argv
  88. vidl.pop(0)
  89. for video in vidl:
  90. try:
  91. mkdir("out")
  92. except:
  93. pass
  94. try:
  95. mkdir("out/"+video)
  96. except:
  97. pass
  98. getsubs(video)