archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 

137 Zeilen
5.2 KiB

  1. # This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
  2. def timedelta_to_sbv_timestamp(timedelta_timestamp):
  3. r"""
  4. Convert a :py:class:`~datetime.timedelta` to an SRT timestamp.
  5. .. doctest::
  6. >>> import datetime
  7. >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
  8. >>> timedelta_to_sbv_timestamp(delta)
  9. '01:23:04,000'
  10. :param datetime.timedelta timedelta_timestamp: A datetime to convert to an
  11. SBV timestamp
  12. :returns: The timestamp in SBV format
  13. :rtype: str
  14. """
  15. SECONDS_IN_HOUR = 3600
  16. SECONDS_IN_MINUTE = 60
  17. HOURS_IN_DAY = 24
  18. MICROSECONDS_IN_MILLISECOND = 1000
  19. hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
  20. hrs += timedelta_timestamp.days * HOURS_IN_DAY
  21. mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
  22. msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
  23. return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
  24. from datetime import timedelta
  25. from json import dumps
  26. import requests
  27. # https://docs.python.org/3/library/html.parser.html
  28. from html.parser import HTMLParser
  29. class MyHTMLParser(HTMLParser):
  30. def __init__(self):
  31. HTMLParser.__init__(self)
  32. self.captions = []
  33. self.captiontext = True
  34. self.title = ""
  35. self.description = ""
  36. def check_attr(self, attrs, attr, value):
  37. for item in attrs:
  38. if item[0] == attr and item[1] == value:
  39. return True
  40. return False
  41. def get_attr(self, attrs, attr):
  42. for item in attrs:
  43. if item[0] == attr:
  44. return item[1]
  45. return False
  46. def handle_starttag(self, tag, attrs):
  47. if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time") and not ' data-segment-id="" ' in self.get_starttag_text():
  48. self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""})
  49. elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time") and not ' data-segment-id="" ' in self.get_starttag_text():
  50. self.captions[len(self.captions)-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms"))
  51. # elif tag == "textarea" and self.check_attr(attrs, "class", "yt-uix-form-input-textarea event-text goog-textarea"):
  52. # if len(self.captions):
  53. # self.datatarget = len(self.captions)-1
  54. # else:
  55. # self.datatarget = 0
  56. elif tag == "input" and self.check_attr(attrs, "id", "metadata-title"):
  57. self.title = self.get_attr(attrs, "value")
  58. # elif tag == "textarea" and self.check_attr(attrs, "id", "metadata-description"):
  59. # self.datatarget = "description"
  60. # def handle_endtag(self, tag):
  61. # if tag == "textarea":
  62. # self.datatarget = None
  63. def handle_data(self, data):
  64. if self.get_starttag_text() and self.get_starttag_text().startswith("<textarea "):
  65. if 'name="serve_text"' in self.get_starttag_text() and not 'data-segment-id=""' in self.get_starttag_text():
  66. self.captions[len(self.captions)-1]["text"] += data
  67. self.captiontext = True
  68. elif 'id="metadata-description"' in self.get_starttag_text():
  69. self.description += data
  70. def subprrun(jobs, headers):
  71. while not jobs.empty():
  72. langcode, vid = jobs.get()
  73. vid = vid.strip()
  74. print(langcode, vid)
  75. pparams = (
  76. ("v", vid),
  77. ("lang", langcode),
  78. ("action_mde_edit_form", 1),
  79. ("bl", "vmp"),
  80. ("ui", "hd"),
  81. ("tab", "captions"),
  82. ("o", "U")
  83. )
  84. page = requests.get("https://www.youtube.com/timedtext_editor", headers=headers, params=pparams)
  85. assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."
  86. parser = MyHTMLParser()
  87. parser.feed(page.text)
  88. del page
  89. if parser.captiontext:
  90. myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
  91. captions = parser.captions
  92. captions.pop(0) #get rid of the fake one
  93. while captions:
  94. item = captions.pop(0)
  95. myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=item["startTime"])) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=item["endTime"])) + "\n" + item["text"][:-9] + "\n")
  96. del item
  97. if captions:
  98. myfs.write("\n")
  99. del captions
  100. myfs.close()
  101. del myfs
  102. if parser.title or parser.description:
  103. metadata = {}
  104. metadata["title"] = parser.title
  105. metadata["description"] = parser.description[:-16]
  106. open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
  107. del metadata
  108. del langcode
  109. del vid
  110. del pparams
  111. jobs.task_done()
  112. return True