archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 

96 Zeilen
3.3 KiB

  1. # This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
  2. def timedelta_to_sbv_timestamp(timedelta_timestamp):
  3. r"""
  4. Convert a :py:class:`~datetime.timedelta` to an SRT timestamp.
  5. .. doctest::
  6. >>> import datetime
  7. >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
  8. >>> timedelta_to_sbv_timestamp(delta)
  9. '01:23:04,000'
  10. :param datetime.timedelta timedelta_timestamp: A datetime to convert to an
  11. SBV timestamp
  12. :returns: The timestamp in SBV format
  13. :rtype: str
  14. """
  15. SECONDS_IN_HOUR = 3600
  16. SECONDS_IN_MINUTE = 60
  17. HOURS_IN_DAY = 24
  18. MICROSECONDS_IN_MILLISECOND = 1000
  19. hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
  20. hrs += timedelta_timestamp.days * HOURS_IN_DAY
  21. mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
  22. msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
  23. return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
  24. from bs4 import BeautifulSoup
  25. from datetime import timedelta
  26. from json import dumps
  27. import requests
  28. def subprrun(jobs, headers):
  29. while not jobs.empty():
  30. langcode, vid = jobs.get()
  31. print(langcode, vid)
  32. pparams = (
  33. ("v", vid),
  34. ("lang", langcode),
  35. ("action_mde_edit_form", 1),
  36. ("bl", "vmp"),
  37. ("ui", "hd"),
  38. ("tab", "captions"),
  39. ("o", "U")
  40. )
  41. page = requests.get("https://www.youtube.com/timedtext_editor", headers=headers, params=pparams)
  42. assert not "accounts.google.com" in page.url, "Please supply authentication cookie information in config.json. See README.md for more information."
  43. soup = BeautifulSoup(page.text, features="html5lib")
  44. del page
  45. divs = soup.find_all("div", class_="timed-event-line")
  46. myfs = open("out/"+vid+"/"+vid+"_"+langcode+".sbv", "w", encoding="utf-8")
  47. while divs:
  48. item = divs.pop(0)
  49. text = item.find("textarea").text
  50. startms = int(item.find("input", class_="event-start-time")["data-start-ms"])
  51. endms = int(item.find("input", class_="event-end-time")["data-end-ms"])
  52. myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=startms)) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=endms)) + "\n" + text + "\n")
  53. del item
  54. del text
  55. del startms
  56. del endms
  57. if divs:
  58. myfs.write("\n")
  59. del divs
  60. myfs.close()
  61. del myfs
  62. if soup.find("li", id="captions-editor-nav-metadata")["data-state"] != "locked":
  63. metadata = {}
  64. try:
  65. metadata["title"] = soup.find("input", id="metadata-title")["value"]
  66. except KeyError:
  67. metadata["title"] = ""
  68. metadata["description"] = soup.find("textarea", id="metadata-description").text
  69. open("out/"+vid+"/"+vid+"_"+langcode+".json", "w", encoding="utf-8").write(dumps(metadata))
  70. del metadata
  71. del soup
  72. del langcode
  73. del vid
  74. del pparams
  75. jobs.task_done()
  76. return True