Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits
Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

257 рядки
12 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import dumps, loads
  7. import signal
  8. import tracker
  9. from youtube_dl import YoutubeDL
  10. from shutil import rmtree
  11. from queue import Queue
  12. from gc import collect
  13. from datetime import timedelta, datetime
  14. from discovery import getmetadata
  15. from export import subprrun
  16. #useful Queue example: https://stackoverflow.com/a/54658363
  17. jobs = Queue()
  18. langcnt = {}
  19. lasttask = datetime.min
  20. try:
  21. mkdir("out")
  22. except:
  23. pass
  24. try:
  25. mkdir("directory")
  26. except:
  27. pass
  28. HEROKU = False
  29. if isfile("../Procfile"):
  30. HEROKU = True
  31. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  32. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  33. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  34. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  35. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  36. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  37. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  38. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  39. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  40. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  41. 'xh', 'yi', 'yo', 'zu']
  42. #HSID, SSID, SID cookies required
  43. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  44. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  45. elif isfile("config.json"):
  46. cookies = loads(open("config.json").read())
  47. else:
  48. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  49. assert False
  50. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  51. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  52. assert False
  53. mysession = requests.session()
  54. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  55. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  56. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  # Second check on the fetched editor page: the response body must contain
  # the footer language-picker button whose label reads "English"; otherwise
  # start-up aborts with a request to switch the account language.
  # NOTE(review): this is an exact substring match, so the line breaks and
  # indentation inside the triple-quoted literal are significant; they cannot
  # be confirmed from this listing — verify against the real file.
  57. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  58. Language:
  59. </span>
  60. English
  61. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  # The response is no longer needed once both checks have passed.
  62. del validationtest
  # Write the three cookies to cookies.txt by substituting the [SID], [HSID]
  # and [SSID] placeholders; YoutubeDL is later configured with
  # "cookiefile": "cookies.txt".
  # NOTE(review): the file handle is never closed explicitly, and the field
  # separators in this template look like they were collapsed to spaces by
  # the listing (cookie files of this kind are presumably tab-separated) —
  # confirm against the real file.
  63. open("cookies.txt", "w").write("""# HTTP Cookie File
  64. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  65. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  66. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  # Drop the cookie values from the module namespace after they are written.
  67. del cookies
  68. #Graceful Shutdown
  69. class GracefulKiller:
  70. kill_now = False
  71. def __init__(self):
  72. signal.signal(signal.SIGINT, self.exit_gracefully)
  73. signal.signal(signal.SIGTERM, self.exit_gracefully)
  74. def exit_gracefully(self, signum, frame):
  75. print("Graceful exit process initiated, stopping all tasks...")
  76. self.kill_now = True
  77. gkiller = GracefulKiller()
  78. #microtasks
  79. def threadrunner(jobs: Queue):
  80. global langcnt
  81. global lasttask
  82. ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
  83. while not gkiller.kill_now:
  84. if not jobs.empty():
  85. task, vid, args = jobs.get()
  86. if task == "submitdiscovery":
  87. tracker.add_item_to_tracker(args, vid)
  88. elif task == "discovery":
  89. while True:
  90. try:
  91. info = getmetadata(mysession, str(vid).strip())
  92. break
  93. except BaseException as e:
  94. print(e)
  95. print("Error in retrieving information, waiting 30 seconds")
  96. sleep(30)
  97. if info[0] or info[1]: # ccenabled or creditdata
  98. if not isdir("out/"+str(vid).strip()):
  99. mkdir("out/"+str(vid).strip())
  100. if info[1]:
  101. open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  102. if info[0]:
  103. langcnt[vid] = 0
  104. for langcode in langs:
  105. jobs.put(("subtitles", vid, langcode))
  106. else:
  107. jobs.put(("complete", None, "video:"+vid))
  108. for videodisc in info[2]:
  109. jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
  110. for channeldisc in info[3]:
  111. jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
  112. for mixdisc in info[4]:
  113. jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
  114. for playldisc in info[5]:
  115. jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
  116. elif task == "subtitles":
  117. retval = subprrun(jobs, mysession, args, vid, "default")
  118. langcnt[vid] += retval
  119. if langcnt[vid] >= 585:
  120. jobs.put(("complete", None, "video:"+vid))
  121. elif task == "subtitles-forceedit-captions":
  122. subprrun(jobs, mysession, args, vid, "forceedit-captions")
  123. elif task == "subtitles-forceedit-metadata":
  124. subprrun(jobs, mysession, args, vid, "forceedit-metadata")
  125. elif task == "channel":
  126. y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
  127. for itemyv in y["entries"]:
  128. jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
  129. jobs.put(("complete", None, "channel:"+args))
  130. elif task == "playlist":
  131. y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
  132. for itemyvp in y["entries"]:
  133. jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
  134. jobs.put(("complete", None, "playlist:"+args))
  135. elif task == "complete":
  136. size = 0
  137. if ":" in args:
  138. if args.split(":", 1)[0] == "video":
  139. #check if dir is empty, make zip if needed
  140. if isdir("out/"+args.split(":", 1)[1]):
  141. if not listdir("out/"+args.split(":", 1)[1]):
  142. rmdir("out/"+args.split(":", 1)[1])
  143. else:
  144. #zip it up
  145. if not isdir("directory/"+args.split(":", 1)[1]):
  146. mkdir("directory/"+args.split(":", 1)[1])
  147. while not isfile("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip"):
  148. print("Attempting to zip item...")
  149. system("zip -9 -r directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip out/"+args.split(":", 1)[1]+"/*")
  150. #get a target
  151. targetloc = None
  152. while not targetloc:
  153. targetloc = tracker.request_upload_target()
  154. if targetloc:
  155. break
  156. else:
  157. print("Waiting 5 minutes...")
  158. sleep(300)
  159. if targetloc.startswith("rsync"):
  160. system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 --files-from=- directory/"+args.split(":", 1)[1]+"/ "+targetloc)
  161. elif targetloc.startswith("http"):
  162. system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
  163. size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip")
  164. #cleanup
  165. try:
  166. del langcnt[args.split(":", 1)[1]]
  167. rmtree("directory/"+args.split(":", 1)[1]+"/")
  168. rmdir("directory/"+args.split(":", 1)[1]+"/")
  169. rmtree("out/"+args.split(":", 1)[1]+"/")
  170. rmdir("out/"+args.split(":", 1)[1]+"/")
  171. except:
  172. pass
  173. tracker.mark_item_as_done(args, size)
  174. jobs.task_done()
  175. else:
  176. # get a new task from tracker
  177. if datetime.now() - lasttask > timedelta(seconds=15): #only retrieve a task every 15 seconds to allow queue to build up
  178. collect() #cleanup
  179. desit = tracker.request_item_from_tracker()
  180. print("New task:", desit)
  181. if desit:
  182. if desit.split(":", 1)[0] == "video":
  183. lasttask = datetime.now()
  184. jobs.put(("discovery", desit.split(":", 1)[1], None))
  185. elif desit.split(":", 1)[0] == "channel":
  186. lasttask = datetime.now()
  187. jobs.put(("channel", None, desit.split(":", 1)[1]))
  188. elif desit.split(":", 1)[0] == "playlist":
  189. lasttask = datetime.now()
  190. jobs.put(("playlist", None, desit.split(":", 1)[1]))
  191. else:
  192. print("Ignoring item for now", desit)
  193. else:
  194. print("Ignoring item for now", desit)
  195. else:
  196. sleep(1)
  197. threads = []
  198. #start with 1 thread, give it a 5 second head start
  199. runthread = Thread(target=threadrunner, args=(jobs,))
  200. runthread.start()
  201. threads.append(runthread)
  202. del runthread
  203. sleep(5)
  204. #now create the other 49 threads
  205. for i in range(49):
  206. runthread = Thread(target=threadrunner, args=(jobs,))
  207. runthread.start()
  208. threads.append(runthread)
  209. del runthread
  210. #https://stackoverflow.com/a/11968881
  211. for x in threads:
  212. x.join()
  213. threads.remove(x)
  214. del x
  215. print("Exiting...")