Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
You can't select more than 25 topics. Topics must start with a letter or number, can include hyphens ('-') and can be up to 35 characters long.
 
 

288 lines
18 KiB

  1. from os import system
  2. from os.path import isfile
  3. from sys import exit
  4. HEROKU = False
  5. if isfile("../Procfile") and isfile("../requirements.txt"):
  6. print("Heroku detected... using 20 threads instead of 50.")
  7. HEROKU = True
  8. if HEROKU:
  9. if not "aioquic" in open("../requirements.txt").read():
  10. print("Installing aioquic on this Heroku instance since it wasn't installed on deploy...")
  11. system("python3 -m pip install --user aioquic")
  12. system("python3 worker.py")
  13. exit(0)
  14. from threading import Thread
  15. import requests
  16. from time import sleep
  17. from os import mkdir, rmdir, listdir, environ
  18. from os.path import isdir, getsize
  19. from json import dumps, loads
  20. import signal
  21. import tracker
  22. from youtube_dl import YoutubeDL
  23. from shutil import rmtree, which
  24. from queue import Queue
  25. from gc import collect
  26. from discovery import getmetadata
  27. from export import subprrun
  28. #useful Queue example: https://stackoverflow.com/a/54658363
  29. jobs = Queue()
  30. try:
  31. mkdir("out")
  32. except:
  33. pass
  34. try:
  35. mkdir("directory")
  36. except:
  37. pass
  38. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  39. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  40. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  41. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  42. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  43. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  44. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  45. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  46. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  47. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  48. 'xh', 'yi', 'yo', 'zu']
# Fail fast if any required external tool is missing: "zip" packages finished
# items, and "rsync"/"curl" upload them to the tracker-assigned target.
assert which("zip") and which("rsync") and which("curl"), "Please ensure the zip, rsync, and curl commands are installed on your system."
  50. #HSID, SSID, SID cookies required
  51. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  52. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  53. elif isfile("config.json"):
  54. cookies = loads(open("config.json").read())
  55. else:
  56. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  57. assert False
  58. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  59. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  60. assert False
  61. mysession = requests.session()
  62. allheaders = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}
  63. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  64. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  65. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  66. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  67. Language:
  68. </span>
  69. English
  70. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  71. del validationtest
  72. open("cookies.txt", "w").write("""# HTTP Cookie File
  73. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  74. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  75. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  76. del cookies
  77. #Graceful Shutdown
  78. class GracefulKiller:
  79. kill_now = False
  80. def __init__(self):
  81. signal.signal(signal.SIGINT, self.exit_gracefully)
  82. signal.signal(signal.SIGTERM, self.exit_gracefully)
  83. def exit_gracefully(self, signum, frame):
  84. print("Graceful exit process initiated, no longer accepting new tasks but finishing existing ones...")
  85. self.kill_now = True
  86. gkiller = GracefulKiller()
#microtasks
def threadrunner():
    """Worker loop: pulls (task, vid, args) microtasks from a queue and runs them.

    Each worker thread executes this function until the graceful-shutdown flag
    is set and its queue is empty.

    NOTE(review): ``jobs = Queue()`` below creates a thread-LOCAL queue that
    shadows the module-level ``jobs``, so tasks queued by one worker are only
    ever consumed by that same worker — confirm this is intentional.
    """
    jobs = Queue()
    # Flat-extraction youtube-dl instance used only to enumerate the video ids
    # inside channels/playlists; "simulate"/"skip_download" mean nothing is fetched.
    ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
    while True:
        if not jobs.empty():
            task, vid, args = jobs.get()
            if task == "submitdiscovery":
                # Report a newly discovered item (video/channel/playlist) to the tracker.
                tracker.add_item_to_tracker(args, vid)
            elif task == "discovery":
                # Fetch metadata for one video, retrying forever with a 30 s backoff.
                while True:
                    try:
                        info = getmetadata(mysession, str(vid).strip(), allheaders)
                        break
                    except BaseException as e:
                        print(e)
                        print("Error in retrieving information, waiting 30 seconds and trying again")
                        #raise
                        sleep(30)
                if info[0] or info[1]: # ccenabled or creditdata
                    if not isdir("out/"+str(vid).strip()):
                        mkdir("out/"+str(vid).strip())
                    if info[1]:
                        # Save the published caption credits as JSON.
                        open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))
                    if info[0]:
                        # Community captions enabled: queue one subtitle task per
                        # language for each of the three export modes.
                        for langcode in langs:
                            jobs.put(("subtitles", vid, langcode))
                        for langcode in langs:
                            jobs.put(("subtitles-forceedit-metadata", vid, langcode))
                        for langcode in langs:
                            jobs.put(("subtitles-forceedit-captions", vid, langcode))
                # Queue the completion task after the subtitle tasks, then submit
                # every related item discovered in the metadata.
                jobs.put(("complete", None, "video:"+vid))
                for videodisc in info[2]:
                    jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
                for channeldisc in info[3]:
                    jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
                for mixdisc in info[4]:
                    jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
                for playldisc in info[5]:
                    jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
            elif task == "subtitles":
                # NOTE(review): needforcemetadata/needforcecaptions are only
                # assigned in the "video" branch further down; subtitle tasks are
                # always queued after that branch ran in this same thread, so the
                # names should be bound here — confirm this ordering invariant.
                subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions, allheaders)
            elif task == "subtitles-forceedit-captions":
                subprrun(mysession, args, vid, "forceedit-captions", needforcemetadata, needforcecaptions, allheaders)
            elif task == "subtitles-forceedit-metadata":
                subprrun(mysession, args, vid, "forceedit-metadata", needforcemetadata, needforcecaptions, allheaders)
            elif task == "channel":
                # NOTE(review): these handlers read ``desit`` (set when the item
                # was requested from the tracker below) rather than ``args``;
                # both should hold the same id for a thread-local queue.
                try:
                    y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
                    for itemyv in y["entries"]:
                        jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
                    jobs.put(("complete", None, "channel:"+args))
                except:
                    print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/channel/"+desit.split(":", 1)[1])
            elif task == "playlist":
                try:
                    y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
                    for itemyvp in y["entries"]:
                        jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
                    jobs.put(("complete", None, "playlist:"+args))
                except:
                    print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/playlist?list="+desit.split(":", 1)[1])
            elif task == "complete":
                # Zip a finished video item, upload it to a tracker-assigned
                # target, clean up, and mark the item done. Channel/playlist
                # items skip straight to mark_item_as_done with size 0.
                size = 0
                if ":" in args:
                    if args.split(":", 1)[0] == "video":
                        #check if dir is empty, make zip if needed
                        if isdir("out/"+args.split(":", 1)[1]):
                            if not listdir("out/"+args.split(":", 1)[1]):
                                rmdir("out/"+args.split(":", 1)[1])
                            else:
                                #zip it up
                                if not isdir("directory/"+args.split(":", 1)[1]):
                                    mkdir("directory/"+args.split(":", 1)[1])
                                # Retry zipping until the archive exists on disk.
                                while not isfile("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip"):
                                    print("Attempting to zip item...")
                                    system("zip -9 -r -j directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip out/"+args.split(":", 1)[1])
                                #get a target
                                # Poll the tracker for an upload target, waiting
                                # 5 minutes between unsuccessful attempts.
                                targetloc = None
                                while not targetloc:
                                    targetloc = tracker.request_upload_target()
                                    if targetloc:
                                        break
                                    else:
                                        print("Waiting 5 minutes...")
                                        sleep(300)
                                # Upload via rsync or HTTP POST depending on the target scheme.
                                if targetloc.startswith("rsync"):
                                    system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
                                elif targetloc.startswith("http"):
                                    system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
                                size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip")
                                #cleanup
                                try:
                                    # NOTE(review): ``langcnt`` is never defined in
                                    # this file, so this ``del`` raises NameError and
                                    # the bare except skips ALL of the cleanup below —
                                    # the out/ and directory/ trees are never removed.
                                    # Needs a fix.
                                    del langcnt[args.split(":", 1)[1]]
                                    rmtree("directory/"+args.split(":", 1)[1]+"/")
                                    rmdir("directory/"+args.split(":", 1)[1]+"/")
                                    rmtree("out/"+args.split(":", 1)[1]+"/")
                                    rmdir("out/"+args.split(":", 1)[1]+"/")
                                except:
                                    pass
                tracker.mark_item_as_done(args, size)
            jobs.task_done()
        else:
            if not gkiller.kill_now:
                # get a new task from tracker
                collect() #cleanup
                desit = tracker.request_item_from_tracker()
                print("New task:", desit)
                if desit:
                    if desit.split(":", 1)[0] == "video":
                        # Per-language bookkeeping dicts consulted/updated by the
                        # three subtitle export passes for this video.
                        needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
                        'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
                        'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
                        'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
                        'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
                        'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
                        'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
                        'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
                        'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
                        'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
                        'xh': None, 'yi': None, 'yo': None, 'zu': None}
                        needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
                        'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
                        'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
                        'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
                        'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
                        'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
                        'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
                        'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
                        'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
                        'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
                        'xh': None, 'yi': None, 'yo': None, 'zu': None}
                        jobs.put(("discovery", desit.split(":", 1)[1], None))
                    elif desit.split(":", 1)[0] == "channel":
                        jobs.put(("channel", None, desit.split(":", 1)[1]))
                    elif desit.split(":", 1)[0] == "playlist":
                        jobs.put(("playlist", None, desit.split(":", 1)[1]))
                    else:
                        print("Ignoring item for now", desit)
                else:
                    print("Ignoring item for now", desit)
            else:
                # Shutdown requested and queue drained: end this worker thread.
                break
  230. threads = []
  231. THREADCNT = 50
  232. if HEROKU:
  233. THREADCNT = 20
  234. #now create the rest of the threads
  235. for i in range(THREADCNT):
  236. runthread = Thread(target=threadrunner)
  237. runthread.start()
  238. threads.append(runthread)
  239. del runthread
  240. #https://stackoverflow.com/a/11968881
  241. for x in threads:
  242. x.join()
  243. threads.remove(x)
  244. del x
  245. print("Exiting...")