archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

282 lines
18 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import dumps, loads
  7. HEROKU = False
  8. if isfile("../Procfile") and isfile("../requirements.txt"):
  9. print("Heroku detected... using 20 threads instead of 50.")
  10. HEROKU = True
  11. if HEROKU:
  12. if not "aioquic" in open("../requirements.txt").read():
  13. print("Installing aioquic on this Heroku instance since it wasn't installed on deploy...")
  14. system("pip install --user aioquic")
  15. import signal
  16. import tracker
  17. from youtube_dl import YoutubeDL
  18. from shutil import rmtree, which
  19. from queue import Queue
  20. from gc import collect
  21. from discovery import getmetadata
  22. from export import subprrun
  23. #useful Queue example: https://stackoverflow.com/a/54658363
  24. jobs = Queue()
  25. try:
  26. mkdir("out")
  27. except:
  28. pass
  29. try:
  30. mkdir("directory")
  31. except:
  32. pass
# All YouTube caption/translation language codes enumerated per video
# (BCP-47-style, including regional variants like 'en-GB' and legacy codes
# like 'iw'). One subtitle task is queued per entry and per export mode, so
# the length of this list directly multiplies the per-video workload.
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']
  44. assert which("zip") and which("rsync") and which("curl"), "Please ensure the zip, rsync, and curl commands are installed on your system."
  45. #HSID, SSID, SID cookies required
  46. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  47. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  48. elif isfile("config.json"):
  49. cookies = loads(open("config.json").read())
  50. else:
  51. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  52. assert False
  53. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  54. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  55. assert False
  56. mysession = requests.session()
  57. allheaders = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}
  58. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  59. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  60. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  61. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  62. Language:
  63. </span>
  64. English
  65. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  66. del validationtest
  67. open("cookies.txt", "w").write("""# HTTP Cookie File
  68. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  69. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  70. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  71. del cookies
  72. #Graceful Shutdown
  73. class GracefulKiller:
  74. kill_now = False
  75. def __init__(self):
  76. signal.signal(signal.SIGINT, self.exit_gracefully)
  77. signal.signal(signal.SIGTERM, self.exit_gracefully)
  78. def exit_gracefully(self, signum, frame):
  79. print("Graceful exit process initiated, no longer accepting new tasks but finishing existing ones...")
  80. self.kill_now = True
  81. gkiller = GracefulKiller()
  82. #microtasks
def threadrunner():
    """Worker loop: fetches items from the tracker, expands each into subtasks
    on a thread-local queue, and processes those subtasks until the tracker
    stops handing out work or a graceful shutdown is requested.

    NOTE(review): this local `jobs` Queue shadows the module-level `jobs`, so
    every thread works through its own private task list -- confirm this
    shadowing is intentional before refactoring.
    """
    jobs = Queue()
    # Flat extraction only: channels/playlists are expanded just to video IDs.
    ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
    while True:
        if not jobs.empty():
            # Task tuples are (task, vid, args); the meaning of vid/args
            # varies by task type (e.g. args is a langcode for subtitle tasks).
            task, vid, args = jobs.get()
            if task == "submitdiscovery":
                tracker.add_item_to_tracker(args, vid)
            elif task == "discovery":
                # Retry metadata retrieval forever; BaseException is
                # deliberately broad to survive any transient failure.
                while True:
                    try:
                        info = getmetadata(mysession, str(vid).strip(), allheaders)
                        break
                    except BaseException as e:
                        print(e)
                        print("Error in retrieving information, waiting 30 seconds and trying again")
                        #raise
                        sleep(30)
                # info layout (from discovery.getmetadata, inferred from use
                # here -- confirm against discovery.py): [0] ccenabled,
                # [1] credit data, [2] discovered videos, [3] channels,
                # [4] mix playlists, [5] playlists.
                if info[0] or info[1]: # ccenabled or creditdata
                    if not isdir("out/"+str(vid).strip()):
                        mkdir("out/"+str(vid).strip())
                    if info[1]:
                        # NOTE(review): file handle is not closed explicitly;
                        # relies on CPython refcounting to flush/close.
                        open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))
                    if info[0]:
                        # One subtitle task per language and per export mode.
                        for langcode in langs:
                            jobs.put(("subtitles", vid, langcode))
                        for langcode in langs:
                            jobs.put(("subtitles-forceedit-metadata", vid, langcode))
                        for langcode in langs:
                            jobs.put(("subtitles-forceedit-captions", vid, langcode))
                # Queue completion last so it runs after all subtitle tasks
                # (FIFO order within this thread's private queue).
                jobs.put(("complete", None, "video:"+vid))
                for videodisc in info[2]:
                    jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
                for channeldisc in info[3]:
                    jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
                for mixdisc in info[4]:
                    jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
                for playldisc in info[5]:
                    jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
            elif task == "subtitles":
                # NOTE(review): needforcemetadata/needforcecaptions are bound
                # in the item-acquisition branch below before these tasks are
                # queued; this relies on that ordering within the same thread.
                subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions, allheaders)
            elif task == "subtitles-forceedit-captions":
                subprrun(mysession, args, vid, "forceedit-captions", needforcemetadata, needforcecaptions, allheaders)
            elif task == "subtitles-forceedit-metadata":
                subprrun(mysession, args, vid, "forceedit-metadata", needforcemetadata, needforcecaptions, allheaders)
            elif task == "channel":
                try:
                    # NOTE(review): uses `desit` (leaked from the acquisition
                    # branch below) instead of `args`, though both hold the
                    # same ID here -- verify before changing.
                    y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
                    for itemyv in y["entries"]:
                        jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
                    jobs.put(("complete", None, "channel:"+args))
                except:
                    print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/channel/"+desit.split(":", 1)[1])
            elif task == "playlist":
                try:
                    y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
                    for itemyvp in y["entries"]:
                        jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
                    jobs.put(("complete", None, "playlist:"+args))
                except:
                    print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/playlist?list="+desit.split(":", 1)[1])
            elif task == "complete":
                # For videos: zip the output dir, upload it to a tracker-
                # provided target (rsync or HTTP), then clean up local files.
                size = 0
                if ":" in args:
                    if args.split(":", 1)[0] == "video":
                        #check if dir is empty, make zip if needed
                        if isdir("out/"+args.split(":", 1)[1]):
                            if not listdir("out/"+args.split(":", 1)[1]):
                                rmdir("out/"+args.split(":", 1)[1])
                            else:
                                #zip it up
                                if not isdir("directory/"+args.split(":", 1)[1]):
                                    mkdir("directory/"+args.split(":", 1)[1])
                                # Retry zipping until the archive file appears.
                                while not isfile("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip"):
                                    print("Attempting to zip item...")
                                    system("zip -9 -r -j directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip out/"+args.split(":", 1)[1])
                                #get a target
                                targetloc = None
                                while not targetloc:
                                    targetloc = tracker.request_upload_target()
                                    if targetloc:
                                        break
                                    else:
                                        print("Waiting 5 minutes...")
                                        sleep(300)
                                if targetloc.startswith("rsync"):
                                    system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
                                elif targetloc.startswith("http"):
                                    system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
                                size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip")
                        #cleanup
                        try:
                            # NOTE(review): `langcnt` is not defined anywhere
                            # in this file, so this raises NameError and the
                            # bare except silently skips ALL of this cleanup
                            # -- confirm against the rest of the project.
                            del langcnt[args.split(":", 1)[1]]
                            rmtree("directory/"+args.split(":", 1)[1]+"/")
                            rmdir("directory/"+args.split(":", 1)[1]+"/")
                            rmtree("out/"+args.split(":", 1)[1]+"/")
                            rmdir("out/"+args.split(":", 1)[1]+"/")
                        except:
                            pass
                tracker.mark_item_as_done(args, size)
            jobs.task_done()
        else:
            if not gkiller.kill_now:
                # Local queue drained: fetch a fresh work item from the tracker.
                # get a new task from tracker
                collect() #cleanup
                desit = tracker.request_item_from_tracker()
                print("New task:", desit)
                if desit:
                    if desit.split(":", 1)[0] == "video":
                        # Per-video bookkeeping of which languages still need a
                        # forced metadata/caption edit pass; one slot per
                        # language code, presumably filled in by export.subprrun
                        # (shared mutable state across this thread's subtitle
                        # tasks) -- confirm against export.py.
                        needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
                        needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
'xh': None, 'yi': None, 'yo': None, 'zu': None}
                        jobs.put(("discovery", desit.split(":", 1)[1], None))
                    elif desit.split(":", 1)[0] == "channel":
                        jobs.put(("channel", None, desit.split(":", 1)[1]))
                    elif desit.split(":", 1)[0] == "playlist":
                        jobs.put(("playlist", None, desit.split(":", 1)[1]))
                    else:
                        print("Ignoring item for now", desit)
                else:
                    print("Ignoring item for now", desit)
            else:
                # Graceful shutdown requested and queue is empty: exit thread.
                break
  225. threads = []
  226. THREADCNT = 50
  227. if HEROKU:
  228. THREADCNT = 20
  229. #now create the rest of the threads
  230. for i in range(THREADCNT):
  231. runthread = Thread(target=threadrunner)
  232. runthread.start()
  233. threads.append(runthread)
  234. del runthread
  235. #https://stackoverflow.com/a/11968881
  236. for x in threads:
  237. x.join()
  238. threads.remove(x)
  239. del x
  240. print("Exiting...")