Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

312 lines
20 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import loads
  7. from youtube_channel import main
  8. import signal
  9. import tracker
  10. from youtube_dl import YoutubeDL
  11. from shutil import rmtree, which
  12. from queue import Queue
  13. from gc import collect
  14. from discovery import getmetadata
  15. from export import subprrun
  16. #useful Queue example: https://stackoverflow.com/a/54658363
  17. jobs = Queue()
  18. try:
  19. mkdir("out")
  20. except:
  21. pass
  22. try:
  23. mkdir("directory")
  24. except:
  25. pass
  26. HEROKU = False
  27. if isfile("../Procfile"):
  28. HEROKU = True
  29. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  30. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  31. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  32. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  33. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  34. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  35. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  36. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  37. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  38. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  39. 'xh', 'yi', 'yo', 'zu']
  40. assert which("zip") and which("rsync") and which("curl"), "Please ensure the zip, rsync, and curl commands are installed on your system."
  41. #HSID, SSID, SID cookies required
  42. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  43. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  44. elif isfile("config.json"):
  45. cookies = loads(open("config.json").read())
  46. else:
  47. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  48. assert False
  49. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  50. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  51. assert False
  52. mysession = requests.session()
  53. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  54. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  55. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  56. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  57. Language:
  58. </span>
  59. English
  60. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  61. del validationtest
  62. open("cookies.txt", "w").write("""# HTTP Cookie File
  63. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  64. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  65. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  66. del cookies
  67. validationtimes = 0
  68. #Graceful Shutdown
  69. class GracefulKiller:
  70. kill_now = False
  71. def __init__(self):
  72. signal.signal(signal.SIGINT, self.exit_gracefully)
  73. signal.signal(signal.SIGTERM, self.exit_gracefully)
  74. def exit_gracefully(self, signum, frame):
  75. print("Graceful exit process initiated, no longer accepting new tasks but finishing existing ones...")
  76. self.kill_now = True
  77. gkiller = GracefulKiller()
  78. #microtasks
  79. def threadrunner():
  80. global validationtimes
  81. jobs = Queue()
  82. ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
  83. while True:
  84. if not jobs.empty():
  85. task, vid, args = jobs.get()
  86. if task == "submitdiscovery":
  87. tracker.add_item_to_tracker(args, vid)
  88. elif task == "discovery":
  89. while True:
  90. try:
  91. info = getmetadata(mysession, str(vid).strip())
  92. break
  93. except BaseException as e:
  94. print(e)
  95. print("Error in retrieving information, waiting 30 seconds and trying again")
  96. sleep(30)
  97. if info[0]: # ccenabled
  98. if not isdir("out/"+str(vid).strip()):
  99. mkdir("out/"+str(vid).strip())
  100. if info[0]:
  101. for langcode in langs:
  102. jobs.put(("subtitles", vid, langcode))
  103. for langcode in langs:
  104. jobs.put(("subtitles-forceedit-metadata", vid, langcode))
  105. for langcode in langs:
  106. jobs.put(("subtitles-forceedit-captions", vid, langcode))
  107. jobs.put(("complete", None, "video:"+vid))
  108. for videodisc in info[1]:
  109. jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
  110. for channeldisc in info[2]:
  111. jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
  112. for mixdisc in info[3]:
  113. jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
  114. for playldisc in info[4]:
  115. jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
  116. elif task == "subtitles":
  117. subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions)
  118. elif task == "subtitles-forceedit-captions":
  119. subprrun(mysession, args, vid, "forceedit-captions", needforcemetadata, needforcecaptions)
  120. elif task == "subtitles-forceedit-metadata":
  121. subprrun(mysession, args, vid, "forceedit-metadata", needforcemetadata, needforcecaptions)
  122. elif task == "channel":
  123. try:
  124. y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
  125. for itemyv in y["entries"]:
  126. jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
  127. #channel created playlists
  128. y = main(desit.split(":", 1)[1])
  129. for itemyv in y["playlists"]:
  130. jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist))
  131. for itemyv in y["channels"]:
  132. jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Channel))
  133. jobs.put(("complete", None, "channel:"+args))
  134. except:
  135. print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/channel/"+desit.split(":", 1)[1])
  136. elif task == "playlist":
  137. try:
  138. y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
  139. #TODO: extract owner channel in other projects
  140. #TODO: handle channels in other projects, not needed here because we will get it from the video
  141. for itemyvp in y["entries"]:
  142. jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
  143. jobs.put(("complete", None, "playlist:"+args))
  144. except:
  145. print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/playlist?list="+desit.split(":", 1)[1])
  146. elif task == "mixplaylist":
  147. try:
  148. wptext = mysession.get("https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1]).text
  149. #channel handling not needed here because we will get it from the video
  150. for line in wptext.splitlines():
  151. if line.strip().startswith('window["ytInitialData"] = '):
  152. initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
  153. for itemyvp in initdata["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]["contents"]:
  154. jobs.put(("submitdiscovery", itemyvp["playlistPanelVideoRenderer"]["videoId"], tracker.ItemType.Video))
  155. jobs.put(("complete", None, "mixplaylist:"+args))
  156. except:
  157. print("Mix Playlist error, ignoring but not marking as complete...", "https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+desit.split(":", 1)[1])
  158. elif task == "complete":
  159. size = 0
  160. if ":" in args:
  161. if args.split(":", 1)[0] == "video":
  162. #check if dir is empty, make zip if needed
  163. if isdir("out/"+args.split(":", 1)[1]):
  164. if not listdir("out/"+args.split(":", 1)[1]):
  165. rmdir("out/"+args.split(":", 1)[1])
  166. else:
  167. #zip it up
  168. if not isdir("directory/"+args.split(":", 1)[1]):
  169. mkdir("directory/"+args.split(":", 1)[1])
  170. while not isfile("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip"):
  171. print("Attempting to zip item...")
  172. system("zip -9 -r -j directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip out/"+args.split(":", 1)[1])
  173. #get a target
  174. targetloc = None
  175. while not targetloc:
  176. targetloc = tracker.request_upload_target()
  177. if targetloc:
  178. break
  179. else:
  180. print("Waiting 5 minutes...")
  181. sleep(300)
  182. while True:
  183. if targetloc.startswith("rsync"):
  184. exitinfo = system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
  185. elif targetloc.startswith("http"):
  186. exitinfo = system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
  187. if exitinfo == 0: # note that on Unix this isn't necessarily the exit code but it's still 0 upon successful exit
  188. break
  189. else:
  190. print("Error in sending data to target, waiting 30 seconds and trying again.")
  191. sleep(30)
  192. size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip")
  193. #cleanup
  194. try:
  195. del langcnt[args.split(":", 1)[1]]
  196. rmtree("directory/"+args.split(":", 1)[1]+"/")
  197. rmdir("directory/"+args.split(":", 1)[1]+"/")
  198. rmtree("out/"+args.split(":", 1)[1]+"/")
  199. rmdir("out/"+args.split(":", 1)[1]+"/")
  200. except:
  201. pass
  202. tracker.mark_item_as_done(args, size)
  203. jobs.task_done()
  204. else:
  205. if not gkiller.kill_now:
  206. # get a new task from tracker
  207. collect() #cleanup
  208. desit = tracker.request_item_from_tracker()
  209. print("New task:", desit)
  210. if desit:
  211. if desit.split(":", 1)[0] == "video":
  212. needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
  213. 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
  214. 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
  215. 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
  216. 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
  217. 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
  218. 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
  219. 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
  220. 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
  221. 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
  222. 'xh': None, 'yi': None, 'yo': None, 'zu': None}
  223. needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
  224. 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
  225. 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
  226. 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
  227. 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
  228. 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
  229. 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
  230. 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
  231. 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
  232. 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
  233. 'xh': None, 'yi': None, 'yo': None, 'zu': None}
  234. jobs.put(("discovery", desit.split(":", 1)[1], None))
  235. elif desit.split(":", 1)[0] == "channel":
  236. jobs.put(("channel", None, desit.split(":", 1)[1]))
  237. elif desit.split(":", 1)[0] == "playlist":
  238. jobs.put(("playlist", None, desit.split(":", 1)[1]))
  239. elif desit.split(":", 1)[0] == "mixplaylist":
  240. jobs.put(("mixplaylist", None, desit.split(":", 1)[1]))
  241. else:
  242. print("Ignoring item for now", desit)
  243. else:
  244. print("Ignoring item for now", desit)
  245. else:
  246. break
  247. threads = []
  248. THREADCNT = 50
  249. if HEROKU:
  250. THREADCNT = 20
  251. #now create the rest of the threads
  252. for i in range(THREADCNT):
  253. runthread = Thread(target=threadrunner)
  254. runthread.start()
  255. threads.append(runthread)
  256. del runthread
  257. #https://stackoverflow.com/a/11968881
  258. for x in threads:
  259. x.join()
  260. threads.remove(x)
  261. del x
  262. print("Exiting...")