archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

385 lines
13 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import dumps, loads
  7. import signal
  8. import tracker
  9. from youtube_dl import YoutubeDL
  10. from shutil import make_archive, rmtree
  11. from queue import Queue
  12. from gc import collect
  13. from discovery import getmetadata
  14. from export import subprrun
  15. #useful Queue example: https://stackoverflow.com/a/54658363
  16. jobs = Queue()
  17. langcnt = {}
  18. HEROKU = False
  19. if isfile("../Procfile"):
  20. HEROKU = True
  21. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  22. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  23. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  24. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  25. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  26. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  27. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  28. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  29. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  30. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  31. 'xh', 'yi', 'yo', 'zu']
  32. #HSID, SSID, SID cookies required
  33. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  34. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  35. elif isfile("config.json"):
  36. cookies = loads(open("config.json").read())
  37. else:
  38. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  39. assert False
  40. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  41. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  42. assert False
  43. mysession = requests.session()
  44. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  45. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  46. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  47. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  48. Language:
  49. </span>
  50. English
  51. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  52. del validationtest
  53. open("cookies.txt", "w").write("""# HTTP Cookie File
  54. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  55. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  56. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  57. del cookies
  58. #Graceful Shutdown
  59. class GracefulKiller:
  60. kill_now = False
  61. def __init__(self):
  62. signal.signal(signal.SIGINT, self.exit_gracefully)
  63. signal.signal(signal.SIGTERM, self.exit_gracefully)
  64. def exit_gracefully(self,signum, frame):
  65. self.kill_now = True
  66. gkiller = GracefulKiller()
  67. #TODO: discoveries, zipping, completion of subtitles
  68. #minitasks
  69. def threadrunner(jobs: Queue):
  70. global langcnt
  71. ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
  72. while not gkiller.kill_now:
  73. if not jobs.empty():
  74. task, vid, args = jobs.get()
  75. if task == "submitdiscovery":
  76. tracker.add_item_to_tracker(args, vid)
  77. elif task == "discovery":
  78. while True:
  79. try:
  80. info = getmetadata(mysession, str(vid).strip())
  81. break
  82. except BaseException as e:
  83. print(e)
  84. print("Error in retrieving information, waiting 30 seconds")
  85. sleep(30)
  86. if info[0] or info[1]: # ccenabled or creditdata
  87. if not isdir("out/"+str(vid).strip()):
  88. mkdir("out/"+str(vid).strip())
  89. if info[1]:
  90. open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  91. if info[0]:
  92. langcnt[vid] = 0
  93. for langcode in langs:
  94. jobs.put(("subtitles", vid, langcode))
  95. else:
  96. jobs.put(("complete", None, "video:"+vid))
  97. elif task == "subtitles":
  98. subprrun(jobs, mysession, args, vid, "default")
  99. langcnt[vid] += 1
  100. if langcnt[vid] >= 195:
  101. pass #complete(?)
  102. elif task == "subtitles-forceedit-captions":
  103. subprrun(jobs, mysession, args, vid, "forceedit-captions")
  104. elif task == "subtitles-forceedit-metadata":
  105. subprrun(jobs, mysession, args, vid, "forceedit-metadata")
  106. elif task == "channel":
  107. y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
  108. for itemyv in y["entries"]:
  109. jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
  110. jobs.put(("complete", None, "channel:"+args))
  111. elif task == "playlist":
  112. y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
  113. for itemyvp in y["entries"]:
  114. jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
  115. jobs.put(("complete", None, "playlist:"+args))
  116. elif task == "complete":
  117. size = 0
  118. if ":" in args:
  119. if args.split(":", 1)[0] == "video":
  120. if isfile("directory/"+args.split(":", 1)[1]+".zip"):
  121. size = getsize("directory/"+args.split(":", 1)[1]+".zip")
  122. tracker.mark_item_as_done(args, size)
  123. else:
  124. # get a new task from tracker
  125. desit = tracker.request_item_from_tracker()
  126. if desit:
  127. if desit.split(":", 1)[0] == "video":
  128. jobs.put(("discovery", desit.split(":", 1)[1], None))
  129. elif desit.split(":", 1)[0] == "channel":
  130. jobs.put(("channel", None, desit.split(":", 1)[1]))
  131. elif desit.split(":", 1)[0] == "playlist":
  132. jobs.put(("playlist", None, desit.split(":", 1)[1]))
  133. else:
  134. print("Ignoring item for now", desit)
  135. else:
  136. print("Ignoring item for now", desit)
  137. batchcontent.append(desit.split(":", 1)[1])
  138. actualitems.append(desit)
  139. batchcontent = []
  140. actualitems = []
  141. def batchfunc():
  142. if not HEROKU:
  143. desqsize = 51
  144. elif HEROKU:
  145. desqsize = 251
  146. while jobs.qsize() < desqsize:
  147. def submitfunc(submitqueue):
  148. while not submitqueue.empty():
  149. itype, ival = submitqueue.get()
  150. tracker.add_item_to_tracker(itype, ival)
  151. ccenabledl = []
  152. recvids = set()
  153. recchans = set()
  154. recmixes = set()
  155. recplayl = set()
  156. def prrun():
  157. while not jobs.empty():
  158. global recvids
  159. global recchans
  160. global recmixes
  161. global recplayl
  162. global ccenabledl
  163. item = jobs.get()
  164. print("Video ID:", str(item).strip())
  165. # Add any discovered videos
  166. recvids.update(info[2])
  167. recchans.update(info[3])
  168. recmixes.update(info[4])
  169. recplayl.update(info[5])
  170. if info[1]: # creditdata
  171. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  172. if info[0]: #ccenabled
  173. ccenabledl.append(item)
  174. jobs.task_done()
  175. return True
  176. while not gkiller.kill_now:
  177. collect() #cleanup
  178. try:
  179. mkdir("out")
  180. except:
  181. pass
  182. try:
  183. mkdir("directory")
  184. except:
  185. pass
  186. batchcontent.clear()
  187. actualitems.clear()
  188. # Get a batch ID
  189. batchthreads = []
  190. for r in range(50):
  191. batchrunthread = Thread(target=batchfunc)
  192. batchrunthread.start()
  193. batchthreads.append(batchrunthread)
  194. del batchrunthread
  195. for xc in batchthreads:
  196. xc.join()
  197. batchthreads.remove(xc)
  198. del xc
  199. sleep(1) # prevent the script from continuing before the last thread finishes
  200. threads = []
  201. for i in range(50):
  202. runthread = Thread(target=prrun)
  203. runthread.start()
  204. threads.append(runthread)
  205. del runthread
  206. for x in threads:
  207. x.join()
  208. threads.remove(x)
  209. del x
  210. print("Sending discoveries to tracker...")
  211. submitjobs = Queue()
  212. # IDK how to handle mixes so just send them for now
  213. print("Videos:", len(recvids))
  214. for itemvid in recvids:
  215. submitjobs.put((tracker.ItemType.Video, itemvid))
  216. print("Channels:", len(recchans))
  217. for itemchan in recchans:
  218. submitjobs.put((tracker.ItemType.Channel, itemchan))
  219. print("Mix Playlists:", len(recmixes))
  220. for itemmix in recmixes:
  221. submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))
  222. print("Playlists:", len(recplayl))
  223. for itemplayl in recplayl:
  224. submitjobs.put((tracker.ItemType.Playlist, itemplayl))
  225. # open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  226. # clear lists
  227. recvids.clear()
  228. recchans.clear()
  229. recmixes.clear()
  230. recplayl.clear()
  231. submitthreads = []
  232. for r in range(50):
  233. submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
  234. submitrunthread.start()
  235. submitthreads.append(submitrunthread)
  236. del submitrunthread
  237. for xb in submitthreads:
  238. xb.join()
  239. submitthreads.remove(xb)
  240. del xb
  241. sleep(1) # prevent the script from continuing before the last thread finishes
  242. subtjobs = Queue()
  243. while ccenabledl:
  244. langcontent = langs.copy()
  245. intvid = ccenabledl.pop(0)
  246. while langcontent:
  247. subtjobs.put((langcontent.pop(0), intvid, "default"))
  248. del intvid
  249. del langcontent
  250. subthreads = []
  251. for r in range(50):
  252. subrunthread = Thread(target=subprrun, args=(subtjobs,mysession))
  253. subrunthread.start()
  254. subthreads.append(subrunthread)
  255. del subrunthread
  256. for xa in subthreads:
  257. xa.join()
  258. subthreads.remove(xa)
  259. del xa
  260. sleep(30) # wait 30 seconds to hopefully allow the other threads to finish
  261. for fol in listdir("out"): #remove empty folders
  262. try:
  263. if isdir("out/"+fol):
  264. rmdir("out/"+fol)
  265. except:
  266. pass
  267. #https://stackoverflow.com/a/11968881
  268. for fol in listdir("out"):
  269. if isdir("out/"+fol):
  270. make_archive("directory/"+fol, "zip", "out/"+fol)
  271. targetloc = None
  272. while not targetloc:
  273. targetloc = tracker.request_upload_target()
  274. if targetloc:
  275. break
  276. else:
  277. print("Waiting 5 minutes...")
  278. sleep(300)
  279. if targetloc.startswith("rsync"):
  280. system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 --files-from=- directory/ "+targetloc)
  281. elif targetloc.startswith("http"):
  282. for filzip in listdir("directory"):
  283. if filzip.endswith(".zip"):
  284. system("curl -F "+filzip+"=@directory/"+filzip+" "+targetloc)
  285. # Report the batch as complete
  286. for itemb in actualitems:
  287. tracker.mark_item_as_done(itemb, size)
  288. # clear the output directories
  289. rmtree("out")
  290. rmtree("directory")