archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 

368 Zeilen
12 KiB

import signal
from gc import collect
from json import dumps, loads
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from queue import Empty, Queue
from shutil import make_archive, rmtree
from threading import Thread
from time import sleep

import requests
from youtube_dl import YoutubeDL

import tracker
from discovery import getmetadata
from export import subprrun
  15. #useful Queue example: https://stackoverflow.com/a/54658363
  16. jobs = Queue()
  17. langcnt = {}
  18. HEROKU = False
  19. if isfile("../Procfile"):
  20. HEROKU = True
  21. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  22. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  23. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  24. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  25. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  26. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  27. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  28. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  29. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  30. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  31. 'xh', 'yi', 'yo', 'zu']
  32. #HSID, SSID, SID cookies required
  33. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  34. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  35. elif isfile("config.json"):
  36. cookies = loads(open("config.json").read())
  37. else:
  38. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  39. assert False
  40. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  41. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  42. assert False
  43. mysession = requests.session()
  44. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  45. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  46. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  47. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  48. Language:
  49. </span>
  50. English
  51. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  52. del validationtest
  53. open("cookies.txt", "w").write("""# HTTP Cookie File
  54. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  55. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  56. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  57. del cookies
  58. #Graceful Shutdown
  59. class GracefulKiller:
  60. kill_now = False
  61. def __init__(self):
  62. signal.signal(signal.SIGINT, self.exit_gracefully)
  63. signal.signal(signal.SIGTERM, self.exit_gracefully)
  64. def exit_gracefully(self,signum, frame):
  65. self.kill_now = True
  66. gkiller = GracefulKiller()
  67. #minitasks
  68. def threadrunner(jobs: Queue):
  69. global langcnt
  70. ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
  71. while not gkiller.kill_now:
  72. if not jobs.empty():
  73. task, vid, args = jobs.get()
  74. if task == "submitdiscovery":
  75. tracker.add_item_to_tracker(args, vid)
  76. elif task == "discovery":
  77. pass
  78. elif task == "subtitles":
  79. pass
  80. elif task == "channel":
  81. y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
  82. for itemyv in y["entries"]:
  83. jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
  84. jobs.put(("complete", None, "channel:"+args))
  85. elif task == "playlist":
  86. y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
  87. for itemyvp in y["entries"]:
  88. jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
  89. jobs.put(("complete", None, "playlist:"+args))
  90. elif task == "complete":
  91. size = 0
  92. if ":" in args:
  93. if args.split(":", 1)[0] == "video":
  94. if isfile("directory/"+args.split(":", 1)[1]+".zip"):
  95. size = getsize("directory/"+args.split(":", 1)[1]+".zip")
  96. tracker.mark_item_as_done(args, size)
  97. else:
  98. # get a new task from tracker
  99. desit = tracker.request_item_from_tracker()
  100. if desit:
  101. if desit.split(":", 1)[0] == "video":
  102. jobs.put(("discovery", desit.split(":", 1)[1], None))
  103. elif desit.split(":", 1)[0] == "channel":
  104. jobs.put(("channel", None, desit.split(":", 1)[1]))
  105. elif desit.split(":", 1)[0] == "playlist":
  106. jobs.put(("playlist", None, desit.split(":", 1)[1]))
  107. else:
  108. print("Ignoring item for now", desit)
  109. else:
  110. print("Ignoring item for now", desit)
  111. batchcontent.append(desit.split(":", 1)[1])
  112. actualitems.append(desit)
  113. batchcontent = []
  114. actualitems = []
  115. def batchfunc():
  116. if not HEROKU:
  117. desqsize = 51
  118. elif HEROKU:
  119. desqsize = 251
  120. while jobs.qsize() < desqsize:
  121. def submitfunc(submitqueue):
  122. while not submitqueue.empty():
  123. itype, ival = submitqueue.get()
  124. tracker.add_item_to_tracker(itype, ival)
  125. ccenabledl = []
  126. recvids = set()
  127. recchans = set()
  128. recmixes = set()
  129. recplayl = set()
  130. def prrun():
  131. while not jobs.empty():
  132. global recvids
  133. global recchans
  134. global recmixes
  135. global recplayl
  136. global ccenabledl
  137. item = jobs.get()
  138. print("Video ID:", str(item).strip())
  139. while True:
  140. try:
  141. info = getmetadata(mysession, str(item).strip())
  142. break
  143. except BaseException as e:
  144. print(e)
  145. print("Error in retrieving information, waiting 30 seconds")
  146. sleep(30)
  147. # Add any discovered videos
  148. recvids.update(info[2])
  149. recchans.update(info[3])
  150. recmixes.update(info[4])
  151. recplayl.update(info[5])
  152. if info[0] or info[1]: # ccenabled or creditdata
  153. if not isdir("out/"+str(item).strip()):
  154. mkdir("out/"+str(item).strip())
  155. if info[1]: # creditdata
  156. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  157. if info[0]: #ccenabled
  158. ccenabledl.append(item)
  159. jobs.task_done()
  160. return True
  161. while not gkiller.kill_now:
  162. collect() #cleanup
  163. try:
  164. mkdir("out")
  165. except:
  166. pass
  167. try:
  168. mkdir("directory")
  169. except:
  170. pass
  171. batchcontent.clear()
  172. actualitems.clear()
  173. # Get a batch ID
  174. batchthreads = []
  175. for r in range(50):
  176. batchrunthread = Thread(target=batchfunc)
  177. batchrunthread.start()
  178. batchthreads.append(batchrunthread)
  179. del batchrunthread
  180. for xc in batchthreads:
  181. xc.join()
  182. batchthreads.remove(xc)
  183. del xc
  184. sleep(1) # prevent the script from continuing before the last thread finishes
  185. threads = []
  186. for i in range(50):
  187. runthread = Thread(target=prrun)
  188. runthread.start()
  189. threads.append(runthread)
  190. del runthread
  191. for x in threads:
  192. x.join()
  193. threads.remove(x)
  194. del x
  195. print("Sending discoveries to tracker...")
  196. submitjobs = Queue()
  197. # IDK how to handle mixes so just send them for now
  198. print("Videos:", len(recvids))
  199. for itemvid in recvids:
  200. submitjobs.put((tracker.ItemType.Video, itemvid))
  201. print("Channels:", len(recchans))
  202. for itemchan in recchans:
  203. submitjobs.put((tracker.ItemType.Channel, itemchan))
  204. print("Mix Playlists:", len(recmixes))
  205. for itemmix in recmixes:
  206. submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))
  207. print("Playlists:", len(recplayl))
  208. for itemplayl in recplayl:
  209. submitjobs.put((tracker.ItemType.Playlist, itemplayl))
  210. # open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  211. # clear lists
  212. recvids.clear()
  213. recchans.clear()
  214. recmixes.clear()
  215. recplayl.clear()
  216. submitthreads = []
  217. for r in range(50):
  218. submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
  219. submitrunthread.start()
  220. submitthreads.append(submitrunthread)
  221. del submitrunthread
  222. for xb in submitthreads:
  223. xb.join()
  224. submitthreads.remove(xb)
  225. del xb
  226. sleep(1) # prevent the script from continuing before the last thread finishes
  227. subtjobs = Queue()
  228. while ccenabledl:
  229. langcontent = langs.copy()
  230. intvid = ccenabledl.pop(0)
  231. while langcontent:
  232. subtjobs.put((langcontent.pop(0), intvid, "default"))
  233. del intvid
  234. del langcontent
  235. subthreads = []
  236. for r in range(50):
  237. subrunthread = Thread(target=subprrun, args=(subtjobs,mysession))
  238. subrunthread.start()
  239. subthreads.append(subrunthread)
  240. del subrunthread
  241. for xa in subthreads:
  242. xa.join()
  243. subthreads.remove(xa)
  244. del xa
  245. sleep(30) # wait 30 seconds to hopefully allow the other threads to finish
  246. for fol in listdir("out"): #remove empty folders
  247. try:
  248. if isdir("out/"+fol):
  249. rmdir("out/"+fol)
  250. except:
  251. pass
  252. #https://stackoverflow.com/a/11968881
  253. for fol in listdir("out"):
  254. if isdir("out/"+fol):
  255. make_archive("directory/"+fol, "zip", "out/"+fol)
  256. targetloc = None
  257. while not targetloc:
  258. targetloc = tracker.request_upload_target()
  259. if targetloc:
  260. break
  261. else:
  262. print("Waiting 5 minutes...")
  263. sleep(300)
  264. if targetloc.startswith("rsync"):
  265. system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 --files-from=- directory/ "+targetloc)
  266. elif targetloc.startswith("http"):
  267. for filzip in listdir("directory"):
  268. if filzip.endswith(".zip"):
  269. system("curl -F "+filzip+"=@directory/"+filzip+" "+targetloc)
  270. # Report the batch as complete
  271. for itemb in actualitems:
  272. tracker.mark_item_as_done(itemb, size)
  273. # clear the output directories
  274. rmtree("out")
  275. rmtree("directory")