Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

322 lines
11 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import dumps, loads
  7. import signal
  8. import tracker
  9. from youtube_dl import YoutubeDL
  10. from shutil import make_archive, rmtree
  11. from queue import Queue
  12. from gc import collect
  13. from discovery import getmetadata
  14. from export import subprrun
  15. batchcontent = []
  16. HEROKU = False
  17. if isfile("../Procfile"):
  18. HEROKU = True
  19. def batchfunc():
  20. ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
  21. if not HEROKU:
  22. desqsize = 51
  23. elif HEROKU:
  24. desqsize = 251
  25. while jobs.qsize() < desqsize:
  26. desit = tracker.request_item_from_tracker()
  27. if desit:
  28. if desit.split(":", 1)[0] == "video":
  29. jobs.put(desit.split(":", 1)[1])
  30. elif desit.split(":", 1)[0] == "channel":
  31. y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
  32. for itemyv in y["entries"]:
  33. tracker.add_item_to_tracker(tracker.ItemType.Video, itemyv["id"])
  34. elif desit.split(":", 1)[0] == "playlist":
  35. y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
  36. for itemyvp in y["entries"]:
  37. tracker.add_item_to_tracker(tracker.ItemType.Video, itemyvp["id"])
  38. else:
  39. print("Ignoring item for now", desit)
  40. else:
  41. print("Ignoring item for now", desit)
  42. batchcontent.append(desit.split(":", 1)[1])
  43. def submitfunc(submitqueue):
  44. while not submitqueue.empty():
  45. itype, ival = submitqueue.get()
  46. tracker.add_item_to_tracker(itype, ival)
  47. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  48. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  49. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  50. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  51. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  52. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  53. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  54. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  55. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  56. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  57. 'xh', 'yi', 'yo', 'zu']
  58. #useful Queue example: https://stackoverflow.com/a/54658363
  59. jobs = Queue()
  60. ccenabledl = []
  61. recvids = set()
  62. recchans = set()
  63. recmixes = set()
  64. recplayl = set()
  65. #HSID, SSID, SID cookies required
  66. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  67. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  68. elif isfile("config.json"):
  69. cookies = loads(open("config.json").read())
  70. else:
  71. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  72. assert False
  73. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  74. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  75. assert False
  76. mysession = requests.session()
  77. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  78. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  79. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  80. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  81. Language:
  82. </span>
  83. English
  84. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  85. del validationtest
  86. open("cookies.txt", "w").write("""# HTTP Cookie File
  87. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  88. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  89. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  90. del cookies
  91. #Graceful Shutdown
  92. class GracefulKiller:
  93. kill_now = False
  94. def __init__(self):
  95. signal.signal(signal.SIGINT, self.exit_gracefully)
  96. signal.signal(signal.SIGTERM, self.exit_gracefully)
  97. def exit_gracefully(self,signum, frame):
  98. self.kill_now = True
  99. gkiller = GracefulKiller()
  100. def prrun():
  101. while not jobs.empty():
  102. global recvids
  103. global recchans
  104. global recmixes
  105. global recplayl
  106. global ccenabledl
  107. item = jobs.get()
  108. print("Video ID:", str(item).strip())
  109. while True:
  110. try:
  111. info = getmetadata(mysession, str(item).strip())
  112. break
  113. except BaseException as e:
  114. print(e)
  115. print("Error in retrieving information, waiting 30 seconds")
  116. sleep(30)
  117. # Add any discovered videos
  118. recvids.update(info[2])
  119. recchans.update(info[3])
  120. recmixes.update(info[4])
  121. recplayl.update(info[5])
  122. if info[0] or info[1]: # ccenabled or creditdata
  123. if not isdir("out/"+str(item).strip()):
  124. mkdir("out/"+str(item).strip())
  125. if info[1]: # creditdata
  126. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  127. if info[0]: #ccenabled
  128. ccenabledl.append(item)
  129. jobs.task_done()
  130. return True
  131. while not gkiller.kill_now:
  132. collect() #cleanup
  133. try:
  134. mkdir("out")
  135. except:
  136. pass
  137. try:
  138. mkdir("directory")
  139. except:
  140. pass
  141. batchcontent.clear()
  142. # Get a batch ID
  143. batchthreads = []
  144. for r in range(50):
  145. batchrunthread = Thread(target=batchfunc)
  146. batchrunthread.start()
  147. batchthreads.append(batchrunthread)
  148. del batchrunthread
  149. for xc in batchthreads:
  150. xc.join()
  151. batchthreads.remove(xc)
  152. del xc
  153. sleep(1) # prevent the script from continuing before the last thread finishes
  154. threads = []
  155. for i in range(50):
  156. runthread = Thread(target=prrun)
  157. runthread.start()
  158. threads.append(runthread)
  159. del runthread
  160. for x in threads:
  161. x.join()
  162. threads.remove(x)
  163. del x
  164. print("Sending discoveries to tracker...")
  165. submitjobs = Queue()
  166. # IDK how to handle mixes so just send them for now
  167. print("Videos:", len(recvids))
  168. for itemvid in recvids:
  169. submitjobs.put((tracker.ItemType.Video, itemvid))
  170. print("Channels:", len(recchans))
  171. for itemchan in recchans:
  172. submitjobs.put((tracker.ItemType.Channel, itemchan))
  173. print("Mix Playlists:", len(recmixes))
  174. for itemmix in recmixes:
  175. submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))
  176. print("Playlists:", len(recplayl))
  177. for itemplayl in recplayl:
  178. submitjobs.put((tracker.ItemType.Playlist, itemplayl))
  179. # open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  180. # clear lists
  181. recvids.clear()
  182. recchans.clear()
  183. recmixes.clear()
  184. recplayl.clear()
  185. submitthreads = []
  186. for r in range(50):
  187. submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
  188. submitrunthread.start()
  189. submitthreads.append(submitrunthread)
  190. del submitrunthread
  191. for xb in submitthreads:
  192. xb.join()
  193. submitthreads.remove(xb)
  194. del xb
  195. sleep(1) # prevent the script from continuing before the last thread finishes
  196. subtjobs = Queue()
  197. while ccenabledl:
  198. langcontent = langs.copy()
  199. intvid = ccenabledl.pop(0)
  200. while langcontent:
  201. subtjobs.put((langcontent.pop(0), intvid, "default"))
  202. del intvid
  203. del langcontent
  204. subthreads = []
  205. for r in range(50):
  206. subrunthread = Thread(target=subprrun, args=(subtjobs,mysession))
  207. subrunthread.start()
  208. subthreads.append(subrunthread)
  209. del subrunthread
  210. for xa in subthreads:
  211. xa.join()
  212. subthreads.remove(xa)
  213. del xa
  214. sleep(30) # wait 30 seconds to hopefully allow the other threads to finish
  215. for fol in listdir("out"): #remove empty folders
  216. try:
  217. if isdir("out/"+fol):
  218. rmdir("out/"+fol)
  219. except:
  220. pass
  221. #https://stackoverflow.com/a/11968881
  222. for fol in listdir("out"):
  223. if isdir("out/"+fol):
  224. make_archive("directory/"+fol, "zip", "out/"+fol)
  225. targetloc = None
  226. while not targetloc:
  227. targetloc = tracker.request_upload_target()
  228. if targetloc:
  229. break
  230. else:
  231. print("Waiting 5 minutes...")
  232. sleep(300)
  233. if targetloc.startswith("rsync"):
  234. system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 --files-from=- directory/ "+targetloc)
  235. elif targetloc.startswith("http"):
  236. for filzip in listdir("directory"):
  237. if filzip.endswith(".zip"):
  238. system("curl -F "+filzip+"=@directory/"+filzip+" "+targetloc)
  239. # Report the batch as complete
  240. for itemb in batchcontent:
  241. if isfile("directory/"+itemb.split(":", 1)[1]+".zip"):
  242. size = getsize("directory/"+itemb.split(":", 1)[1]+".zip")
  243. else:
  244. size = 0
  245. tracker.mark_item_as_done(itemb, size)
  246. # clear the output directories
  247. rmtree("out")
  248. rmtree("directory")