archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits
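
# Worker script: requests items from the tracker, scrapes caption and credit
# data for each video, exports unpublished community captions, zips the
# per-video output, uploads it to the tracker-assigned target, and reports
# each item as done.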
from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from json import dumps, loads
import signal
import tracker
from youtube_dl import YoutubeDL
from shutil import make_archive, rmtree
from queue import Queue
from gc import collect
from discovery import getmetadata
from export import subprrun

batchcontent = []
actualitems = []

HEROKU = False
if isfile("../Procfile"):
    HEROKU = True
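
# batchfunc tops up the local job queue from the tracker (to 51 items
# normally, 251 on Heroku). Plain video items are queued directly; channel
# and playlist items are expanded with youtube-dl's flat extraction and
# their videos are sent back to the tracker as new items.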
def batchfunc():
    ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
    if not HEROKU:
        desqsize = 51
    else:
        desqsize = 251
    while jobs.qsize() < desqsize:
        desit = tracker.request_item_from_tracker()
        if not desit:
            print("Ignoring item for now", desit)
            continue
        if desit.split(":", 1)[0] == "video":
            jobs.put(desit.split(":", 1)[1])
        elif desit.split(":", 1)[0] == "channel":
            y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
            for itemyv in y["entries"]:
                tracker.add_item_to_tracker(tracker.ItemType.Video, itemyv["id"])
        elif desit.split(":", 1)[0] == "playlist":
            y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
            for itemyvp in y["entries"]:
                tracker.add_item_to_tracker(tracker.ItemType.Video, itemyvp["id"])
        else:
            print("Ignoring item for now", desit)
        batchcontent.append(desit.split(":", 1)[1])
        actualitems.append(desit)
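
# Items arrive from the tracker as "type:id" strings; for instance a
# hypothetical "video:dQw4w9WgXcQ" goes straight onto the jobs queue, while
# "channel:..." and "playlist:..." items are expanded and their videos are
# returned to the tracker as new "video" items.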

def submitfunc(submitqueue):
    while not submitqueue.empty():
        itype, ival = submitqueue.get()
        tracker.add_item_to_tracker(itype, ival)
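
# submitfunc (above) drains a queue of (tracker.ItemType, id) tuples and
# reports each discovery to the tracker; the main loop later runs 50 of
# these threads in parallel.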

langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
         'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
         'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
         'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
         'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
         'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
         'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
         'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
         'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
         'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
         'xh', 'yi', 'yo', 'zu']
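
# The list above covers the language codes offered by YouTube's caption
# editor; every CC-enabled video gets one export job per language (see the
# subtjobs loop in the main loop below).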

#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []

recvids = set()
recchans = set()
recmixes = set()
recplayl = set()

#HSID, SSID, SID cookies required
if "HSID" in environ and "SSID" in environ and "SID" in environ:
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    cookies = loads(open("config.json").read())
else:
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False

if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False

mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})

validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")

assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."

assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
Language:
</span>
English
</span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"

del validationtest
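
# Write the session cookies out in Netscape cookie-file format so youtube-dl
# can reuse them via its "cookiefile" option. The fields must be
# tab-separated; the timestamp is just a far-off expiry date.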
open("cookies.txt", "w").write("""# HTTP Cookie File
.youtube.com	TRUE	/	FALSE	1663793455	SID	[SID]
.youtube.com	TRUE	/	FALSE	1663793455	HSID	[HSID]
.youtube.com	TRUE	/	TRUE	1663793455	SSID	[SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
del cookies

#Graceful Shutdown
class GracefulKiller:
    kill_now = False

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True

gkiller = GracefulKiller()
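
# On SIGINT/SIGTERM the killer only sets a flag; the main loop below checks
# it once per batch, so the current batch is finished and uploaded before
# the process exits.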

def prrun():
    global recvids
    global recchans
    global recmixes
    global recplayl
    global ccenabledl
    while not jobs.empty():
        item = jobs.get()
        print("Video ID:", str(item).strip())

        while True:
            try:
                info = getmetadata(mysession, str(item).strip())
                break
            except BaseException as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                sleep(30)

        # Add any discovered videos
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])

        if info[0] or info[1]: # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())

        if info[1]: # creditdata
            open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))

        if info[0]: #ccenabled
            ccenabledl.append(item)

        jobs.task_done()
    return True
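
# getmetadata (discovery.py), used by prrun above, appears to return a tuple
# of (ccenabled, creditdata, videos, channels, mixes, playlists): the first
# two drive output for the current video, the rest are discoveries to report
# back to the tracker.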

while not gkiller.kill_now:
    collect() #cleanup

    try:
        mkdir("out")
    except FileExistsError:
        pass

    try:
        mkdir("directory")
    except FileExistsError:
        pass

    batchcontent.clear()
    actualitems.clear()

    # Request a batch of items from the tracker
    batchthreads = []

    for r in range(50):
        batchrunthread = Thread(target=batchfunc)
        batchrunthread.start()
        batchthreads.append(batchrunthread)
        del batchrunthread

    for xc in batchthreads:
        xc.join()
    batchthreads.clear()

    sleep(1) # prevent the script from continuing before the last thread finishes

    threads = []

    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread

    for x in threads:
        x.join()
    threads.clear()

    print("Sending discoveries to tracker...")

    submitjobs = Queue()

    # IDK how to handle mixes so just send them for now
    print("Videos:", len(recvids))
    for itemvid in recvids:
        submitjobs.put((tracker.ItemType.Video, itemvid))

    print("Channels:", len(recchans))
    for itemchan in recchans:
        submitjobs.put((tracker.ItemType.Channel, itemchan))

    print("Mix Playlists:", len(recmixes))
    for itemmix in recmixes:
        submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))

    print("Playlists:", len(recplayl))
    for itemplayl in recplayl:
        submitjobs.put((tracker.ItemType.Playlist, itemplayl))

    # open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))

    # clear lists
    recvids.clear()
    recchans.clear()
    recmixes.clear()
    recplayl.clear()

    submitthreads = []

    for r in range(50):
        submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
        submitrunthread.start()
        submitthreads.append(submitrunthread)
        del submitrunthread

    for xb in submitthreads:
        xb.join()
    submitthreads.clear()

    sleep(1) # prevent the script from continuing before the last thread finishes

    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)

        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid, "default"))
        del intvid
        del langcontent
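
    # One export job per (language, video_id, "default") tuple, where
    # "default" is presumably the caption track name; subprrun (export.py)
    # consumes these jobs using the authenticated session.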

    subthreads = []

    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, mysession))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread

    for xa in subthreads:
        xa.join()
    subthreads.clear()

    sleep(30) # wait 30 seconds to hopefully allow the other threads to finish

    for fol in listdir("out"): #remove empty folders
        try:
            if isdir("out/"+fol):
                rmdir("out/"+fol)
        except OSError:
            pass

    #https://stackoverflow.com/a/11968881
    for fol in listdir("out"):
        if isdir("out/"+fol):
            make_archive("directory/"+fol, "zip", "out/"+fol)

    targetloc = None
    while not targetloc:
        targetloc = tracker.request_upload_target()
        if targetloc:
            break
        else:
            print("Waiting 5 minutes...")
            sleep(300)

    if targetloc.startswith("rsync"):
        system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 --files-from=- directory/ "+targetloc)
    elif targetloc.startswith("http"):
        for filzip in listdir("directory"):
            if filzip.endswith(".zip"):
                system("curl -F "+filzip+"=@directory/"+filzip+" "+targetloc)
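
    # The tracker hands out either an rsync target or an HTTP endpoint; zips
    # are pushed in bulk with rsync, or uploaded one at a time as curl
    # multipart form fields.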

    # Report the batch as complete
    for itemb in actualitems:
        size = 0
        if ":" in itemb:
            if itemb.split(":", 1)[0] == "video":
                if isfile("directory/"+itemb.split(":", 1)[1]+".zip"):
                    size = getsize("directory/"+itemb.split(":", 1)[1]+".zip")
        tracker.mark_item_as_done(itemb, size)

    # clear the output directories
    rmtree("out")
    rmtree("directory")