Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.

from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from json import dumps, loads
import signal
from youtube_dl.utils import DownloadError
import tracker
from youtube_dl import YoutubeDL
from shutil import make_archive, rmtree
from queue import Queue
from gc import collect
from discovery import getmetadata
from export import subprrun
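
# Worker flow: request a batch of video IDs from the tracker, scrape each
# video's metadata and community-caption credits, report newly discovered
# videos and mixes back to the tracker, export captions for every language,
# then zip, upload, and mark the batch as done.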

batchcontent = []
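
# Each of the 50 batch threads appends items requested from the tracker until
# the shared list holds at least 500 entries (the length check is not
# synchronized, so the batch can slightly overshoot).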
def batchfunc():
    while len(batchcontent) < 500:
        batchcontent.append(tracker.request_item_from_tracker())
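
# Drain the submit queue, reporting each discovered item to the tracker.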
def submitfunc(submitqueue):
    while not submitqueue.empty():
        itype, ival = submitqueue.get()
        tracker.add_item_to_tracker(itype, ival)

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"
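
# Language codes to try when exporting captions (one export job is queued per
# language per video).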
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
         'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
         'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
         'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
         'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
         'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
         'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
         'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
         'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
         'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
         'xh', 'yi', 'yo', 'zu']

#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []  # videos with community captions enabled, queued for caption export
recvids = set()  # discovered video IDs
recchans = set()  # discovered channel IDs
recmixes = set()  # discovered mix playlist IDs
recplayl = set()  # discovered playlist IDs

#HSID, SSID, SID cookies required
if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    cookies = loads(open("config.json").read())
else:
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False

if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False

mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
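
# Write a Netscape-format cookies.txt for youtube-dl; the format requires
# tab-separated fields.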
  57. open("cookies.txt", "w").write("""# HTTP Cookie File
  58. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  59. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  60. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  61. del cookies

#Graceful Shutdown
class GracefulKiller:
    kill_now = False

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True

gkiller = GracefulKiller()
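
# Worker thread body: pull video IDs off the jobs queue, fetch metadata and
# credits, enumerate newly discovered channels and playlists, and record which
# videos have community captions enabled.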
def prrun():
    global recvids
    global recchans
    global recmixes
    global recplayl
    global ccenabledl

    while not jobs.empty():
        item = jobs.get()
        print("Video ID:", str(item).strip())

        # info = (ccenabled, creditdata, video IDs, channel IDs, mix IDs, playlist IDs)
        while True:
            try:
                info = getmetadata(str(item).strip())
                break
            except BaseException as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                #raise
                sleep(30)

        ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})

        # Enumerate the videos of each newly discovered channel.
        for chaninfo in set(info[3]):
            if chaninfo not in recchans:
                while True:
                    try:
                        y = ydl.extract_info("https://www.youtube.com/channel/"+chaninfo, download=False)
                        recchans.add(chaninfo)
                        break
                    except DownloadError:
                        sleep(30)
                sleep(5) #prevent error 429
                for itemyv in y["entries"]:
                    recvids.add(itemyv["id"])

        # Enumerate the videos of each newly discovered playlist.
        for playlinfo in set(info[5]):
            if playlinfo not in recplayl:
                while True:
                    try:
                        y = ydl.extract_info("https://www.youtube.com/playlist?list="+playlinfo, download=False)
                        recplayl.add(playlinfo)
                        break
                    except DownloadError:
                        sleep(30)
                sleep(5) #prevent error 429
                for itemyvp in y["entries"]:
                    recvids.add(itemyvp["id"])

        # Add any discovered videos
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])

        if info[0] or info[1]: # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())

        if info[1]: # creditdata
            open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))

        if info[0]: #ccenabled
            ccenabledl.append(item)

        jobs.task_done()

    return True
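
# Main loop: process batches until SIGINT/SIGTERM sets kill_now.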
while not gkiller.kill_now:
    collect() #cleanup

    try:
        mkdir("out")
    except:
        pass

    batchcontent.clear()

    # Request a batch of items from the tracker using 50 threads.
    batchthreads = []
    for r in range(50):
        batchrunthread = Thread(target=batchfunc)
        batchrunthread.start()
        batchthreads.append(batchrunthread)
        del batchrunthread
    # Join without mutating the list mid-iteration; the previous version
    # removed threads from the list while iterating over it, which skipped
    # every other join and once let the script end before the last thread
    # finished.
    for xc in batchthreads:
        xc.join()
    batchthreads.clear()

    #for ir in range(501):
    #    batchcontent.append(tracker.request_item_from_tracker())

    # Queue only "video:<id>" items; anything else is skipped for now.
    for desit in batchcontent:
        if desit:
            if desit.split(":", 1)[0] == "video":
                jobs.put(desit.split(":", 1)[1])
            else:
                print("Ignoring item for now", desit)
        else:
            print("Ignoring item for now", desit)
    threads = []
    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread
    for x in threads:
        x.join()
    threads.clear()
  166. print("Sending discoveries to tracker...")
  167. submitjobs = Queue()
  168. #don't send channels and playlists as those have already been converted for video IDs
  169. #IDK how to handle mixes so send them for now
  170. print(len(recvids))
  171. for itemvid in recvids:
  172. submitjobs.put((tracker.ItemType.Video, itemvid))
  173. print(len(recmixes))
  174. for itemmix in recmixes:
  175. submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))
  176. #open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  177. #clear
  178. recvids.clear()
  179. # recchans.clear()
  180. recmixes.clear()
  181. # recplayl.clear()

    submitthreads = []
    for r in range(50):
        submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
        submitrunthread.start()
        submitthreads.append(submitrunthread)
        del submitrunthread
    for xb in submitthreads:
        xb.join()
    submitthreads.clear()

    sleep(1)
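
    # Queue one caption-export job per language for every video with community
    # captions enabled.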
    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)
        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid, "default"))
        del intvid
        del langcontent

    subthreads = []
    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, mysession))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread
    for xa in subthreads:
        xa.join()
    subthreads.clear()

    sleep(1) #wait a second to hopefully allow the other threads to finish

    # Remove empty per-video folders; rmdir fails on non-empty ones, which the
    # bare except ignores.
    for fol in listdir("out"):
        try:
            if isdir("out/"+fol):
                rmdir("out/"+fol)
        except:
            pass

    #https://stackoverflow.com/a/11968881

    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...

    # Zip up each remaining per-video folder.
    for fol in listdir("out"):
        if isdir("out/"+fol):
            make_archive("out/"+fol, "zip", "out/"+fol) #check this
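
    # Ask the tracker for an upload target, retrying every 5 minutes.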
    targetloc = None
    while not targetloc:
        targetloc = tracker.request_upload_target()
        if targetloc:
            break
        else:
            print("Waiting 5 minutes...")
            sleep(300)
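
    # Upload each zip via rsync or HTTP POST, depending on the target scheme.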
    for zipf in listdir("out"):
        if isfile("out/"+zipf) and zipf.endswith(".zip"):
            if targetloc.startswith("rsync"):
                system("rsync out/"+zipf+" "+targetloc)
            elif targetloc.startswith("http"):
                upzipf = open("out/"+zipf, "rb")
                requests.post(targetloc, data=upzipf)
                upzipf.close()

    # Report the batch as complete, with the size of each item's zip (0 if none).
    for itemb in batchcontent:
        if not itemb:
            continue
        if isfile("out/"+itemb.split(":", 1)[1]+".zip"):
            size = getsize("out/"+itemb.split(":", 1)[1]+".zip")
        else:
            size = 0
        tracker.mark_item_as_done(itemb, size)

    # clear the output directory
    rmtree("out")