Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits
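"""Worker script for archiving YouTube community contributions
(unpublished captions, title and description translations, and caption
credits): requests video IDs in batches from a tracker, collects caption
and credit data for each video, zips the results, and uploads them."""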
from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from json import dumps, loads
import signal
import tracker
from youtube_dl import YoutubeDL
from shutil import make_archive, rmtree
from queue import Queue
from gc import collect
from discovery import getmetadata
from export import subprrun
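# tracker, discovery, and export are presumably local modules shipped
# alongside this script: tracker talks to the batch tracker, discovery
# fetches video metadata, and export retrieves the caption data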
batchcontent = []

def batchfunc():
    # Each batch thread pulls items from the tracker until the shared
    # batch holds about 500 items (the length check is racy across the
    # 50 threads, so the batch may slightly exceed 500)
    while len(batchcontent) < 500:
        batchcontent.append(tracker.request_item_from_tracker())

def submitfunc(submitqueue):
    # Drain the submission queue, reporting each discovered item to the tracker
    while not submitqueue.empty():
        itype, ival = submitqueue.get()
        tracker.add_item_to_tracker(itype, ival)

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"
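# Caption language codes to try for each video; the export step below
# queues one job per (language, video) pair, so this list is assumed to
# cover every language YouTube's caption editor offers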
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
         'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
         'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
         'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
         'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
         'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
         'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
         'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
         'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
         'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
         'xh', 'yi', 'yo', 'zu']
#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []
recvids = set()
recchans = set()
recmixes = set()
recplayl = set()

#HSID, SSID, SID cookies required
if "HSID" in environ and "SSID" in environ and "SID" in environ:
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    cookies = loads(open("config.json").read())
else:
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    raise SystemExit(1)

if not (cookies.get("HSID") and cookies.get("SSID") and cookies.get("SID")):
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    raise SystemExit(1)

mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US"})
del cookies
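# For reference, config.json holds the same three cookies, e.g.
# (placeholder values, not real cookies):
# {"HSID": "...", "SSID": "...", "SID": "..."}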
#Graceful Shutdown
class GracefulKiller:
    kill_now = False

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True

gkiller = GracefulKiller()
def prrun():
    global recvids
    global recchans
    global recmixes
    global recplayl
    global ccenabledl
    while not jobs.empty():
        item = jobs.get()
        print("Video ID:", str(item).strip())
        # Retry until the video's metadata has been retrieved
        while True:
            try:
                info = getmetadata(str(item).strip())
                break
            except Exception as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                #raise
                sleep(30)
        # getmetadata returns (ccenabled, creditdata, videos, channels, mixes, playlists)
        ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True})
        # Expand newly seen channels and playlists into video IDs
        for chaninfo in info[3]:
            if chaninfo not in recchans:
                y = ydl.extract_info("https://www.youtube.com/channel/"+chaninfo, download=False)
                for itemyv in y["entries"]:
                    recvids.add(itemyv["id"])
        for playlinfo in info[5]:
            if playlinfo not in recplayl:
                y = ydl.extract_info("https://www.youtube.com/playlist?list="+playlinfo, download=False)
                for itemyvp in y["entries"]:
                    recvids.add(itemyvp["id"])
        # Record all discovered items
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])
        if info[0] or info[1]:  # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())
            if info[1]:  # creditdata
                open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
            if info[0]:  # ccenabled
                ccenabledl.append(item)
        jobs.task_done()
    return True
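# Main worker loop: each pass requests a batch of items from the tracker,
# processes the videos, submits discoveries, exports captions, then zips,
# uploads, and reports the batch as done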
while not gkiller.kill_now:
    collect()  # free memory between batches
    try:
        mkdir("out")
    except FileExistsError:
        pass
    batchcontent.clear()
    # Request a batch of items from the tracker
    batchthreads = []
    for r in range(50):
        batchrunthread = Thread(target=batchfunc)
        batchrunthread.start()
        batchthreads.append(batchrunthread)
        del batchrunthread
    # Join every thread before continuing; removing threads from the list
    # while iterating over it skips entries and can leave threads running
    for xc in batchthreads:
        xc.join()
    del batchthreads
    #for ir in range(501):
    #    batchcontent.append(tracker.request_item_from_tracker())
    # Queue video items; anything else (or an empty item) is skipped for now
    for desit in batchcontent:
        if desit and desit.split(":", 1)[0] == "video":
            jobs.put(desit.split(":", 1)[1])
        else:
            print("Ignoring item for now", desit)
    # Process the batch with 50 worker threads
    threads = []
    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread
    for x in threads:
        x.join()
    del threads
  147. print("Sending discoveries to tracker...")
  148. submitjobs = Queue()
  149. #don't send channels and playlists as those have already been converted for video IDs
  150. #IDK how to handle mixes so send them for now
  151. print(len(recvids))
  152. for itemvid in recvids:
  153. submitjobs.put((tracker.ItemType.Video, itemvid))
  154. print(len(recmixes))
  155. for itemmix in recmixes:
  156. submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))
  157. #open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  158. #clear
  159. recvids.clear()
  160. recchans.clear()
  161. recmixes.clear()
  162. recplayl.clear()
  163. submitthreads = []
  164. for r in range(50):
  165. submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
  166. submitrunthread.start()
  167. submitthreads.append(submitrunthread)
  168. del submitrunthread
  169. for xb in submitthreads:
  170. xb.join() #bug (occurred once: the script ended before the last thread finished)
  171. submitthreads.remove(xb)
  172. del xb
  173. sleep(1)
    # Queue one (language, video ID, "default") job per language for every
    # video with community captions enabled
    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)
        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid, "default"))
        del intvid
        del langcontent
    subthreads = []
    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, mysession))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread
    for xa in subthreads:
        xa.join()
    del subthreads
    sleep(1)  # wait a second to hopefully allow the other threads to finish
    for fol in listdir("out"):  # remove empty folders (rmdir fails on non-empty ones, which are kept)
        try:
            if isdir("out/"+fol):
                rmdir("out/"+fol)
        except OSError:
            pass
    #https://stackoverflow.com/a/11968881
    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...
    # Zip up each remaining video folder
    for fol in listdir("out"):
        if isdir("out/"+fol):
            make_archive("out/"+fol, "zip", "out/"+fol)  # check this
    # Request an upload target from the tracker, retrying every 5 minutes
    targetloc = None
    while not targetloc:
        targetloc = tracker.request_upload_target()
        if not targetloc:
            print("Waiting 5 minutes...")
            sleep(300)
    # Upload each zip file to the target
    for zipf in listdir("out"):
        if isfile("out/"+zipf) and zipf.endswith(".zip"):
            if targetloc.startswith("rsync"):
                system("rsync out/"+zipf+" "+targetloc)
            elif targetloc.startswith("http"):
                upzipf = open("out/"+zipf, "rb")
                requests.post(targetloc, data=upzipf)
                upzipf.close()
    # Report each item in the batch as complete, with the size of its zip (if any)
    for itemb in batchcontent:
        if isfile("out/"+itemb.split(":", 1)[1]+".zip"):
            size = getsize("out/"+itemb.split(":", 1)[1]+".zip")
        else:
            size = 0
        tracker.mark_item_as_done(itemb, size)
    # Clear the output directory for the next batch
    rmtree("out")