Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits

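# Worker script (summary inferred from the code below): claim batches of items
# (videos, channels, playlists) from the tracker, fetch metadata and
# community-contribution data for each video, export available captions in
# every supported language, zip the results per video, upload the archives to
# the tracker-assigned target, and report each item as done.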
from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from json import dumps, loads
import signal
from youtube_dl.utils import DownloadError
import tracker
from youtube_dl import YoutubeDL
from shutil import make_archive, rmtree
from queue import Queue
from gc import collect
from discovery import getmetadata
from export import subprrun

batchcontent = []
def batchfunc():
    # Claim items from the tracker until roughly 500 video jobs are queued.
    # Videos are queued directly; channels and playlists are expanded with
    # youtube-dl and their videos are sent back to the tracker.
    ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
    while jobs.qsize() < 501:
        desit = tracker.request_item_from_tracker()
        if desit:
            if desit.split(":", 1)[0] == "video":
                jobs.put(desit.split(":", 1)[1])
            elif desit.split(":", 1)[0] == "channel":
                y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
                for itemyv in y["entries"]:
                    tracker.add_item_to_tracker(tracker.ItemType.Video, itemyv["id"])
            elif desit.split(":", 1)[0] == "playlist":
                y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
                for itemyvp in y["entries"]:
                    tracker.add_item_to_tracker(tracker.ItemType.Video, itemyvp["id"])
            else:
                print("Ignoring item for now", desit)
            # keep the full "type:id" string so the item can be reported as done later
            batchcontent.append(desit)
        else:
            print("Ignoring item for now", desit)
def submitfunc(submitqueue):
    while not submitqueue.empty():
        itype, ival = submitqueue.get()
        tracker.add_item_to_tracker(itype, ival)

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"
# Caption language codes requested for every caption-enabled video.
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
    'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
    'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
    'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
    'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
    'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
    'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
    'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
    'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
    'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
    'xh', 'yi', 'yo', 'zu']
#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []

recvids = set()
recchans = set()
recmixes = set()
recplayl = set()
#HSID, SSID, SID cookies required
if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    cookies = loads(open("config.json").read())
else:
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False

if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False

mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})

# Write a Netscape-format cookies.txt for youtube-dl (fields are tab-separated).
open("cookies.txt", "w").write("""# HTTP Cookie File
.youtube.com	TRUE	/	FALSE	1663793455	SID	[SID]
.youtube.com	TRUE	/	FALSE	1663793455	HSID	[HSID]
.youtube.com	TRUE	/	TRUE	1663793455	SSID	[SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))

del cookies
#Graceful Shutdown
class GracefulKiller:
    kill_now = False

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True

gkiller = GracefulKiller()
def prrun():
    # Worker thread: pull video IDs from the jobs queue, fetch metadata and
    # community-contribution data, record discovered items, and note which
    # videos need caption export.
    while not jobs.empty():
        global recvids
        global recchans
        global recmixes
        global recplayl
        global ccenabledl

        item = jobs.get()
        print("Video ID:", str(item).strip())

        while True:
            try:
                info = getmetadata(str(item).strip())
                break
            except BaseException as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                #raise
                sleep(30)

        # Add any discovered videos
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])

        if info[0] or info[1]: # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())

        if info[1]: # creditdata
            open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))

        if info[0]: #ccenabled
            ccenabledl.append(item)

        jobs.task_done()

    return True
while not gkiller.kill_now:
    collect() #cleanup

    try:
        mkdir("out")
    except:
        pass

    batchcontent.clear()

    # Get a batch ID
    batchthreads = []
    for r in range(50):
        batchrunthread = Thread(target=batchfunc)
        batchrunthread.start()
        batchthreads.append(batchrunthread)
        del batchrunthread

    for xc in batchthreads:
        xc.join() # wait for every batch thread before continuing
    batchthreads.clear()

    #for ir in range(501):
    #    batchcontent.append(tracker.request_item_from_tracker())

    threads = []
    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread

    for x in threads:
        x.join() # wait for every worker thread before continuing
    threads.clear()
  150. print("Sending discoveries to tracker...")
  151. submitjobs = Queue()
  152. #IDK how to handle mixes so just send them for now
  153. print("Videos:", len(recvids))
  154. for itemvid in recvids:
  155. submitjobs.put((tracker.ItemType.Video, itemvid))
  156. print("Channels:", len(recchans))
  157. for itemchan in recchans:
  158. submitjobs.put((tracker.ItemType.Channel, itemchan))
  159. print("Mix Playlists:", len(recmixes))
  160. for itemmix in recmixes:
  161. submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))
  162. print("Playlists:", len(recplayl))
  163. for itemplayl in recplayl:
  164. submitjobs.put((tracker.ItemType.Playlist, itemplayl))
  165. #open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  166. #clear
  167. recvids.clear()
  168. recchans.clear()
  169. recmixes.clear()
  170. recplayl.clear()
    submitthreads = []
    for r in range(50):
        submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
        submitrunthread.start()
        submitthreads.append(submitrunthread)
        del submitrunthread

    for xb in submitthreads:
        xb.join() # wait for every submit thread before continuing
    submitthreads.clear()

    sleep(1)
    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)

        # queue one caption-export job per language for this video
        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid, "default"))

        del intvid
        del langcontent

    subthreads = []
    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, mysession))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread

    for xa in subthreads:
        xa.join() # wait for every caption-export thread before continuing
    subthreads.clear()

    sleep(1) #wait a second to hopefully allow the other threads to finish
    for fol in listdir("out"): #remove extra folders
        try:
            if isdir("out/"+fol):
                rmdir("out/"+fol)
        except:
            pass

    #https://stackoverflow.com/a/11968881

    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...

    for fol in listdir("out"):
        if isdir("out/"+fol):
            make_archive("out/"+fol, "zip", "out/"+fol) #check this

    targetloc = None
    while not targetloc:
        targetloc = tracker.request_upload_target()
        if targetloc:
            break
        else:
            print("Waiting 5 minutes...")
            sleep(300)
    for zipf in listdir("out"):
        if isfile("out/"+zipf) and zipf.endswith(".zip"):
            if targetloc.startswith("rsync"):
                system("rsync out/"+zipf+" "+targetloc)
            elif targetloc.startswith("http"):
                upzipf = open("out/"+zipf, "rb")
                requests.post(targetloc, data=upzipf)
                upzipf.close()
            #upload it!
    # Report the batch as complete
    for itemb in batchcontent:
        if isfile("out/"+itemb.split(":", 1)[1]+".zip"):
            size = getsize("out/"+itemb.split(":", 1)[1]+".zip")
        else:
            size = 0
        tracker.mark_item_as_done(itemb, size)

    # clear the output directory
    rmtree("out")