Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits
 
 

251 lines · 8.2 KiB

from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from json import dumps, loads
import signal
import tracker
from youtube_dl import YoutubeDL
from shutil import make_archive, rmtree
from queue import Queue
from gc import collect
from discovery import getmetadata
from export import subprrun

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"
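# Language codes to request subtitles in; one export job is queued per
# (language, video) pair further down.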
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']
#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()
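# Shared state, mutated by the worker threads:
# ccenabledl collects videos that have community captions enabled;
# the rec* sets collect discovered video, channel, mix, and playlist IDs.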
ccenabledl = []
recvids = set()
recchans = set()
recmixes = set()
recplayl = set()
#HSID, SSID, SID cookies required
if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    cookies = loads(open("config.json").read())
else:
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False

if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False
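# Session that sends the YouTube authentication cookies with every request.
# If config.json is used, it holds the same three cookies as a JSON object,
# e.g. {"HSID": "...", "SSID": "...", "SID": "..."}.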
mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
del cookies
#Graceful Shutdown
class GracefulKiller:
    kill_now = False

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True

gkiller = GracefulKiller()
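# Worker thread body: drains the jobs queue, fetching metadata for each video
# and expanding discovered channels and playlists into video IDs.
# Judging by its use below, getmetadata returns a 6-tuple:
# (ccenabled, creditdata, video IDs, channel IDs, mix IDs, playlist IDs).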
def prrun():
    while not jobs.empty():
        global recvids
        global recchans
        global recmixes
        global recplayl
        global ccenabledl
        item = jobs.get()
        print("Video ID:", str(item).strip())
        while True:
            try:
                info = getmetadata(str(item).strip())
                break
            except BaseException as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                #raise
                sleep(30)
        ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True})
        for chaninfo in info[3]:
            if chaninfo not in recchans:
                y = ydl.extract_info("https://www.youtube.com/channel/"+chaninfo, download=False)
                # distinct loop variable: reusing `item` here clobbered the current video ID
                for entry in y["entries"]:
                    recvids.add(entry["id"])
        for playlinfo in info[5]:
            if playlinfo not in recplayl:
                y = ydl.extract_info("https://www.youtube.com/playlist?list="+playlinfo, download=False)
                for entry in y["entries"]:
                    recvids.add(entry["id"])
        # Add any discovered videos, channels, mixes, and playlists
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])
        if info[0] or info[1]: # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())
        if info[1]: # creditdata
            open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
        if info[0]: #ccenabled
            ccenabledl.append(item)
        jobs.task_done()
    return True
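# Main loop: each iteration fetches a batch of items from the tracker,
# processes them, reports discoveries, exports subtitles, and uploads the results.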
while not gkiller.kill_now:
    collect() #cleanup
    try:
        mkdir("out")
    except:
        pass
    # Fetch a batch of items from the tracker
    batchcontent = []
    for ir in range(501):
        batchcontent.append(tracker.request_item_from_tracker())
    for desit in batchcontent:
        if desit:
            if desit.split(":", 1)[0] == "video":
                jobs.put(desit.split(":", 1)[1])
            else:
                print("Ignoring item for now", desit)
        else:
            print("Ignoring item for now", desit)
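    # Process the batch with 50 concurrent worker threads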
    threads = []
    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread
    for x in threads:
        x.join()
        del x
    # removing threads from the list while iterating over it skipped every
    # other thread, so the list is cleared only after all joins complete
    threads.clear()
    print("Sending discoveries to tracker...")
    # Don't send channels and playlists; those have already been converted to video IDs.
    # Mixes are sent as-is for now since it's unclear how to handle them.
    print(len(recvids))
    for itemvid in recvids:
        tracker.add_item_to_tracker(tracker.ItemType.Video, itemvid)
    print(len(recmixes))
    for itemmix in recmixes: # was iterating recvids, which resubmitted every video as a mix
        tracker.add_item_to_tracker(tracker.ItemType.MixPlaylist, itemmix)
    #open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
    # Clear discovery state for the next batch
    recvids.clear()
    recchans.clear()
    recmixes.clear()
    recplayl.clear()
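    # Queue one subtitle-export job per (language, video) pair for every
    # video that has community captions enabled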
    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)
        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid, "default"))
        del intvid
        del langcontent
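    # Run the subtitle exports with 50 threads (subprrun comes from export.py)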
    subthreads = []
    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, mysession))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread
    for xa in subthreads:
        xa.join()
        del xa
    # as above, removing from the list mid-iteration skipped threads; this is the
    # likely cause of the bug where the script once ended before the last thread finished
    subthreads.clear()
    sleep(1) #wait a second to hopefully allow the other threads to finish
    for fol in listdir("out"): #remove extra (empty) folders
        try:
            if isdir("out/"+fol):
                rmdir("out/"+fol) # rmdir only removes empty directories; non-empty ones raise and are kept
        except:
            pass
    #https://stackoverflow.com/a/11968881
    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...
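    # Zip up each remaining per-video folder for upload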
    for fol in listdir("out"):
        if isdir("out/"+fol):
            make_archive("out/"+fol, "zip", "out/"+fol) #check this
    targetloc = None
    while not targetloc:
        targetloc = tracker.request_upload_target()
        if targetloc:
            break
        else:
            print("Waiting 5 minutes...")
            sleep(300)
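    # Upload each zip: rsync targets shell out to rsync, HTTP(S) targets
    # receive a POST with the raw zip file as the request body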
    for zipf in listdir("out"):
        if isfile("out/"+zipf) and zipf.endswith(".zip"): # was `isfile(zipf) in zipf.endswith(".zip")`, a typo that raised TypeError
            if targetloc.startswith("rsync"):
                system("rsync out/"+zipf+" "+targetloc)
            elif targetloc.startswith("http"):
                upzipf = open("out/"+zipf, "rb")
                requests.post(targetloc, data=upzipf)
                upzipf.close()
            #upload it!
    # Report the batch as complete
    for itemb in batchcontent:
        if not itemb: # skip empty tracker responses ignored above
            continue
        if isfile("out/"+itemb.split(":", 1)[1]+".zip"):
            size = getsize("out/"+itemb.split(":", 1)[1]+".zip")
        else:
            size = 0
        tracker.mark_item_as_done(itemb, size)
    # Clear the output directory
    rmtree("out")