Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 

312 lignes
11 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import dumps, loads
  7. import signal
  8. import tracker
  9. from youtube_dl import YoutubeDL
  10. from shutil import make_archive, rmtree
  11. from queue import Queue
  12. from gc import collect
  13. from discovery import getmetadata
  14. from export import subprrun
  15. batchcontent = []
  16. def batchfunc():
  17. ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
  18. while jobs.qsize() < 251:
  19. desit = tracker.request_item_from_tracker()
  20. if desit:
  21. if desit.split(":", 1)[0] == "video":
  22. jobs.put(desit.split(":", 1)[1])
  23. elif desit.split(":", 1)[0] == "channel":
  24. y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
  25. for itemyv in y["entries"]:
  26. tracker.add_item_to_tracker(tracker.ItemType.Video, itemyv["id"])
  27. elif desit.split(":", 1)[0] == "playlist":
  28. y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
  29. for itemyvp in y["entries"]:
  30. tracker.add_item_to_tracker(tracker.ItemType.Video, itemyvp["id"])
  31. else:
  32. print("Ignoring item for now", desit)
  33. else:
  34. print("Ignoring item for now", desit)
  35. batchcontent.append(desit.split(":", 1)[1])
  36. def submitfunc(submitqueue):
  37. while not submitqueue.empty():
  38. itype, ival = submitqueue.get()
  39. tracker.add_item_to_tracker(itype, ival)
# Caption language codes accepted by YouTube's community-contribution editor;
# one caption-export job per (language, video) pair is queued at the end of
# each batch.
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']

#useful Queue example: https://stackoverflow.com/a/54658363
# Video IDs claimed for the current batch; consumed by prrun() worker threads.
jobs = Queue()

# Videos found to have community captions enabled; drained into caption-export
# jobs after metadata processing.
ccenabledl = []
# Items discovered while scraping, accumulated across prrun() workers and
# submitted back to the tracker once per batch.
recvids = set()
recchans = set()
recmixes = set()
recplayl = set()
  58. #HSID, SSID, SID cookies required
  59. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  60. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  61. elif isfile("config.json"):
  62. cookies = loads(open("config.json").read())
  63. else:
  64. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  65. assert False
  66. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  67. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  68. assert False
  69. mysession = requests.session()
  70. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  71. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  72. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  73. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  74. Language:
  75. </span>
  76. English
  77. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  78. del validationtest
  79. open("cookies.txt", "w").write("""# HTTP Cookie File
  80. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  81. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  82. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  83. del cookies
  84. #Graceful Shutdown
  85. class GracefulKiller:
  86. kill_now = False
  87. def __init__(self):
  88. signal.signal(signal.SIGINT, self.exit_gracefully)
  89. signal.signal(signal.SIGTERM, self.exit_gracefully)
  90. def exit_gracefully(self,signum, frame):
  91. self.kill_now = True
  92. gkiller = GracefulKiller()
  93. def prrun():
  94. while not jobs.empty():
  95. global recvids
  96. global recchans
  97. global recmixes
  98. global recplayl
  99. global ccenabledl
  100. item = jobs.get()
  101. print("Video ID:", str(item).strip())
  102. while True:
  103. try:
  104. info = getmetadata(mysession, str(item).strip())
  105. break
  106. except BaseException as e:
  107. print(e)
  108. print("Error in retrieving information, waiting 30 seconds")
  109. sleep(30)
  110. # Add any discovered videos
  111. recvids.update(info[2])
  112. recchans.update(info[3])
  113. recmixes.update(info[4])
  114. recplayl.update(info[5])
  115. if info[0] or info[1]: # ccenabled or creditdata
  116. if not isdir("out/"+str(item).strip()):
  117. mkdir("out/"+str(item).strip())
  118. if info[1]: # creditdata
  119. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  120. if info[0]: #ccenabled
  121. ccenabledl.append(item)
  122. jobs.task_done()
  123. return True
# Main batch loop: claim a batch of items from the tracker, process them with
# worker threads, submit discoveries, export captions, zip the per-video
# output, upload it, and report completion -- repeating until SIGINT/SIGTERM.
while not gkiller.kill_now:
    collect() #cleanup

    # Ensure the working directories exist (ignore "already exists").
    try:
        mkdir("out")
    except:
        pass

    try:
        mkdir("directory")
    except:
        pass

    batchcontent.clear()

    # Get a batch ID
    # Claim items from the tracker with 50 concurrent batchfunc workers.
    batchthreads = []

    for r in range(50):
        batchrunthread = Thread(target=batchfunc)
        batchrunthread.start()
        batchthreads.append(batchrunthread)
        del batchrunthread

    # NOTE(review): removing from the list while iterating it skips every
    # other element, so not every thread is join()ed here; the sleep(1)
    # below papers over that. The same pattern recurs for threads,
    # submitthreads, and subthreads further down.
    for xc in batchthreads:
        xc.join()
        batchthreads.remove(xc)
        del xc

    sleep(1) # prevent the script from continuing before the last thread finishes

    # Process the queued video IDs with 50 concurrent prrun workers.
    threads = []

    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread

    for x in threads:
        x.join()
        threads.remove(x)
        del x

    # Queue every item discovered during this batch for submission.
    print("Sending discoveries to tracker...")

    submitjobs = Queue()

    # IDK how to handle mixes so just send them for now
    print("Videos:", len(recvids))

    for itemvid in recvids:
        submitjobs.put((tracker.ItemType.Video, itemvid))

    print("Channels:", len(recchans))

    for itemchan in recchans:
        submitjobs.put((tracker.ItemType.Channel, itemchan))

    print("Mix Playlists:", len(recmixes))

    for itemmix in recmixes:
        submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))

    print("Playlists:", len(recplayl))

    for itemplayl in recplayl:
        submitjobs.put((tracker.ItemType.Playlist, itemplayl))

    # open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))

    # clear lists
    recvids.clear()
    recchans.clear()
    recmixes.clear()
    recplayl.clear()

    # Submit the discoveries with 50 concurrent submitfunc workers.
    submitthreads = []

    for r in range(50):
        submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
        submitrunthread.start()
        submitthreads.append(submitrunthread)
        del submitrunthread

    for xb in submitthreads:
        xb.join()
        submitthreads.remove(xb)
        del xb

    sleep(1) # prevent the script from continuing before the last thread finishes

    # Queue one (language, video_id, "default") caption-export job per
    # language for every video that has community captions enabled.
    subtjobs = Queue()

    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)

        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid, "default"))

        del intvid
        del langcontent

    # Export subtitles with 50 concurrent subprrun workers.
    subthreads = []

    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs,mysession))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread

    for xa in subthreads:
        xa.join()
        subthreads.remove(xa)
        del xa

    sleep(30) # wait 30 seconds to hopefully allow the other threads to finish

    for fol in listdir("out"): #remove empty folders
        try:
            # rmdir only succeeds on empty directories; non-empty ones raise
            # and are kept for archiving below.
            if isdir("out/"+fol):
                rmdir("out/"+fol)
        except:
            pass

    #https://stackoverflow.com/a/11968881
    # Zip each remaining per-video folder into directory/<id>.zip.
    for fol in listdir("out"):
        if isdir("out/"+fol):
            make_archive("directory/"+fol, "zip", "out/"+fol)

    # Ask the tracker for an upload target, retrying every 5 minutes.
    targetloc = None

    while not targetloc:
        targetloc = tracker.request_upload_target()
        if targetloc:
            break
        else:
            print("Waiting 5 minutes...")
            sleep(300)

    # Upload via rsync or HTTP POST depending on the returned target.
    # NOTE(review): the rsync invocation passes --files-from=- but system()
    # supplies nothing on stdin, and the curl command interpolates file names
    # unquoted -- confirm both against the tracker's upload protocol.
    if targetloc.startswith("rsync"):
        system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 --files-from=- directory/ "+targetloc)
    elif targetloc.startswith("http"):
        for filzip in listdir("directory"):
            if filzip.endswith(".zip"):
                system("curl -F "+filzip+"=@directory/"+filzip+" "+targetloc)

    # Report the batch as complete
    # NOTE(review): batchfunc appends only the bare item ID to batchcontent,
    # yet itemb.split(":", 1)[1] is applied again here, which raises
    # IndexError unless entries contain a ":" -- verify what batchcontent is
    # expected to hold.
    for itemb in batchcontent:
        if isfile("directory/"+itemb.split(":", 1)[1]+".zip"):
            size = getsize("directory/"+itemb.split(":", 1)[1]+".zip")
        else:
            size = 0

        tracker.mark_item_as_done(itemb, size)

    # clear the output directories
    rmtree("out")
    rmtree("directory")