archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.
 
 

254 rader
12 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import dumps, loads
  7. import signal
  8. import tracker
  9. from youtube_dl import YoutubeDL
  10. from shutil import rmtree, which
  11. from queue import Queue
  12. from gc import collect
  13. from discovery import getmetadata
  14. from export import subprrun
  #useful Queue example: https://stackoverflow.com/a/54658363
# Shared work queue. Every entry is a (task, vid, args) tuple; the worker
# threads both consume entries and put follow-up entries back.
jobs = Queue()

# Per-video progress counter, keyed by video id.
langcnt = {}

# Working directories, created on first run. Only "already exists" is
# tolerated; any other failure (e.g. a permission error) is allowed to surface
# here instead of resurfacing later as a confusing write error.
for _dirname in ("out", "directory"):
    try:
        mkdir(_dirname)
    except FileExistsError:
        pass

# True when a Procfile sits one directory above the working directory.
HEROKU = isfile("../Procfile")

# Language codes for which a "subtitles" job is queued per video.
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
         'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
         'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
         'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
         'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
         'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
         'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
         'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
         'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
         'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
         'xh', 'yi', 'yo', 'zu']
  # Startup check: shutil.which() must locate zip, rsync and curl on PATH;
  # these commands are later run through os.system() when a finished video is
  # packaged and uploaded.
  # NOTE(review): the checks in this section rely on `assert`, which Python
  # skips entirely when started with -O.
  40. assert which("zip") and which("rsync") and which("curl"), "Please ensure the zip, rsync, and curl commands are installed on your system."
  41. #HSID, SSID, SID cookies required
  # Cookie source precedence: the three environment variables win; otherwise
  # config.json in the working directory is parsed as JSON; with neither
  # available the hint is printed and the script aborts.
  42. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  43. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  44. elif isfile("config.json"):
  # NOTE(review): the file object returned by open() is never closed explicitly.
  45. cookies = loads(open("config.json").read())
  46. else:
  47. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  48. assert False
  # All three values must be non-empty. A config.json that lacks one of the
  # keys raises KeyError on this line rather than printing the hint below.
  49. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  50. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  51. assert False
  # One requests session shared by the worker threads; it sends the account
  # cookies and an en-US Accept-Language header with every request.
  52. mysession = requests.session()
  53. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  # Validation request: load the caption editor page for a fixed video id.
  # If the final URL contains accounts.google.com the cookies are treated as
  # invalid.
  54. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  55. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  # Language check: the response body must contain the footer language-picker
  # button markup exactly as spelled below, with the label "English".
  # NOTE(review): this is an exact substring match, so whitespace inside the
  # multi-line literal matters. The text may have lost leading whitespace when
  # it was copied from a numbered listing -- verify against the live page.
  56. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  57. Language:
  58. </span>
  59. English
  60. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  # The response is only needed for the two checks above.
  61. del validationtest
  # Write the three cookies to cookies.txt, the file the worker threads hand
  # to youtube-dl through its "cookiefile" option. The [SID]/[HSID]/[SSID]
  # placeholders are substituted with the real values.
  # NOTE(review): Netscape-style cookie files are normally TAB-separated; the
  # field separators below appear as single spaces here -- confirm they are
  # tabs in the real file. The file handle is also never closed explicitly.
  62. open("cookies.txt", "w").write("""# HTTP Cookie File
  63. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  64. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  65. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  # Drop the module-level reference to the raw cookie values; they remain in
  # the session headers and in cookies.txt.
  66. del cookies
  #Graceful Shutdown
class GracefulKiller:
    """Record that SIGINT or SIGTERM was received.

    Creating an instance installs `exit_gracefully` as the handler for both
    signals. The worker loops poll `kill_now` and stop taking new work once it
    is True.
    """

    # Class-level default; exit_gracefully() sets True on the instance.
    kill_now = False

    def __init__(self):
        # signal.signal() may only be called from the main thread.
        for signum in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signum, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        """Signal handler: announce the shutdown and raise the flag."""
        print("Graceful exit process initiated, stopping all tasks...")
        self.kill_now = True


# Single shared instance polled by the worker threads.
gkiller = GracefulKiller()
  #microtasks
def _queue_listing(ydl, jobs, url, item):
    """Queue every video of a channel/playlist listing, then queue `item` for completion.

    `url` is the listing page handed to youtube-dl and `item` is the tracker
    item string ("channel:<id>" or "playlist:<id>"). The call retries until
    youtube-dl succeeds.
    NOTE(review): there is no delay between retries and no retry limit.
    """
    while True:
        try:
            listing = ydl.extract_info(url, download=False)
            for entry in listing["entries"]:
                jobs.put(("submitdiscovery", entry["id"], tracker.ItemType.Video))
            jobs.put(("complete", None, item))
            break
        except Exception:
            print("YouTube-DL error, ignoring but not marking as complete...", url)


def _package_and_upload(video_id):
    """Zip out/<video_id>, upload the archive, clean up and return its size in bytes.

    Returns 0 when the output directory is missing or empty (an empty
    directory is removed).
    NOTE(review): the shell commands are built by string concatenation from
    tracker-supplied values and run through os.system(); ids containing shell
    metacharacters are not escaped.
    """
    outdir = "out/" + video_id
    if not isdir(outdir):
        return 0
    if not listdir(outdir):
        rmdir(outdir)
        return 0

    #zip it up
    stagedir = "directory/" + video_id
    zippath = stagedir + "/" + video_id + ".zip"
    if not isdir(stagedir):
        mkdir(stagedir)
    while not isfile(zippath):
        print("Attempting to zip item...")
        system("zip -9 -r -j " + zippath + " " + outdir)

    #get a target
    targetloc = tracker.request_upload_target()
    while not targetloc:
        print("Waiting 5 minutes...")
        sleep(300)
        targetloc = tracker.request_upload_target()

    if targetloc.startswith("rsync"):
        system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 --files-from=- " + stagedir + "/ " + targetloc)
    elif targetloc.startswith("http"):
        system("curl -F " + video_id + ".zip=@" + zippath + " " + targetloc)

    size = getsize(zippath)

    #cleanup
    # Each tree is removed independently so that a failure on one cannot skip
    # the other (previously the first error aborted the whole cleanup and
    # left out/<id> behind).
    rmtree(stagedir, ignore_errors=True)
    rmtree(outdir, ignore_errors=True)
    return size


def threadrunner(jobs: Queue):
    """Worker loop: process queued jobs until a graceful shutdown is requested.

    Each queue entry is a (task, vid, args) tuple. Handled tasks:

    * "submitdiscovery" -- report item `vid` of type `args` to the tracker.
    * "discovery"       -- fetch metadata for video `vid`, store published
      credits, queue one "subtitles" job per language and report discovered
      videos/channels/mixes/playlists.
    * "subtitles" / "subtitles-forceedit-captions" /
      "subtitles-forceedit-metadata" -- delegate to `subprrun`.
    * "channel" / "playlist" -- list the videos of the channel/playlist whose
      id is in `args`.
    * "complete"        -- package and upload a finished video (if any output
      exists) and mark the item `args` as done on the tracker.

    When the queue is empty a new item is requested from the tracker.
    """
    # Local import: the file's import block only brings in Queue.
    from queue import Empty

    ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
    while not gkiller.kill_now:
        try:
            # Non-blocking get: checking empty() and then calling a blocking
            # get() lets another worker take the last entry in between, which
            # would park this thread forever and make it miss the shutdown flag.
            task, vid, args = jobs.get_nowait()
        except Empty:
            # get a new task from tracker
            collect() #cleanup
            desit = tracker.request_item_from_tracker()
            print("New task:", desit)
            if desit:
                parts = desit.split(":", 1)
                if parts[0] == "video":
                    jobs.put(("discovery", parts[1], None))
                elif parts[0] == "channel":
                    jobs.put(("channel", None, parts[1]))
                elif parts[0] == "playlist":
                    jobs.put(("playlist", None, parts[1]))
                else:
                    print("Ignoring item for now", desit)
            else:
                print("Ignoring item for now", desit)
            continue

        if task == "submitdiscovery":
            tracker.add_item_to_tracker(args, vid)
        elif task == "discovery":
            video_id = str(vid).strip()
            while True:
                try:
                    info = getmetadata(mysession, video_id)
                    break
                except Exception as e:
                    print(e)
                    print("Error in retrieving information, waiting 30 seconds")
                    sleep(30)
            if info[0] or info[1]: # ccenabled or creditdata
                if not isdir("out/"+video_id):
                    mkdir("out/"+video_id)
                if info[1]:
                    with open("out/"+video_id+"/"+video_id+"_published_credits.json", "w") as credits_file:
                        credits_file.write(dumps(info[1]))
            if info[0]:
                langcnt[vid] = 0
                for langcode in langs:
                    jobs.put(("subtitles", vid, langcode))
            else:
                jobs.put(("complete", None, "video:"+vid))
            for videodisc in info[2]:
                jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
            for channeldisc in info[3]:
                jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
            for mixdisc in info[4]:
                jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
            for playldisc in info[5]:
                jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
        elif task == "subtitles":
            retval = subprrun(jobs, mysession, args, vid, "default")
            # NOTE(review): this read-modify-write is not protected by a lock
            # although many workers update the same counter.
            langcnt[vid] += retval
            # 585 == 3 * 195, i.e. three times the number of language codes
            # queued per video -- presumably subprrun reports up to three
            # units per language; confirm against export.subprrun.
            if langcnt[vid] >= 585:
                jobs.put(("complete", None, "video:"+vid))
        elif task == "subtitles-forceedit-captions":
            subprrun(jobs, mysession, args, vid, "forceedit-captions")
        elif task == "subtitles-forceedit-metadata":
            subprrun(jobs, mysession, args, vid, "forceedit-metadata")
        elif task == "channel":
            # The id travels in `args`. The idle-branch variable `desit` must
            # not be used here: it is unbound or stale in the thread that
            # happens to pick this job up.
            _queue_listing(ydl, jobs, "https://www.youtube.com/channel/"+args, "channel:"+args)
        elif task == "playlist":
            _queue_listing(ydl, jobs, "https://www.youtube.com/playlist?list="+args, "playlist:"+args)
        elif task == "complete":
            size = 0
            if ":" in args:
                item_kind, item_id = args.split(":", 1)
                if item_kind == "video":
                    size = _package_and_upload(item_id)
                    # The counter may not exist (videos without captions never
                    # get one), so drop it without raising.
                    langcnt.pop(item_id, None)
            tracker.mark_item_as_done(args, size)
        jobs.task_done()
  # Worker pool: start THREADCNT threads running threadrunner() on the shared
  # job queue, then wait for all of them to finish.
threads = []
THREADCNT = 50
if HEROKU:
    THREADCNT = 20
#now create the rest of the threads
for i in range(THREADCNT):
    runthread = Thread(target=threadrunner, args=(jobs,))
    runthread.start()
    threads.append(runthread)
    del runthread
#https://stackoverflow.com/a/11968881
# Join every worker. Removing entries from `threads` while iterating over it
# skips every other element, so only half of the workers used to be joined
# before "Exiting..." was printed; the list is cleared afterwards instead.
for x in threads:
    x.join()
threads.clear()
print("Exiting...")