archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 

246 linhas
10 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import dumps, loads
  7. import signal
  8. import tracker
  9. from youtube_dl import YoutubeDL
  10. from shutil import make_archive, rmtree
  11. from queue import Queue
  12. from gc import collect
  13. from discovery import getmetadata
  14. from export import subprrun
  15. #useful Queue example: https://stackoverflow.com/a/54658363
  16. jobs = Queue()
  17. langcnt = {}
  18. try:
  19. mkdir("out")
  20. except:
  21. pass
  22. try:
  23. mkdir("directory")
  24. except:
  25. pass
  26. HEROKU = False
  27. if isfile("../Procfile"):
  28. HEROKU = True
  29. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  30. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  31. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  32. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  33. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  34. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  35. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  36. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  37. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  38. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  39. 'xh', 'yi', 'yo', 'zu']
  40. #HSID, SSID, SID cookies required
  41. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  42. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  43. elif isfile("config.json"):
  44. cookies = loads(open("config.json").read())
  45. else:
  46. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  47. assert False
  48. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  49. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  50. assert False
# One shared HTTP session that sends the three account cookies and asks for
# English (US) pages on every request.
mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
# Fetch one caption-editor page up front to validate the cookies and the account language.
validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
# A final URL on accounts.google.com is treated as "cookies not accepted".
assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
# The page must contain the footer language picker showing "English".
# NOTE(review): this literal spans several lines and the paste this file was
# recovered from dropped leading whitespace; the whitespace inside the literal
# must be confirmed against the upstream file or this check cannot match.
assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
Language:
</span>
English
</span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
# The response is no longer needed once both checks have passed.
del validationtest
  61. open("cookies.txt", "w").write("""# HTTP Cookie File
  62. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  63. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  64. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  65. del cookies
  66. #Graceful Shutdown
  67. class GracefulKiller:
  68. kill_now = False
  69. def __init__(self):
  70. signal.signal(signal.SIGINT, self.exit_gracefully)
  71. signal.signal(signal.SIGTERM, self.exit_gracefully)
  72. def exit_gracefully(self,signum, frame):
  73. self.kill_now = True
  74. gkiller = GracefulKiller()
  75. #TODO: zipping, completion of subtitles (return value), limit task retrieval count
  76. #microtasks
  77. def threadrunner(jobs: Queue):
  78. global langcnt
  79. ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
  80. while not gkiller.kill_now:
  81. if not jobs.empty():
  82. task, vid, args = jobs.get()
  83. if task == "submitdiscovery":
  84. tracker.add_item_to_tracker(args, vid)
  85. elif task == "discovery":
  86. while True:
  87. try:
  88. info = getmetadata(mysession, str(vid).strip())
  89. break
  90. except BaseException as e:
  91. print(e)
  92. print("Error in retrieving information, waiting 30 seconds")
  93. sleep(30)
  94. if info[0] or info[1]: # ccenabled or creditdata
  95. if not isdir("out/"+str(vid).strip()):
  96. mkdir("out/"+str(vid).strip())
  97. if info[1]:
  98. open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  99. if info[0]:
  100. langcnt[vid] = 0
  101. for langcode in langs:
  102. jobs.put(("subtitles", vid, langcode))
  103. else:
  104. jobs.put(("complete", None, "video:"+vid))
  105. for videodisc in info[2]:
  106. jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
  107. for channeldisc in info[3]:
  108. jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
  109. for mixdisc in info[4]:
  110. jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
  111. for playldisc in info[5]:
  112. jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
  113. elif task == "subtitles":
  114. subprrun(jobs, mysession, args, vid, "default")
  115. langcnt[vid] += 1
  116. if langcnt[vid] >= 195:
  117. pass #complete(?)
  118. elif task == "subtitles-forceedit-captions":
  119. subprrun(jobs, mysession, args, vid, "forceedit-captions")
  120. elif task == "subtitles-forceedit-metadata":
  121. subprrun(jobs, mysession, args, vid, "forceedit-metadata")
  122. elif task == "channel":
  123. y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
  124. for itemyv in y["entries"]:
  125. jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
  126. jobs.put(("complete", None, "channel:"+args))
  127. elif task == "playlist":
  128. y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
  129. for itemyvp in y["entries"]:
  130. jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
  131. jobs.put(("complete", None, "playlist:"+args))
  132. elif task == "complete":
  133. size = 0
  134. if ":" in args:
  135. if args.split(":", 1)[0] == "video":
  136. #check if dir is empty, make zip if needed
  137. if isfile("directory/"+args.split(":", 1)[1]+".zip"):
  138. size = getsize("directory/"+args.split(":", 1)[1]+".zip")
  139. #get a target
  140. targetloc = None
  141. while not targetloc:
  142. targetloc = tracker.request_upload_target()
  143. if targetloc:
  144. break
  145. else:
  146. print("Waiting 5 minutes...")
  147. sleep(300)
  148. if targetloc.startswith("rsync"):
  149. system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 --files-from=- directory/ "+targetloc)
  150. elif targetloc.startswith("http"):
  151. for filzip in listdir("directory"):
  152. if filzip.endswith(".zip"):
  153. system("curl -F "+filzip+"=@directory/"+filzip+" "+targetloc)
  154. tracker.mark_item_as_done(args, size)
  155. jobs.task_done()
  156. else:
  157. # get a new task from tracker
  158. collect() #cleanup
  159. desit = tracker.request_item_from_tracker()
  160. if desit:
  161. if desit.split(":", 1)[0] == "video":
  162. jobs.put(("discovery", desit.split(":", 1)[1], None))
  163. elif desit.split(":", 1)[0] == "channel":
  164. jobs.put(("channel", None, desit.split(":", 1)[1]))
  165. elif desit.split(":", 1)[0] == "playlist":
  166. jobs.put(("playlist", None, desit.split(":", 1)[1]))
  167. else:
  168. print("Ignoring item for now", desit)
  169. else:
  170. print("Ignoring item for now", desit)
  171. def prrun():
  172. print("Video ID:", str(item).strip())
  173. while not gkiller.kill_now:
  174. threads = []
  175. for i in range(50):
  176. runthread = Thread(target=prrun)
  177. runthread.start()
  178. threads.append(runthread)
  179. del runthread
  180. for x in threads:
  181. x.join()
  182. threads.remove(x)
  183. del x
  184. for fol in listdir("out"): #remove empty folders
  185. try:
  186. if isdir("out/"+fol):
  187. rmdir("out/"+fol)
  188. except:
  189. pass
  190. #https://stackoverflow.com/a/11968881
  191. for fol in listdir("out"):
  192. if isdir("out/"+fol):
  193. make_archive("directory/"+fol, "zip", "out/"+fol)
  194. # clear the output directories
  195. rmtree("out")
  196. rmtree("directory")