archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

304 lines
19 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import dumps, loads
  7. import signal
  8. import tracker
  9. from youtube_dl import YoutubeDL
  10. from shutil import rmtree, which
  11. from queue import Queue
  12. from gc import collect
  13. from discovery import getmetadata
  14. from export import subprrun
  15. #useful Queue example: https://stackoverflow.com/a/54658363
  16. jobs = Queue()
  17. try:
  18. mkdir("out")
  19. except:
  20. pass
  21. try:
  22. mkdir("directory")
  23. except:
  24. pass
  25. HEROKU = False
  26. if isfile("../Procfile"):
  27. HEROKU = True
  28. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  29. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  30. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  31. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  32. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  33. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  34. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  35. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  36. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  37. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  38. 'xh', 'yi', 'yo', 'zu']
  39. assert which("zip") and which("rsync") and which("curl"), "Please ensure the zip, rsync, and curl commands are installed on your system."
  40. #HSID, SSID, SID cookies required
  41. if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  42. cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  43. elif isfile("config.json"):
  44. cookies = loads(open("config.json").read())
  45. else:
  46. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  47. assert False
  48. if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  49. print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  50. assert False
  51. mysession = requests.session()
  52. mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  53. validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  54. assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
  55. assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
  56. Language:
  57. </span>
  58. English
  59. </span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
  60. del validationtest
  61. open("cookies.txt", "w").write("""# HTTP Cookie File
  62. .youtube.com TRUE / FALSE 1663793455 SID [SID]
  63. .youtube.com TRUE / FALSE 1663793455 HSID [HSID]
  64. .youtube.com TRUE / TRUE 1663793455 SSID [SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
  65. del cookies
  66. validationtimes = 0
  67. shouldgetjob = True
  68. #Graceful Shutdown
  69. class GracefulKiller:
  70. kill_now = False
  71. def __init__(self):
  72. signal.signal(signal.SIGINT, self.exit_gracefully)
  73. signal.signal(signal.SIGTERM, self.exit_gracefully)
  74. def exit_gracefully(self, signum, frame):
  75. print("Graceful exit process initiated, no longer accepting new tasks but finishing existing ones...")
  76. self.kill_now = True
  77. gkiller = GracefulKiller()
  78. enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
  79. if not enres:
  80. print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
  81. shouldgetjob = False
  82. gkiller.kill_now = True #exit the script
  83. del enres
  84. #microtasks
  85. def threadrunner():
  86. global shouldgetjob
  87. global validationtimes
  88. jobs = Queue()
  89. ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
  90. while True:
  91. if not jobs.empty():
  92. task, vid, args = jobs.get()
  93. if task == "submitdiscovery":
  94. tracker.add_item_to_tracker(args, vid)
  95. elif task == "discovery":
  96. while True:
  97. try:
  98. info = getmetadata(mysession, str(vid).strip())
  99. break
  100. except BaseException as e:
  101. print(e)
  102. print("Error in retrieving information, waiting 30 seconds and trying again")
  103. sleep(30)
  104. if info[0] or info[1]: # ccenabled or creditdata
  105. if not isdir("out/"+str(vid).strip()):
  106. mkdir("out/"+str(vid).strip())
  107. if info[1]:
  108. open("out/"+str(vid).strip()+"/"+str(vid).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  109. if info[0]:
  110. for langcode in langs:
  111. jobs.put(("subtitles", vid, langcode))
  112. for langcode in langs:
  113. jobs.put(("subtitles-forceedit-metadata", vid, langcode))
  114. for langcode in langs:
  115. jobs.put(("subtitles-forceedit-captions", vid, langcode))
  116. jobs.put(("complete", None, "video:"+vid))
  117. for videodisc in info[2]:
  118. jobs.put(("submitdiscovery", videodisc, tracker.ItemType.Video))
  119. for channeldisc in info[3]:
  120. jobs.put(("submitdiscovery", channeldisc, tracker.ItemType.Channel))
  121. for mixdisc in info[4]:
  122. jobs.put(("submitdiscovery", mixdisc, tracker.ItemType.MixPlaylist))
  123. for playldisc in info[5]:
  124. jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))
  125. elif task == "subtitles":
  126. subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions)
  127. elif task == "subtitles-forceedit-captions":
  128. subprrun(mysession, args, vid, "forceedit-captions", needforcemetadata, needforcecaptions)
  129. elif task == "subtitles-forceedit-metadata":
  130. subprrun(mysession, args, vid, "forceedit-metadata", needforcemetadata, needforcecaptions)
  131. elif task == "channel":
  132. try:
  133. y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
  134. for itemyv in y["entries"]:
  135. jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
  136. jobs.put(("complete", None, "channel:"+args))
  137. except:
  138. print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/channel/"+desit.split(":", 1)[1])
  139. elif task == "playlist":
  140. try:
  141. y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
  142. for itemyvp in y["entries"]:
  143. jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
  144. jobs.put(("complete", None, "playlist:"+args))
  145. except:
  146. print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/playlist?list="+desit.split(":", 1)[1])
  147. elif task == "complete":
  148. size = 0
  149. if ":" in args:
  150. if args.split(":", 1)[0] == "video":
  151. #check if dir is empty, make zip if needed
  152. if isdir("out/"+args.split(":", 1)[1]):
  153. if not listdir("out/"+args.split(":", 1)[1]):
  154. rmdir("out/"+args.split(":", 1)[1])
  155. else:
  156. #zip it up
  157. if not isdir("directory/"+args.split(":", 1)[1]):
  158. mkdir("directory/"+args.split(":", 1)[1])
  159. while not isfile("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip"):
  160. print("Attempting to zip item...")
  161. system("zip -9 -r -j directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip out/"+args.split(":", 1)[1])
  162. #get a target
  163. targetloc = None
  164. while not targetloc:
  165. targetloc = tracker.request_upload_target()
  166. if targetloc:
  167. break
  168. else:
  169. print("Waiting 5 minutes...")
  170. sleep(300)
  171. if targetloc.startswith("rsync"):
  172. system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/"+args.split(":", 1)[1]+"/ "+targetloc)
  173. elif targetloc.startswith("http"):
  174. system("curl -F "+args.split(":", 1)[1]+".zip=@directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip "+targetloc)
  175. size = getsize("directory/"+args.split(":", 1)[1]+"/"+args.split(":", 1)[1]+".zip")
  176. #cleanup
  177. try:
  178. del langcnt[args.split(":", 1)[1]]
  179. rmtree("directory/"+args.split(":", 1)[1]+"/")
  180. rmdir("directory/"+args.split(":", 1)[1]+"/")
  181. rmtree("out/"+args.split(":", 1)[1]+"/")
  182. rmdir("out/"+args.split(":", 1)[1]+"/")
  183. except:
  184. pass
  185. tracker.mark_item_as_done(args, size)
  186. jobs.task_done()
  187. else:
  188. if not gkiller.kill_now:
  189. # get a new task from tracker
  190. collect() #cleanup
  191. #check that the account has community contributions enabled every 50th item
  192. validationtimes += 1
  193. if not validationtimes % 50:
  194. enres = getmetadata(mysession, "IjJKfe-0Ty0", True)[0]
  195. if not enres:
  196. print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
  197. shouldgetjob = False
  198. gkiller.kill_now = True #exit the script
  199. del enres
  200. if shouldgetjob:
  201. desit = tracker.request_item_from_tracker()
  202. print("New task:", desit)
  203. if desit:
  204. if desit.split(":", 1)[0] == "video":
  205. needforcemetadata = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
  206. 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
  207. 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
  208. 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
  209. 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
  210. 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
  211. 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
  212. 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
  213. 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
  214. 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
  215. 'xh': None, 'yi': None, 'yo': None, 'zu': None}
  216. needforcecaptions = {'ab': None, 'aa': None, 'af': None, 'sq': None, 'ase': None, 'am': None, 'ar': None, 'arc': None, 'hy': None, 'as': None, 'ay': None, 'az': None, 'bn': None, 'ba': None, 'eu': None, 'be': None, 'bh': None, 'bi': None, 'bs': None, 'br': None,
  217. 'bg': None, 'yue': None, 'yue-HK': None, 'ca': None, 'chr': None, 'zh-CN': None, 'zh-HK': None, 'zh-Hans': None, 'zh-SG': None, 'zh-TW': None, 'zh-Hant': None, 'cho': None, 'co': None, 'hr': None, 'cs': None, 'da': None, 'nl': None,
  218. 'nl-BE': None, 'nl-NL': None, 'dz': None, 'en': None, 'en-CA': None, 'en-IN': None, 'en-IE': None, 'en-GB': None, 'en-US': None, 'eo': None, 'et': None, 'fo': None, 'fj': None, 'fil': None, 'fi': None, 'fr': None, 'fr-BE': None,
  219. 'fr-CA': None, 'fr-FR': None, 'fr-CH': None, 'ff': None, 'gl': None, 'ka': None, 'de': None, 'de-AT': None, 'de-DE': None, 'de-CH': None, 'el': None, 'kl': None, 'gn': None, 'gu': None, 'ht': None, 'hak': None, 'hak-TW': None, 'ha': None,
  220. 'iw': None, 'hi': None, 'hi-Latn': None, 'ho': None, 'hu': None, 'is': None, 'ig': None, 'id': None, 'ia': None, 'ie': None, 'iu': None, 'ik': None, 'ga': None, 'it': None, 'ja': None, 'jv': None, 'kn': None, 'ks': None, 'kk': None, 'km': None, 'rw': None,
  221. 'tlh': None, 'ko': None, 'ku': None, 'ky': None, 'lo': None, 'la': None, 'lv': None, 'ln': None, 'lt': None, 'lb': None, 'mk': None, 'mg': None, 'ms': None, 'ml': None, 'mt': None, 'mni': None, 'mi': None, 'mr': None, 'mas': None, 'nan': None,
  222. 'nan-TW': None, 'lus': None, 'mo': None, 'mn': None, 'my': None, 'na': None, 'nv': None, 'ne': None, 'no': None, 'oc': None, 'or': None, 'om': None, 'ps': None, 'fa': None, 'fa-AF': None, 'fa-IR': None, 'pl': None, 'pt': None, 'pt-BR': None,
  223. 'pt-PT': None, 'pa': None, 'qu': None, 'ro': None, 'rm': None, 'rn': None, 'ru': None, 'ru-Latn': None, 'sm': None, 'sg': None, 'sa': None, 'sc': None, 'gd': None, 'sr': None, 'sr-Cyrl': None, 'sr-Latn': None, 'sh': None, 'sdp': None, 'sn': None,
  224. 'scn': None, 'sd': None, 'si': None, 'sk': None, 'sl': None, 'so': None, 'st': None, 'es': None, 'es-419': None, 'es-MX': None, 'es-ES': None, 'es-US': None, 'su': None, 'sw': None, 'ss': None, 'sv': None, 'tl': None, 'tg': None, 'ta': None,
  225. 'tt': None, 'te': None, 'th': None, 'bo': None, 'ti': None, 'tpi': None, 'to': None, 'ts': None, 'tn': None, 'tr': None, 'tk': None, 'tw': None, 'uk': None, 'ur': None, 'uz': None, 'vi': None, 'vo': None, 'vor': None, 'cy': None, 'fy': None, 'wo': None,
  226. 'xh': None, 'yi': None, 'yo': None, 'zu': None}
  227. jobs.put(("discovery", desit.split(":", 1)[1], None))
  228. elif desit.split(":", 1)[0] == "channel":
  229. jobs.put(("channel", None, desit.split(":", 1)[1]))
  230. elif desit.split(":", 1)[0] == "playlist":
  231. jobs.put(("playlist", None, desit.split(":", 1)[1]))
  232. else:
  233. print("Ignoring item for now", desit)
  234. else:
  235. print("Ignoring item for now", desit)
  236. else:
  237. break
  238. else:
  239. break
  240. threads = []
  241. THREADCNT = 50
  242. if HEROKU:
  243. THREADCNT = 20
  244. #now create the rest of the threads
  245. for i in range(THREADCNT):
  246. runthread = Thread(target=threadrunner)
  247. runthread.start()
  248. threads.append(runthread)
  249. del runthread
  250. #https://stackoverflow.com/a/11968881
  251. for x in threads:
  252. x.join()
  253. threads.remove(x)
  254. del x
  255. if not shouldgetjob:
  256. print("Community Contribution discovery has been disabled for this account, please report this on our Discord as this may have caused some videos to be incorrectly marked as having community contributions disabled.")
  257. print("Exiting...")