Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits

from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from json import dumps, loads
import signal
from youtube_dl.utils import DownloadError
import tracker
from youtube_dl import YoutubeDL
from shutil import make_archive, rmtree
from queue import Queue, Empty
from gc import collect
from discovery import getmetadata
from export import subprrun

batchcontent = []
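
# batchfunc: claim "type:id" items from the tracker until the job queue is
# full; videos are queued for scraping directly, while channels and playlists
# are expanded into their individual videos and fed back to the tracker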
def batchfunc():
    ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
    while jobs.qsize() < 501:
        desit = tracker.request_item_from_tracker()
        if desit:
            if desit.split(":", 1)[0] == "video":
                jobs.put(desit.split(":", 1)[1])
            elif desit.split(":", 1)[0] == "channel":
                y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
                for itemyv in y["entries"]:
                    tracker.add_item_to_tracker(tracker.ItemType.Video, itemyv["id"])
            elif desit.split(":", 1)[0] == "playlist":
                y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
                for itemyvp in y["entries"]:
                    tracker.add_item_to_tracker(tracker.ItemType.Video, itemyvp["id"])
            else:
                print("Ignoring item for now", desit)
            # keep the full "type:id" item so the batch can be marked done later
            batchcontent.append(desit)
        else:
            # tracker returned nothing; stop instead of calling .split() on None
            break

def submitfunc(submitqueue):
    while True:
        try:
            # non-blocking get: an empty()-then-get() pair can hang when
            # another worker takes the last entry between the two calls
            itype, ival = submitqueue.get(block=False)
        except Empty:
            break
        tracker.add_item_to_tracker(itype, ival)
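
# Caption language codes to try; each cc-enabled video gets one export job per code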
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
         'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
         'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
         'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
         'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
         'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
         'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
         'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
         'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
         'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
         'xh', 'yi', 'yo', 'zu']

#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []   # videos with community captions enabled, for the export pass
recvids = set()   # discovered videos
recchans = set()  # discovered channels
recmixes = set()  # discovered mix playlists
recplayl = set()  # discovered playlists
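
# HSID, SSID and SID cookies from a logged-in youtube.com session are required:
# they become headers on the shared requests session and are written to
# cookies.txt for youtube-dl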
if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    cookies = loads(open("config.json").read())
else:
    raise SystemExit("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")

if not (cookies.get("HSID") and cookies.get("SSID") and cookies.get("SID")):
    raise SystemExit("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")

mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US"})

# fields in a Netscape cookie file must be tab-separated for youtube-dl to parse them
open("cookies.txt", "w").write("""# HTTP Cookie File
.youtube.com\tTRUE\t/\tFALSE\t1663793455\tSID\t[SID]
.youtube.com\tTRUE\t/\tFALSE\t1663793455\tHSID\t[HSID]
.youtube.com\tTRUE\t/\tTRUE\t1663793455\tSSID\t[SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))

del cookies

#Graceful Shutdown
class GracefulKiller:
    kill_now = False

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True

gkiller = GracefulKiller()
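
# prrun: worker that drains the jobs queue; for each video it fetches metadata
# via getmetadata, collects discovered videos/channels/playlists, saves any
# published caption credits, and notes videos with community captions enabled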
def prrun():
    global recvids, recchans, recmixes, recplayl, ccenabledl
    while True:
        try:
            # non-blocking get: with 50 workers, empty()-then-get() can hang
            # when another thread claims the last job between the two calls
            item = jobs.get(block=False)
        except Empty:
            break

        print("Video ID:", str(item).strip())
        while True:
            try:
                info = getmetadata(str(item).strip())
                break
            except BaseException as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                #raise
                sleep(30)

        # Add any discovered videos, channels and playlists
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])

        if info[0] or info[1]: # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())

        if info[1]: # creditdata
            open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))

        if info[0]: #ccenabled
            ccenabledl.append(item)

        jobs.task_done()
    return True
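
# Main loop: claim a batch, scrape it, report discoveries, export captions,
# zip and upload the results, then mark the batch done; repeats until
# SIGINT/SIGTERM flips gkiller.kill_now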
while not gkiller.kill_now:
    collect() #cleanup

    try:
        mkdir("out")
    except FileExistsError:
        pass
    try:
        mkdir("directory")
    except FileExistsError:
        pass

    batchcontent.clear()

    # Claim a batch of items from the tracker with 50 threads
    batchthreads = []
    for r in range(50):
        batchrunthread = Thread(target=batchfunc)
        batchrunthread.start()
        batchthreads.append(batchrunthread)
        del batchrunthread
    # join every thread before moving on; removing list entries while iterating
    # skipped threads and once let the script end before the last one finished
    for xc in batchthreads:
        xc.join()
    batchthreads.clear()

    #for ir in range(501):
    #    batchcontent.append(tracker.request_item_from_tracker())

    # Scrape metadata and caption credits for the batch with 50 workers
    threads = []
    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread
    for x in threads:
        x.join()
    threads.clear()
  152. print("Sending discoveries to tracker...")
  153. submitjobs = Queue()
  154. #IDK how to handle mixes so just send them for now
  155. print("Videos:", len(recvids))
  156. for itemvid in recvids:
  157. submitjobs.put((tracker.ItemType.Video, itemvid))
  158. print("Channels:", len(recchans))
  159. for itemchan in recchans:
  160. submitjobs.put((tracker.ItemType.Channel, itemchan))
  161. print("Mix Playlists:", len(recmixes))
  162. for itemmix in recmixes:
  163. submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))
  164. print("Playlists:", len(recplayl))
  165. for itemplayl in recplayl:
  166. submitjobs.put((tracker.ItemType.Playlist, itemplayl))
  167. #open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  168. #clear
  169. recvids.clear()
  170. recchans.clear()
  171. recmixes.clear()
  172. recplayl.clear()
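
    # Push the queued discoveries to the tracker with 50 submitter threads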
    submitthreads = []
    for r in range(50):
        submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
        submitrunthread.start()
        submitthreads.append(submitrunthread)
        del submitrunthread
    for xb in submitthreads:
        xb.join()
    submitthreads.clear()
    sleep(1)
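
    # Fan out caption export jobs: one (language, video, "default") tuple per
    # cc-enabled video for every language in langs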
    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)
        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid, "default"))
        del intvid
        del langcontent

    subthreads = []
    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, mysession))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread
    for xa in subthreads:
        xa.join()
    subthreads.clear()
    sleep(1) #wait a second to hopefully allow the other threads to finish
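
    # Package results: drop video folders that ended up empty, then zip each
    # remaining folder into directory/ for upload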
    for fol in listdir("out"):
        try:
            if isdir("out/"+fol):
                rmdir("out/"+fol) #rmdir only removes empty directories
        except OSError:
            pass

    #https://stackoverflow.com/a/11968881
    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...
    for fol in listdir("out"):
        if isdir("out/"+fol):
            make_archive("directory/"+fol, "zip", "out/"+fol)

    # Get an upload target from the tracker, retrying every 5 minutes
    targetloc = None
    while not targetloc:
        targetloc = tracker.request_upload_target()
        if not targetloc:
            print("Waiting 5 minutes...")
            sleep(300)

    # Upload every zip in the relative directory/ folder to the target
    system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 directory/ "+targetloc)

    # Report each batch item as done, with the size of its zip under directory/
    # (0 if the item produced no output)
    for itemb in batchcontent:
        if isfile("directory/"+itemb.split(":", 1)[1]+".zip"):
            size = getsize("directory/"+itemb.split(":", 1)[1]+".zip")
        else:
            size = 0
        tracker.mark_item_as_done(itemb, size)

    # clear the output directories so stale data isn't re-uploaded next batch
    rmtree("out")
    rmtree("directory")