Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits

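# Worker script: pulls video items from the tracker, saves published caption
# credits via discovery.getmetadata, queues per-language caption exports
# through export.subprrun, and zips the results under out/.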

from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir, environ
from os.path import isdir, isfile
from json import dumps, loads
import signal
import tracker
from shutil import make_archive, rmtree
from queue import Queue, Empty
from gc import collect
from discovery import getmetadata
from export import subprrun

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"
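
# All caption language codes YouTube offers; prrun() collects the videos that
# have community captions enabled, and one export job per language is queued
# for each of them further down.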
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']

#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []
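
# IDs discovered while processing items; dumped to out/discoveries.json after
# each batch and then cleared.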
recvids = set()
recchans = set()
recmixes = set()
recplayl = set()

#HSID, SSID, SID cookies required
if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    cookies = loads(open("config.json").read())
else:
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False

if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False
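
# The cookies authenticate every request as a signed-in account; community
# contribution data (including unpublished captions) is only served to
# logged-in users.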
mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
del cookies

#Graceful Shutdown
class GracefulKiller:
    kill_now = False

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True

gkiller = GracefulKiller()
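# gkiller.kill_now is only polled at the top of the main loop, so SIGINT and
# SIGTERM let the current batch finish instead of interrupting it mid-write.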

def prrun():
    global recvids
    global recchans
    global recmixes
    global recplayl
    global ccenabledl

    while True:
        # get_nowait() instead of an empty()/get() pair: all jobs are queued
        # before the threads start, and this avoids the race where the queue
        # drains between the emptiness check and a blocking get().
        try:
            item = jobs.get_nowait()
        except Empty:
            break

        print("Video ID:", str(item).strip())
        while True:
            try:
                info = getmetadata(str(item).strip())
                break
            except BaseException as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                #raise
                sleep(30)

        # Add any discovered videos, channels, mixes, and playlists
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])

        if info[0] or info[1]: # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())

        if info[1]: # creditdata
            open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))

        if info[0]: # ccenabled
            ccenabledl.append(item)

        jobs.task_done()

    return True
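
# Main worker loop: fetch a batch of items from the tracker, process them with
# a pool of threads, export captions, then archive the results.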
while not gkiller.kill_now:
    collect() #cleanup

    try:
        mkdir("out")
    except:
        pass

    # Request a batch of items from the tracker
    batchcontent = []
    for ir in range(501):
        batchcontent.append(tracker.request_item_from_tracker())

    while batchcontent:
        desit = batchcontent.pop(0)
        if desit.split(":", 1)[0] == "video":
            jobs.put(desit)
        else:
            print("Ignoring item for now", desit)

    threads = []
    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread

    # Join every worker thread; removing items from the list while iterating
    # over it would skip every other thread.
    for x in threads:
        x.join()
    threads.clear()

    open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))

    #clear
    recvids.clear()
    recchans.clear()
    recmixes.clear()
    recplayl.clear()
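
    # One export job per (language, video) pair; the third tuple field
    # ("default") is passed through to export.subprrun with the rest of
    # the job.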
    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)

        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid, "default"))

        del intvid
        del langcontent

    subthreads = []
    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, mysession))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread

    # Join every subtitle thread. Removing items from the list while iterating
    # over it skips every other thread, which once let the script move on
    # before the last thread had finished.
    for xa in subthreads:
        xa.join()
    subthreads.clear()

    sleep(1) #wait a second to hopefully allow the other threads to finish

    for fol in listdir("out"): #remove extra (empty) folders
        try:
            if isdir("out/"+fol):
                rmdir("out/"+fol)
        except:
            pass

    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...

    #https://stackoverflow.com/a/11968881
    make_archive("out", "zip", "out") #check this

    # while True:
    #     try:
    #         uploadr = requests.post("https://transfersh.com/"+str(batchinfo["batchID"])+".zip", data=open("out.zip"))
    #         if uploadr.status_code == 200:
    #             resulturl = uploadr.text
    #             break
    #     except BaseException as e:
    #         print(e)
    #         print("Encountered error in uploading results... retrying in 10 minutes")
    #         sleep(600)

    # Report the batch as complete (I can't think of a fail condition except
    # for a worker exiting...)
    # TODO: handle worker exit
    # NOTE: WORKER_ID and batchinfo are never defined in this file (they look
    # like leftovers from the old batch-based tracker API), so this block
    # raises NameError as written.
    while True:
        params = (
            ("id", WORKER_ID),
            ("worker_version", WORKER_VERSION),
            ("batchID", batchinfo["batchID"]),
            ("randomKey", batchinfo["randomKey"]),
            ("status", "c"),
            #("resulturl", resulturl),
        )
        statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)

        if statusrequest.status_code == 200 and statusrequest.text == "Success":
            break
        else:
            print("Error in reporting success, will attempt again in 10 minutes")
            sleep(600)

    # Clear the output directory for the next batch
    rmtree("out")