Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits

from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir, environ
from os.path import isdir, isfile
from json import dumps, loads
from shutil import make_archive, rmtree
from queue import Queue, Empty
from gc import collect

from discovery import getmetadata
from export import subprrun

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"
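
# A note on the tracker protocol, inferred from the calls below: the server at
# SERVER_BASE_URL hands out a worker ID, then batch assignments, and accepts a
# status report per batch. WORKER_VERSION is sent with every request,
# presumably so the server can turn away outdated workers.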

langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']
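
# (The list above appears to be every language code YouTube's community
# captioning supported; one subtitle-export job per language is queued for
# each caption-enabled video near the end of every batch.)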

#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []   # videos with community captions enabled, awaiting subtitle export
recvids = set()   # video IDs discovered while scraping
recchans = set()  # channel IDs discovered while scraping
recmixes = set()  # mix IDs discovered while scraping
recplayl = set()  # playlist IDs discovered while scraping

#HSID, SSID, SID cookies required
if "HSID" in environ and "SSID" in environ and "SID" in environ:
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    cookies = loads(open("config.json").read())
else:
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    raise SystemExit(1)  # an assert here would be stripped under python -O

# .get() so a config.json with missing keys prints the friendly message
# instead of raising a KeyError.
if not (cookies.get("HSID") and cookies.get("SSID") and cookies.get("SID")):
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    raise SystemExit(1)

mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
del cookies
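
# The session attaches the account cookies as a raw Cookie header on every
# request; presumably that is what lets discovery.getmetadata and
# export.subprrun see unpublished contributions that are only visible to a
# signed-in user.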

def prrun():
    # All of these are shared across the 50 scraper threads; set.update() and
    # list.append() are effectively safe here thanks to the GIL.
    global recvids
    global recchans
    global recmixes
    global recplayl
    global ccenabledl

    while True:
        # get_nowait() instead of an empty() check followed by a blocking
        # get(): with the check-then-get pattern a thread can hang forever if
        # another thread grabs the last item in between.
        try:
            item = jobs.get_nowait()
        except Empty:
            break

        print("Video ID:", str(item).strip())

        # Retry metadata retrieval until it succeeds.
        while True:
            try:
                info = getmetadata(str(item).strip())
                break
            except BaseException as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                #raise
                sleep(30)

        # Add any discovered videos
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])

        if info[0] or info[1]: # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())

        if info[1]: # creditdata
            with open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w") as credfile:
                credfile.write(dumps(info[1]))

        if info[0]: #ccenabled
            ccenabledl.append(item)

        jobs.task_done()
    return True
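
# Overall flow: register with the tracker once, then loop forever; each pass
# fetches a batch of video IDs, scrapes metadata/credits with 50 threads,
# exports subtitles with another 50, zips the results, and reports the batch
# as complete.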

# Get a worker ID
while True:
    params = (
        ("worker_version", WORKER_VERSION),
    )
    idrequest = requests.get(SERVER_BASE_URL+"/worker/getID", params=params)

    if idrequest.status_code == 200:
        WORKER_ID = idrequest.text
        break
    else:
        print("Error in retrieving ID, will attempt again in 10 minutes")
        sleep(600)

while True:
    collect() #cleanup

    try:
        mkdir("out")
    except FileExistsError:
        pass

    # Get a batch ID
    while True:
        params = (
            ("id", WORKER_ID),
            ("worker_version", WORKER_VERSION),
        )
        batchrequest = requests.get(SERVER_BASE_URL+"/worker/getBatch", params=params)

        if batchrequest.status_code == 200:
            batchinfo = batchrequest.json()
            if batchinfo["content"] != "Fail":
                break

        print("Error in retrieving batch assignment, will attempt again in 10 minutes")
        sleep(600)

    print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
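
    # batchinfo["content"] is fetched and split on newlines below, so it is
    # evidently a URL to a plain-text list of video IDs, one per line.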
    # Process the batch
    batchcontent = requests.get(batchinfo["content"]).text.split("\n")

    while batchcontent:
        jobs.put(batchcontent.pop(0))

    threads = []

    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread

    # Join every scraper thread. (Removing items from the list while
    # iterating over it would silently skip every other thread.)
    for x in threads:
        x.join()

    with open("out/discoveries.json", "w") as discovfile:
        discovfile.write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))

    #clear
    recvids.clear()
    recchans.clear()
    recmixes.clear()
    recplayl.clear()
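
    # Fan out the subtitle work: one (language code, video ID, "default")
    # tuple per language for every caption-enabled video, consumed by
    # export.subprrun in the threads below.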
    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)

        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid, "default"))
        del intvid
        del langcontent

    subthreads = []

    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, mysession))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread

    # Join every subtitle thread; as above, the list must not be mutated
    # during iteration, or some threads are never joined (this once let the
    # script end before the last thread finished).
    for xa in subthreads:
        xa.join()

    sleep(1) #wait a second to hopefully allow the other threads to finish

    for fol in listdir("out"): #remove empty per-video folders
        try:
            if isdir("out/"+fol):
                rmdir("out/"+fol)  # rmdir only removes empty directories
        except OSError:
            pass

    #https://stackoverflow.com/a/11968881
    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...

    make_archive("out", "zip", "out") #check this
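
    # The upload below is commented out, so for now the zipped results simply
    # stay on disk as out.zip (hence the TODOs above).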
    # while True:
    #     try:
    #         uploadr = requests.post("https://transfersh.com/"+str(batchinfo["batchID"])+".zip", data=open("out.zip", "rb"))
    #         if uploadr.status_code == 200:
    #             resulturl = uploadr.text
    #             break
    #     except BaseException as e:
    #         print(e)
    #         print("Encountered error in uploading results... retrying in 10 minutes")
    #         sleep(600)

    # Report the batch as complete (I can't think of a fail condition except for a worker exiting...)
    # TODO: handle worker exit
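    # "randomKey" comes back with the batch assignment; presumably it proves
    # this worker was actually assigned the batch it is reporting on, and
    # status "c" appears to mean "complete".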
    while True:
        params = (
            ("id", WORKER_ID),
            ("worker_version", WORKER_VERSION),
            ("batchID", batchinfo["batchID"]),
            ("randomKey", batchinfo["randomKey"]),
            ("status", "c"),
            #("resulturl", resulturl),
        )
        statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)

        if statusrequest.status_code == 200 and statusrequest.text == "Success":
            break
        else:
            print("Error in reporting success, will attempt again in 10 minutes")
            sleep(600)

    # TODO: clear the output directory
    rmtree("out")