archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.
 
 

224 rader
7.0 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir
  5. from os.path import isdir
  6. from json import dumps, loads
  7. from shutil import make_archive, rmtree
  8. from queue import Queue
  9. from discovery import getmetadata
  10. from export import subprrun
  11. WORKER_VERSION = 1
  12. SERVER_BASE_URL = "http://localhost:5000"
  13. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  14. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  15. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  16. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  17. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  18. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  19. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  20. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  21. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  22. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  23. 'xh', 'yi', 'yo', 'zu']
  24. #useful Queue example: https://stackoverflow.com/a/54658363
  25. jobs = Queue()
  26. ccenabledl = []
  27. recvids = set()
  28. recchans = set()
  29. recmixes = set()
  30. recplayl = set()
  31. #HSID, SSID, SID cookies required
  32. cookies = loads(open("config.json").read())
  33. headers = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}
  34. del cookies
  35. def prrun():
  36. while not jobs.empty():
  37. global recvids
  38. global recchans
  39. global recmixes
  40. global recplayl
  41. global ccenabledl
  42. item = jobs.get()
  43. print("Video ID:", str(item).strip())
  44. while True:
  45. try:
  46. info = getmetadata(str(item).strip())
  47. break
  48. except BaseException as e:
  49. print(e)
  50. print("Error in retrieving information, waiting 30 seconds")
  51. #raise
  52. sleep(30)
  53. # Add any discovered videos
  54. recvids.update(info[2])
  55. recchans.update(info[3])
  56. recmixes.update(info[4])
  57. recplayl.update(info[5])
  58. if info[0] or info[1]: # ccenabled or creditdata
  59. if not isdir("out/"+str(item).strip()):
  60. mkdir("out/"+str(item).strip())
  61. if info[1]: # creditdata
  62. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  63. if info[0]: #ccenabled
  64. ccenabledl.append(item)
  65. jobs.task_done()
  66. return True
  67. # Get a worker ID
  68. while True:
  69. params = (
  70. ("worker_version", WORKER_VERSION),
  71. )
  72. idrequest = requests.get(SERVER_BASE_URL+"/worker/getID", params=params)
  73. if idrequest.status_code == 200:
  74. WORKER_ID = idrequest.text
  75. break
  76. else:
  77. print("Error in retrieving ID, will attempt again in 10 minutes")
  78. sleep(600)
  79. while True:
  80. try:
  81. mkdir("out")
  82. except:
  83. pass
  84. # Get a batch ID
  85. while True:
  86. params = (
  87. ("id", WORKER_ID),
  88. ("worker_version", WORKER_VERSION),
  89. )
  90. batchrequest = requests.get(SERVER_BASE_URL+"/worker/getBatch", params=params)
  91. if batchrequest.status_code == 200:
  92. batchinfo = batchrequest.json()
  93. if batchinfo["content"] != "Fail":
  94. break
  95. print("Error in retrieving batch assignment, will attempt again in 10 minutes")
  96. sleep(600)
  97. print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
  98. # Process the batch
  99. batchcontent = requests.get(batchinfo["content"]).text.split("\n")
  100. while batchcontent:
  101. jobs.put(batchcontent.pop(0))
  102. threads = []
  103. for i in range(50):
  104. runthread = Thread(target=prrun)
  105. runthread.start()
  106. threads.append(runthread)
  107. del runthread
  108. for x in threads:
  109. x.join()
  110. threads.remove(x)
  111. del x
  112. open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  113. #clear
  114. recvids.clear()
  115. recchans.clear()
  116. recmixes.clear()
  117. recplayl.clear()
  118. subtjobs = Queue()
  119. while ccenabledl:
  120. langcontent = langs.copy()
  121. intvid = ccenabledl.pop(0)
  122. while langcontent:
  123. subtjobs.put((langcontent.pop(0), intvid))
  124. del intvid
  125. del langcontent
  126. subthreads = []
  127. for r in range(5):
  128. subrunthread = Thread(target=subprrun, args=(subtjobs,headers))
  129. subrunthread.start()
  130. subthreads.append(subrunthread)
  131. del subrunthread
  132. for xa in subthreads:
  133. xa.join()
  134. subthreads.remove(xa)
  135. del xa
  136. # while True:
  137. # gsres = False
  138. # try:
  139. # gsres = getsubs(str(item).strip())
  140. # except BaseException as e:
  141. # print(e)
  142. # if gsres:
  143. # break
  144. # else:
  145. # print("Error in retrieving subtitles, waiting 30 seconds")
  146. # sleep(30)
  147. #https://stackoverflow.com/a/11968881
  148. # TODO: put the data somewhere...
  149. # TODO: put the discoveries somewhere...
  150. make_archive("out", "zip", "out") #check this
  151. # while True:
  152. # try:
  153. # uploadr = requests.post("https://transfersh.com/"+str(batchinfo["batchID"])+".zip", data=open("out.zip"))
  154. # if uploadr.status_code == 200:
  155. # resulturl = uploadr.text
  156. # break
  157. # except BaseException as e:
  158. # print(e)
  159. # print("Encountered error in uploading results... retrying in 10 minutes")
  160. # sleep(600)
  161. # Report the batch as complete (I can't think of a fail condition except for a worker exiting...)
  162. # TODO: handle worker exit
  163. while True:
  164. params = (
  165. ("id", WORKER_ID),
  166. ("worker_version", WORKER_VERSION),
  167. ("batchID", batchinfo["batchID"]),
  168. ("randomKey", batchinfo["randomKey"]),
  169. ("status", "c"),
  170. #("resulturl", resulturl),
  171. )
  172. statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)
  173. if statusrequest.status_code == 200 and statusrequest.text == "Success":
  174. break
  175. else:
  176. print("Error in reporting success, will attempt again in 10 minutes")
  177. sleep(600)
  178. # TODO: clear the output directory
  179. rmtree("out")