Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits
from threading import Thread
import requests
from time import sleep
from os import mkdir
from os.path import isdir
from json import dumps, loads
from shutil import make_archive, rmtree
from queue import Queue

from discovery import getmetadata
from export import subprrun

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"
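# Interfaces assumed from the call sites in this file (the implementations
# live in discovery.py and export.py, which are not shown here):
#   getmetadata(video_id) -> (ccenabled, creditdata, video_ids, channel_ids, mix_ids, playlist_ids)
#   subprrun(job_queue, headers)  # thread target that drains (language, video_id) tuples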
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
         'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
         'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
         'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
         'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
         'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
         'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
         'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
         'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
         'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
         'xh', 'yi', 'yo', 'zu']
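# The list above appears to cover every caption language/variant code YouTube
# accepts; each caption-enabled video is later paired with every one of these
# codes when subtitle-export jobs are queued below.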
#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []

recvids = set()
recchans = set()
recmixes = set()
recplayl = set()

#HSID, SSID, SID cookies required
cookies = loads(open("config.json").read())
headers = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}
del cookies
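# Expected config.json shape, inferred from the keys read above (the values
# are the corresponding youtube.com cookies):
#   {"HSID": "...", "SSID": "...", "SID": "..."}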
def prrun():
    global recvids
    global recchans
    global recmixes
    global recplayl
    global ccenabledl

    while not jobs.empty():
        item = jobs.get()
        print("Video ID:", str(item).strip())

        # Retry metadata retrieval until it succeeds
        while True:
            try:
                info = getmetadata(str(item).strip())
                break
            except BaseException as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                #raise
                sleep(30)

        # Add any discovered videos
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])

        if info[0] or info[1]:  # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())

        if info[1]:  # creditdata
            with open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w") as creditfile:
                creditfile.write(dumps(info[1]))

        if info[0]:  # ccenabled
            ccenabledl.append(item)

        jobs.task_done()
    return True
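# Tracker protocol, as inferred from the requests below: register once via
# /worker/getID, then loop forever: fetch an assignment from /worker/getBatch,
# process it, and report completion via /worker/updateStatus.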
# Get a worker ID
while True:
    params = (
        ("worker_version", WORKER_VERSION),
    )
    idrequest = requests.get(SERVER_BASE_URL+"/worker/getID", params=params)

    if idrequest.status_code == 200:
        WORKER_ID = idrequest.text
        break
    else:
        print("Error in retrieving ID, will attempt again in 10 minutes")
        sleep(600)
while True:
    try:
        mkdir("out")
    except FileExistsError:
        pass
    # Get a batch ID
    while True:
        params = (
            ("id", WORKER_ID),
            ("worker_version", WORKER_VERSION),
        )
        batchrequest = requests.get(SERVER_BASE_URL+"/worker/getBatch", params=params)

        if batchrequest.status_code == 200:
            batchinfo = batchrequest.json()
            if batchinfo["content"] != "Fail":
                break

        print("Error in retrieving batch assignment, will attempt again in 10 minutes")
        sleep(600)

    print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
    # Process the batch
    batchcontent = requests.get(batchinfo["content"]).text.split("\n")

    while batchcontent:
        jobs.put(batchcontent.pop(0))

    threads = []
    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread

    # Join every thread; removing entries from the list while iterating over it
    # would skip threads, so the list is cleared only after all joins complete.
    for x in threads:
        x.join()
    threads.clear()

    with open("out/discoveries.json", "w") as discovfile:
        discovfile.write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
    # Clear the discovery sets for the next batch
    recvids.clear()
    recchans.clear()
    recmixes.clear()
    recplayl.clear()
    # Queue one subtitle-export job per (language, video) pair
    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)

        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid))
        del intvid
        del langcontent

    subthreads = []
    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, headers))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread

    # Join every subtitle thread before moving on; mutating the list during
    # iteration once let the script end before the last thread finished.
    for xa in subthreads:
        xa.join()
    subthreads.clear()

    sleep(1)
    # while True:
    #     gsres = False
    #     try:
    #         gsres = getsubs(str(item).strip())
    #     except BaseException as e:
    #         print(e)
    #     if gsres:
    #         break
    #     else:
    #         print("Error in retrieving subtitles, waiting 30 seconds")
    #         sleep(30)

    #https://stackoverflow.com/a/11968881
    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...
    make_archive("out", "zip", "out") #check this

    # while True:
    #     try:
    #         uploadr = requests.post("https://transfersh.com/"+str(batchinfo["batchID"])+".zip", data=open("out.zip"))
    #         if uploadr.status_code == 200:
    #             resulturl = uploadr.text
    #             break
    #     except BaseException as e:
    #         print(e)
    #         print("Encountered error in uploading results... retrying in 10 minutes")
    #         sleep(600)
    # Report the batch as complete (I can't think of a fail condition except for a worker exiting...)
    # TODO: handle worker exit
    while True:
        params = (
            ("id", WORKER_ID),
            ("worker_version", WORKER_VERSION),
            ("batchID", batchinfo["batchID"]),
            ("randomKey", batchinfo["randomKey"]),
            ("status", "c"),
            #("resulturl", resulturl),
        )
        statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)

        if statusrequest.status_code == 200 and statusrequest.text == "Success":
            break
        else:
            print("Error in reporting success, will attempt again in 10 minutes")
            sleep(600)
    # Clear the output directory before the next batch recreates it
    rmtree("out")