archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits
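
# Worker script: registers with the tracker server, fetches batches of video
# IDs, scrapes each video's metadata and community captions, zips the output,
# and reports the batch complete.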
from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir
from os.path import isdir
from json import dumps, loads
from shutil import make_archive, rmtree
from queue import Queue
from gc import collect

from discovery import getmetadata
from export import subprrun

WORKER_VERSION = 1
SERVER_BASE_URL = "http://localhost:5000"
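
# Language codes YouTube accepts for community captions; every video with
# community captions enabled is checked in each of these languages.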
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']
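
# Shared state for the worker threads: the job queue of video IDs, the list
# of videos with community captions enabled, and sets of newly discovered
# video, channel, mix, and playlist IDs.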
#useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

ccenabledl = []

recvids = set()
recchans = set()
recmixes = set()
recplayl = set()

#HSID, SSID, SID cookies required
with open("config.json") as configfile:
    cookies = loads(configfile.read())
headers = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}
del cookies
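
# Worker thread body: drain the job queue, fetching each video's metadata
# (retrying on errors), recording discovered IDs, and saving any published
# caption credits under out/<video ID>/.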
def prrun():
    global recvids
    global recchans
    global recmixes
    global recplayl
    global ccenabledl

    while not jobs.empty():
        item = jobs.get()
        print("Video ID:", str(item).strip())
        while True:
            try:
                info = getmetadata(str(item).strip())
                break
            except BaseException as e:
                print(e)
                print("Error in retrieving information, waiting 30 seconds")
                #raise
                sleep(30)

        # Add any discovered videos
        recvids.update(info[2])
        recchans.update(info[3])
        recmixes.update(info[4])
        recplayl.update(info[5])

        if info[0] or info[1]: # ccenabled or creditdata
            if not isdir("out/"+str(item).strip()):
                mkdir("out/"+str(item).strip())

        if info[1]: # creditdata
            with open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w") as creditfile:
                creditfile.write(dumps(info[1]))

        if info[0]: #ccenabled
            ccenabledl.append(item)
        jobs.task_done()
    return True
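
# worker_version is sent with every request, presumably so the server can
# turn away workers running outdated code.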
# Get a worker ID
while True:
    params = (
        ("worker_version", WORKER_VERSION),
    )
    idrequest = requests.get(SERVER_BASE_URL+"/worker/getID", params=params)

    if idrequest.status_code == 200:
        WORKER_ID = idrequest.text
        break
    else:
        print("Error in retrieving ID, will attempt again in 10 minutes")
        sleep(600)
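
# Main loop: each iteration claims one batch of video IDs from the tracker,
# processes it, and reports it complete.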
while True:
    collect() #cleanup

    try:
        mkdir("out")
    except FileExistsError:
        pass
    # Get a batch ID
    while True:
        params = (
            ("id", WORKER_ID),
            ("worker_version", WORKER_VERSION),
        )
        batchrequest = requests.get(SERVER_BASE_URL+"/worker/getBatch", params=params)

        if batchrequest.status_code == 200:
            batchinfo = batchrequest.json()
            if batchinfo["content"] != "Fail":
                break
        print("Error in retrieving batch assignment, will attempt again in 10 minutes")
        sleep(600)

    print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
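
    # The batch content URL resolves to a newline-separated list of video IDs;
    # 50 threads work through them in parallel.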
    # Process the batch
    batchcontent = requests.get(batchinfo["content"]).text.split("\n")

    while batchcontent:
        jobs.put(batchcontent.pop(0))

    threads = []
    for i in range(50):
        runthread = Thread(target=prrun)
        runthread.start()
        threads.append(runthread)
        del runthread

    # Join every worker thread; removing threads from the list while iterating
    # over it would skip every other one.
    for x in threads:
        x.join()
    threads.clear()
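
    # Persist every ID discovered during this batch; the TODO below notes that
    # these still need to be reported somewhere.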
    with open("out/discoveries.json", "w") as discoveryfile:
        discoveryfile.write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))

    #clear
    recvids.clear()
    recchans.clear()
    recmixes.clear()
    recplayl.clear()
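
    # Queue one (language, video ID) pair for every caption language, for each
    # video that has community captions enabled.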
    subtjobs = Queue()
    while ccenabledl:
        langcontent = langs.copy()
        intvid = ccenabledl.pop(0)

        while langcontent:
            subtjobs.put((langcontent.pop(0), intvid))
        del intvid
        del langcontent
    subthreads = []
    for r in range(50):
        subrunthread = Thread(target=subprrun, args=(subtjobs, headers))
        subrunthread.start()
        subthreads.append(subrunthread)
        del subrunthread

    # Join every subtitle thread before archiving; removing threads from the
    # list while iterating would skip every other one, letting the script move
    # on before the last thread finished (a bug that occurred once).
    for xa in subthreads:
        xa.join()
    subthreads.clear()

    sleep(1) #wait a second to hopefully allow the other threads to finish
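
    # rmdir only removes empty directories, so folders that received caption
    # or credit output survive this cleanup pass.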
    for fol in listdir("out"): #remove extra folders
        try:
            if isdir("out/"+fol):
                rmdir("out/"+fol)
        except OSError: # directory not empty
            pass

    #https://stackoverflow.com/a/11968881
    # TODO: put the data somewhere...
    # TODO: put the discoveries somewhere...

    make_archive("out", "zip", "out") #check this
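
    # Result upload, currently disabled: this would push out.zip to
    # transfersh.com and report the returned URL via the resulturl parameter.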
    # while True:
    #     try:
    #         uploadr = requests.post("https://transfersh.com/"+str(batchinfo["batchID"])+".zip", data=open("out.zip"))
    #         if uploadr.status_code == 200:
    #             resulturl = uploadr.text
    #             break
    #     except BaseException as e:
    #         print(e)
    #         print("Encountered error in uploading results... retrying in 10 minutes")
    #         sleep(600)
    # Report the batch as complete (I can't think of a fail condition except
    # for a worker exiting...)
    # TODO: handle worker exit
    while True:
        params = (
            ("id", WORKER_ID),
            ("worker_version", WORKER_VERSION),
            ("batchID", batchinfo["batchID"]),
            ("randomKey", batchinfo["randomKey"]),
            ("status", "c"), # "c" = complete
            #("resulturl", resulturl),
        )
        statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)

        if statusrequest.status_code == 200 and statusrequest.text == "Success":
            break
        else:
            print("Error in reporting success, will attempt again in 10 minutes")
            sleep(600)
    # Clear the output directory for the next batch (out.zip is left in place
    # and overwritten on the next iteration)
    rmtree("out")