Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 

155 行
4.5 KiB

  1. import requests
  2. from time import sleep
  3. from os import mkdir
  4. from json import dumps
  5. import threading
  6. from shutil import make_archive, rmtree
  7. from discovery import getmetadata
  8. from export import getsubs
  9. WORKER_VERSION = 1
  10. SERVER_BASE_URL = "http://localhost:5000"
  11. class batchthread(threading.Thread):
  12. def run(self):
  13. item = self.getName()
  14. global recvids
  15. global recchans
  16. global recmixes
  17. global recplayl
  18. print("Video ID:", str(item).strip())
  19. while True:
  20. try:
  21. info = getmetadata(str(item).strip())
  22. break
  23. except BaseException as e:
  24. print(e)
  25. print("Error in retrieving information, waiting 30 seconds")
  26. sleep(30)
  27. # Add any discovered videos
  28. recvids.update(info[2])
  29. recchans.update(info[3])
  30. recmixes.update(info[4])
  31. recplayl.update(info[5])
  32. if info[0] or info[1]: # ccenabled or creditdata
  33. mkdir("out/"+str(item).strip())
  34. if info[1]: # creditdata
  35. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  36. if info[0]: #ccenabled
  37. while True:
  38. gsres = False
  39. try:
  40. gsres = getsubs(str(item).strip())
  41. except BaseException as e:
  42. print(e)
  43. if gsres:
  44. break
  45. else:
  46. print("Error in retrieving subtitles, waiting 30 seconds")
  47. sleep(30)
  48. return True
  49. # Get a worker ID
  50. while True:
  51. params = (
  52. ("worker_version", WORKER_VERSION),
  53. )
  54. idrequest = requests.get(SERVER_BASE_URL+"/worker/getID", params=params)
  55. if idrequest.status_code == 200:
  56. WORKER_ID = idrequest.text
  57. break
  58. else:
  59. print("Error in retrieving ID, will attempt again in 10 minutes")
  60. sleep(600)
  61. while True:
  62. try:
  63. mkdir("out")
  64. except:
  65. pass
  66. recvids = set()
  67. recchans = set()
  68. recmixes = set()
  69. recplayl = set()
  70. # Get a batch ID
  71. while True:
  72. params = (
  73. ("id", WORKER_ID),
  74. ("worker_version", WORKER_VERSION),
  75. )
  76. batchrequest = requests.get(SERVER_BASE_URL+"/worker/getBatch", params=params)
  77. if batchrequest.status_code == 200:
  78. batchinfo = batchrequest.json()
  79. break
  80. else:
  81. print("Error in retrieving batch assignment, will attempt again in 10 minutes")
  82. sleep(600)
  83. print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
  84. # Process the batch
  85. batchcontent = requests.get(batchinfo["content"]).text.split("\n")
  86. threads = []
  87. for item in batchcontent:
  88. runthread = batchthread(name = item)
  89. runthread.start()
  90. threads.append(runthread)
  91. for x in threads:
  92. x.join()
  93. #https://stackoverflow.com/a/11968881
  94. # TODO: put the data somewhere...
  95. # TODO: put the discoveries somewhere...
  96. open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  97. make_archive("out.zip", "zip", "out") #check this
  98. # while True:
  99. # try:
  100. # uploadr = requests.post("https://transfersh.com/"+str(batchinfo["batchID"])+".zip", data=open("out.zip"))
  101. # if uploadr.status_code == 200:
  102. # resulturl = uploadr.text
  103. # break
  104. # except BaseException as e:
  105. # print(e)
  106. # print("Encountered error in uploading results... retrying in 10 minutes")
  107. # sleep(600)
  108. # Report the batch as complete (I can't think of a fail condition except for a worker exiting...)
  109. # TODO: handle worker exit
  110. while True:
  111. params = (
  112. ("id", WORKER_ID),
  113. ("worker_version", WORKER_VERSION),
  114. ("batchID", batchinfo["batchID"]),
  115. ("randomKey", batchinfo["randomKey"]),
  116. ("status", "c"),
  117. #("resulturl", resulturl),
  118. )
  119. statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)
  120. if statusrequest.status_code == 200 and statusrequest.text == "Success":
  121. break
  122. else:
  123. print("Error in reporting success, will attempt again in 10 minutes")
  124. sleep(600)
  125. # TODO: clear the output directory
  126. rmtree("out")