Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
You cannot select more than 25 topics. Topics must start with a letter or a number, may include dashes ("-"), and can be up to 35 characters long.
 
 

160 Zeilen
4.7 KiB

  1. import requests
  2. from time import sleep
  3. from os import mkdir
  4. from os.path import isdir
  5. from json import dumps
  6. import threading
  7. from shutil import make_archive, rmtree
  8. from discovery import getmetadata
  9. from export import getsubs
# Protocol version this worker reports to the tracker; the server can use it
# to reject outdated workers (see the /worker/getID and /worker/getBatch calls).
WORKER_VERSION = 1
# Base URL of the coordination/tracker server that hands out work batches.
SERVER_BASE_URL = "http://localhost:5000"
  12. class batchthread(threading.Thread):
  13. def run(self):
  14. item = self.getName()
  15. global recvids
  16. global recchans
  17. global recmixes
  18. global recplayl
  19. print("Video ID:", str(item).strip())
  20. while True:
  21. try:
  22. info = getmetadata(str(item).strip())
  23. break
  24. except BaseException as e:
  25. print(e)
  26. print("Error in retrieving information, waiting 30 seconds")
  27. raise
  28. sleep(30)
  29. # Add any discovered videos
  30. recvids.update(info[2])
  31. recchans.update(info[3])
  32. recmixes.update(info[4])
  33. recplayl.update(info[5])
  34. if info[0] or info[1]: # ccenabled or creditdata
  35. if not isdir("out/"+str(item).strip()):
  36. mkdir("out/"+str(item).strip())
  37. if info[1]: # creditdata
  38. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  39. if info[0]: #ccenabled
  40. while True:
  41. gsres = False
  42. try:
  43. gsres = getsubs(str(item).strip())
  44. except BaseException as e:
  45. print(e)
  46. if gsres:
  47. break
  48. else:
  49. print("Error in retrieving subtitles, waiting 30 seconds")
  50. sleep(30)
  51. return True
  52. # Get a worker ID
  53. while True:
  54. params = (
  55. ("worker_version", WORKER_VERSION),
  56. )
  57. idrequest = requests.get(SERVER_BASE_URL+"/worker/getID", params=params)
  58. if idrequest.status_code == 200:
  59. WORKER_ID = idrequest.text
  60. break
  61. else:
  62. print("Error in retrieving ID, will attempt again in 10 minutes")
  63. sleep(600)
  64. while True:
  65. try:
  66. mkdir("out")
  67. except:
  68. pass
  69. recvids = set()
  70. recchans = set()
  71. recmixes = set()
  72. recplayl = set()
  73. # Get a batch ID
  74. while True:
  75. params = (
  76. ("id", WORKER_ID),
  77. ("worker_version", WORKER_VERSION),
  78. )
  79. batchrequest = requests.get(SERVER_BASE_URL+"/worker/getBatch", params=params)
  80. if batchrequest.status_code == 200:
  81. batchinfo = batchrequest.json()
  82. break
  83. else:
  84. print("Error in retrieving batch assignment, will attempt again in 10 minutes")
  85. sleep(600)
  86. print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
  87. # Process the batch
  88. batchcontent = requests.get(batchinfo["content"]).text.split("\n")
  89. threads = []
  90. while batchcontent:
  91. while len(threads) <= 50 and batchcontent:
  92. item = batchcontent.pop(0)
  93. runthread = batchthread(name = item)
  94. runthread.start()
  95. threads.append(runthread)
  96. for x in threads:
  97. x.join()
  98. threads.remove(x)
  99. #https://stackoverflow.com/a/11968881
  100. # TODO: put the data somewhere...
  101. # TODO: put the discoveries somewhere...
  102. open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  103. make_archive("out", "zip", "out") #check this
  104. # while True:
  105. # try:
  106. # uploadr = requests.post("https://transfersh.com/"+str(batchinfo["batchID"])+".zip", data=open("out.zip"))
  107. # if uploadr.status_code == 200:
  108. # resulturl = uploadr.text
  109. # break
  110. # except BaseException as e:
  111. # print(e)
  112. # print("Encountered error in uploading results... retrying in 10 minutes")
  113. # sleep(600)
  114. # Report the batch as complete (I can't think of a fail condition except for a worker exiting...)
  115. # TODO: handle worker exit
  116. while True:
  117. params = (
  118. ("id", WORKER_ID),
  119. ("worker_version", WORKER_VERSION),
  120. ("batchID", batchinfo["batchID"]),
  121. ("randomKey", batchinfo["randomKey"]),
  122. ("status", "c"),
  123. #("resulturl", resulturl),
  124. )
  125. statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)
  126. if statusrequest.status_code == 200 and statusrequest.text == "Success":
  127. break
  128. else:
  129. print("Error in reporting success, will attempt again in 10 minutes")
  130. sleep(600)
  131. # TODO: clear the output directory
  132. rmtree("out")