Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

155 regels
4.5 KiB

  1. import requests
  2. from time import sleep
  3. from os import mkdir
  4. from json import dumps
  5. import threading
  6. from shutil import make_archive, rmtree
  7. from discovery import getmetadata
  8. from export import getsubs
# Protocol version this worker reports to the server with every request;
# lets the tracker refuse workers that are too old.
WORKER_VERSION = 1
# Base URL of the coordination/tracker server handing out batches.
SERVER_BASE_URL = "http://localhost:5000"
  11. class batchthread(threading.Thread):
  12. def run(self):
  13. item = self.getName()
  14. global recvids
  15. global recchans
  16. global recmixes
  17. global recplayl
  18. print("Video ID:", str(item).strip())
  19. while True:
  20. try:
  21. info = getmetadata(str(item).strip())
  22. break
  23. except BaseException as e:
  24. print(e)
  25. print("Error in retrieving information, waiting 10 minutes")
  26. sleep(600)
  27. # Add any discovered videos
  28. recvids.update(info[2])
  29. recchans.update(info[3])
  30. recmixes.update(info[4])
  31. recplayl.update(info[5])
  32. if info[0] or info[1]: # ccenabled or creditdata
  33. mkdir("out/"+str(item).strip())
  34. if info[1]: # creditdata
  35. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  36. if info[0]: #ccenabled
  37. while True:
  38. gsres = False
  39. try:
  40. gsres = getsubs(str(item).strip())
  41. except BaseException as e:
  42. print(e)
  43. if gsres:
  44. break
  45. else:
  46. print("Error in retrieving subtitles, waiting 10 minutes")
  47. sleep(600)
  48. return True
  49. # Get a worker ID
  50. while True:
  51. params = (
  52. ("worker_version", WORKER_VERSION),
  53. )
  54. idrequest = requests.get(SERVER_BASE_URL+"/worker/getID", params=params)
  55. if idrequest.status_code == 200:
  56. WORKER_ID = idrequest.text
  57. break
  58. else:
  59. print("Error in retrieving ID, will attempt again in 10 minutes")
  60. sleep(600)
  61. while True:
  62. try:
  63. mkdir("out")
  64. except:
  65. pass
  66. recvids = set()
  67. recchans = set()
  68. recmixes = set()
  69. recplayl = set()
  70. # Get a batch ID
  71. while True:
  72. params = (
  73. ("id", WORKER_ID),
  74. ("worker_version", WORKER_VERSION),
  75. )
  76. batchrequest = requests.get(SERVER_BASE_URL+"/worker/getBatch", params=params)
  77. if batchrequest.status_code == 200:
  78. batchinfo = batchrequest.json()
  79. break
  80. else:
  81. print("Error in retrieving batch assignment, will attempt again in 10 minutes")
  82. sleep(600)
  83. print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
  84. # Process the batch
  85. batchcontent = requests.get(batchinfo["content"]).text.split("\n")
  86. threads = []
  87. for item in batchcontent:
  88. runthread = batchthread(name = item)
  89. runthread.start()
  90. threads.append(runthread)
  91. for x in threads:
  92. x.join()
  93. #https://stackoverflow.com/a/11968881
  94. # TODO: put the data somewhere...
  95. # TODO: put the discoveries somewhere...
  96. open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  97. make_archive("out.zip", "zip", "out") #check this
  98. # while True:
  99. # try:
  100. # uploadr = requests.post("https://transfersh.com/"+str(batchinfo["batchID"])+".zip", data=open("out.zip"))
  101. # if uploadr.status_code == 200:
  102. # resulturl = uploadr.text
  103. # break
  104. # except BaseException as e:
  105. # print(e)
  106. # print("Encountered error in uploading results... retrying in 10 minutes")
  107. # sleep(600)
  108. # Report the batch as complete (I can't think of a fail condition except for a worker exiting...)
  109. # TODO: handle worker exit
  110. while True:
  111. params = (
  112. ("id", WORKER_ID),
  113. ("worker_version", WORKER_VERSION),
  114. ("batchID", batchinfo["batchID"]),
  115. ("randomKey", batchinfo["randomKey"]),
  116. ("status", "c"),
  117. #("resulturl", resulturl),
  118. )
  119. statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)
  120. if statusrequest.status_code == 200 and statusrequest.text == "Success":
  121. break
  122. else:
  123. print("Error in reporting success, will attempt again in 10 minutes")
  124. sleep(600)
  125. # TODO: clear the output directory
  126. rmtree("out")