archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 

133 linhas
4.0 KiB

from json import dumps
from os import makedirs, mkdir
from shutil import make_archive, rmtree
from time import sleep

import requests

from discovery import getmetadata
from export import getsubs
  8. WORKER_VERSION = 1
  9. SERVER_BASE_URL = "http://localhost:5000"
  10. # Get a worker ID
  11. while True:
  12. params = (
  13. ("worker_version", WORKER_VERSION),
  14. )
  15. idrequest = requests.get(SERVER_BASE_URL+"/worker/getID", params=params)
  16. if idrequest.status_code == 200:
  17. WORKER_ID = idrequest.text
  18. break
  19. else:
  20. print("Error in retrieving ID, will attempt again in 10 minutes")
  21. sleep(600)
  22. while True:
  23. try:
  24. mkdir("out")
  25. except:
  26. pass
  27. recvids = set()
  28. recchans = set()
  29. recmixes = set()
  30. recplayl = set()
  31. # Get a batch ID
  32. while True:
  33. params = (
  34. ("id", WORKER_ID),
  35. ("worker_version", WORKER_VERSION),
  36. )
  37. batchrequest = requests.get(SERVER_BASE_URL+"/worker/getBatch", params=params)
  38. if batchrequest.status_code == 200:
  39. batchinfo = batchrequest.json()
  40. break
  41. else:
  42. print("Error in retrieving batch assignment, will attempt again in 10 minutes")
  43. sleep(600)
  44. print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
  45. # Process the batch
  46. batchcontent = requests.get(batchinfo["content"]).text.split("\n")
  47. for item in batchcontent:
  48. print("Video ID:", str(item).strip())
  49. while True:
  50. try:
  51. info = getmetadata(str(item).strip())
  52. break
  53. except BaseException as e:
  54. print(e)
  55. print("Error in retrieving information, waiting 10 minutes")
  56. sleep(600)
  57. # Add any discovered videos
  58. recvids.update(info[2])
  59. recchans.update(info[3])
  60. recmixes.update(info[4])
  61. recplayl.update(info[5])
  62. if info[0] or info[1]: # ccenabled or creditdata
  63. mkdir("out/"+str(item).strip())
  64. if info[1]: # creditdata
  65. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  66. if info[0]: #ccenabled
  67. while True:
  68. gsres = False
  69. try:
  70. gsres = getsubs(str(item).strip())
  71. except BaseException as e:
  72. print(e)
  73. if gsres:
  74. break
  75. else:
  76. print("Error in retrieving subtitles, waiting 10 minutes")
  77. sleep(600)
  78. # TODO: put the data somewhere...
  79. # TODO: put the discoveries somewhere...
  80. open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  81. make_archive("out.zip", "zip", "out") #check this
  82. while True:
  83. try:
  84. uploadr = requests.post("https://transfersh.com/"+str(batchinfo["batchID"])+".zip", data=open("out.zip"))
  85. if uploadr.status_code == 200:
  86. resulturl = uploadr.text
  87. break
  88. except BaseException as e:
  89. print(e)
  90. print("Encountered error in uploading results... retrying in 10 minutes")
  91. sleep(600)
  92. # Report the batch as complete (I can't think of a fail condition except for a worker exiting...)
  93. # TODO: handle worker exit
  94. while True:
  95. params = (
  96. ("id", WORKER_ID),
  97. ("worker_version", WORKER_VERSION),
  98. ("batchID", batchinfo["batchID"]),
  99. ("randomKey", batchinfo["randomKey"]),
  100. ("status", "c"),
  101. ("resulturl", resulturl),
  102. )
  103. statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)
  104. if statusrequest.status_code == 200 and statusrequest.text == "Success":
  105. break
  106. else:
  107. print("Error in reporting success, will attempt again in 10 minutes")
  108. sleep(600)
  109. # TODO: clear the output directory
  110. rmtree("out")