archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 

166 linhas
4.8 KiB

  1. from typing import Optional, List
  2. from enum import Enum, auto
  3. import requests
  4. # TODO: Implement backoff for 500 response codes
  5. # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
  6. VERSION = "20200921.01"
  7. TRACKER_ID = "ext-yt-communitycontribs"
  8. TRACKER_HOST = "trackerproxy.meo.ws"
  9. BACKFEED_HOST = "blackbird-amqp.meo.ws:23038"
  10. BACKFEED_ENDPOINT = f"http://{BACKFEED_HOST}/{TRACKER_ID}-kj57sxhhzcn2kqjp/"
  11. TRACKER_ENDPOINT = f"http://{TRACKER_HOST}/{TRACKER_ID}"
  12. class ItemType(Enum):
  13. Video = auto()
  14. Channel = auto()
  15. MixPlaylist = auto()
  16. Playlist = auto()
  17. def add_item_to_tracker(item_type: ItemType, item_id: str) -> bool:
  18. """Feed items into the tracker through backfeed (item names will be deduplicated):
  19. # curl -d 'ITEMNAME' -so/dev/null $amqp_endpoint
  20. # Response codes:
  21. # 200 - Item added to tracker
  22. # 409 - Item is already in tracker
  23. # 404 - Project backfeed channel not found
  24. # 400 - Item name has a bad format
  25. """
  26. type_name = item_type.name.lower()
  27. item_name = f"{type_name}:{item_id}"
  28. req = requests.post(BACKFEED_ENDPOINT, data=item_name)
  29. code = req.status_code
  30. if code == 200:
  31. print(f"[INFO] Item ID \'{item_name}\' added to tracker successfully")
  32. return True
  33. elif code == 409:
  34. print(f"[INFO] Item ID \'{item_name}\' has already been added to tracker")
  35. return True
  36. elif code == 404:
  37. print(f"[ERROR] Unable to add item ID \'{item_name}\' to tracker. Project backfeed channel not found: {BACKFEED_ENDPOINT}")
  38. elif code == 400:
  39. print(f"[ERROR] Item ID \'{item_name}\' has a bad format")
  40. else:
  41. print(f"[ERROR] Unknown response code adding item \'{item_name}\' to tracker: {code}")
  42. return False
  43. def request_item_from_tracker() -> Optional[str]:
  44. data = {
  45. # TODO: Ask Fusl what this should be
  46. # https://www.archiveteam.org/index.php?title=Dev/Seesaw
  47. # ^ says it would be filled in by the Seesaw library
  48. "downloader": "Fusl",
  49. "api_version": "2",
  50. "version": VERSION
  51. }
  52. req = requests.post(f"{TRACKER_ENDPOINT}/request", json=data)
  53. code = req.status_code
  54. if code == 200:
  55. data = req.json()
  56. if "item_name" in data:
  57. item_name = data["item_name"]
  58. print(f"[INFO] Received an item from tracker: {item_name}")
  59. return item_name
  60. else:
  61. print(f"[ERROR] Received item is missing the \'item_name\' key: {data}")
  62. else:
  63. print(f"[ERROR] Unable to get an item from tracker. Status: {code}")
  64. def request_upload_target() -> Optional[str]:
  65. req = requests.get(
  66. # "https://httpbin.org/get",
  67. f"{TRACKER_ENDPOINT}/upload",
  68. )
  69. code = req.status_code
  70. if code == 200:
  71. data = req.json()
  72. if "upload_target" in data:
  73. upload_target = data["upload_target"]
  74. print(f"[INFO] Received an upload target from tracker: {upload_target}")
  75. return upload_target
  76. else:
  77. print(f"[ERROR] Response is missing the \'upload_target\' key: {data}")
  78. else:
  79. print(f"[ERROR] Unable to get an upload target from tracker. Status: {code}")
  80. def request_all_upload_targets() -> Optional[List[str]]:
  81. req = requests.get(
  82. # "https://httpbin.org/get",
  83. f"{TRACKER_ENDPOINT}/upload",
  84. )
  85. code = req.status_code
  86. if code == 200:
  87. data = req.json()
  88. print(f"[INFO] Received all upload targets from tracker: {data}")
  89. return data
  90. else:
  91. print(f"[ERROR] Unable to get all upload targets from tracker. Status: {code}")
  92. # `item_name` includes type prefix (video:id, playlist:id, etc)
  93. def mark_item_as_done(item_name: str, item_size_bytes: int) -> bool:
  94. data = {
  95. # TODO: Ask Fusl what this should be
  96. # https://www.archiveteam.org/index.php?title=Dev/Seesaw
  97. # ^ says it would be filled in by the Seesaw library
  98. "downloader": "Fusl",
  99. "version": VERSION,
  100. "item": item_name,
  101. "bytes": {
  102. "data": item_size_bytes
  103. }
  104. }
  105. req = requests.post(f"{TRACKER_ENDPOINT}/done", json=data)
  106. code = req.status_code
  107. if code == 200:
  108. print(f"[INFO] Marked item \'{item_name}\' as done")
  109. return True
  110. elif code > 399 and code < 500:
  111. print(f"[ERROR] Unable to mark item as done. Status: {code}")
  112. elif code > 499 and code < 600:
  113. # TODO: retry here
  114. pass
  115. else:
  116. print(f"[ERROR] Unknown response code while marking item \'{item_name}\' as done: {code}")
  117. return False
  118. if __name__ == "__main__":
  119. # print(add_item_to_tracker(ItemType.Channel, "test6"))
  120. # print(request_item_from_tracker())
  121. # print(request_upload_target())
  122. # print(request_all_upload_targets())
  123. # print(mark_item_as_done("test4", 200))