archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 

174 linhas
5.0 KiB

  1. from typing import Optional, List
  2. from enum import Enum, auto
  3. import requests
  4. # TODO: Implement backoff for 500 response codes
  5. # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
  6. VERSION = "20200921.01"
  7. TRACKER_ID = "ext-yt-communitycontribs"
  8. TRACKER_HOST = "trackerproxy.meo.ws"
  9. BACKFEED_HOST = "blackbird-amqp.meo.ws:23038"
  10. BACKFEED_ENDPOINT = f"http://{BACKFEED_HOST}/{TRACKER_ID}-kj57sxhhzcn2kqjp/"
  11. TRACKER_ENDPOINT = f"http://{TRACKER_HOST}/{TRACKER_ID}"
  12. from os import environ
  13. if "TRACKER_USERNAME" in environ.keys():
  14. TRACKER_USERNAME = environ["TRACKER_USERNAME"]
  15. else:
  16. TRACKER_USERNAME = "Unnamed"
  17. mysession = requests.session()
  18. class ItemType(Enum):
  19. Video = auto()
  20. Channel = auto()
  21. MixPlaylist = auto()
  22. Playlist = auto()
  23. def add_item_to_tracker(item_type: ItemType, item_id: str) -> bool:
  24. """Feed items into the tracker through backfeed (item names will be deduplicated):
  25. # curl -d 'ITEMNAME' -so/dev/null $amqp_endpoint
  26. # Response codes:
  27. # 200 - Item added to tracker
  28. # 409 - Item is already in tracker
  29. # 404 - Project backfeed channel not found
  30. # 400 - Item name has a bad format
  31. """
  32. type_name = item_type.name.lower()
  33. item_name = f"{type_name}:{item_id}"
  34. req = mysession.post(BACKFEED_ENDPOINT, data=item_name)
  35. code = req.status_code
  36. if code == 200:
  37. print(f"[INFO] Item ID \'{item_name}\' added to tracker successfully")
  38. return True
  39. elif code == 409:
  40. print(f"[INFO] Item ID \'{item_name}\' has already been added to tracker")
  41. return True
  42. elif code == 404:
  43. print(f"[ERROR] Unable to add item ID \'{item_name}\' to tracker. Project backfeed channel not found: {BACKFEED_ENDPOINT}")
  44. elif code == 400:
  45. print(f"[ERROR] Item ID \'{item_name}\' has a bad format")
  46. else:
  47. print(f"[ERROR] Unknown response code adding item \'{item_name}\' to tracker: {code}")
  48. return False
  49. def request_item_from_tracker() -> Optional[str]:
  50. data = {
  51. # TODO: Ask Fusl what this should be
  52. # https://www.archiveteam.org/index.php?title=Dev/Seesaw
  53. # ^ says it would be filled in by the Seesaw library
  54. "downloader": TRACKER_USERNAME,
  55. "api_version": "2",
  56. "version": VERSION
  57. }
  58. req = mysession.post(f"{TRACKER_ENDPOINT}/request", json=data)
  59. code = req.status_code
  60. if code == 200:
  61. data = req.json()
  62. if "item_name" in data:
  63. item_name = data["item_name"]
  64. print(f"[INFO] Received an item from tracker: {item_name}")
  65. return item_name
  66. else:
  67. print(f"[ERROR] Received item is missing the \'item_name\' key: {data}")
  68. else:
  69. print(f"[ERROR] Unable to get an item from tracker. Status: {code}")
  70. def request_upload_target() -> Optional[str]:
  71. req = mysession.get(
  72. # "https://httpbin.org/get",
  73. f"{TRACKER_ENDPOINT}/upload",
  74. )
  75. code = req.status_code
  76. if code == 200:
  77. data = req.json()
  78. if "upload_target" in data:
  79. upload_target = data["upload_target"]
  80. print(f"[INFO] Received an upload target from tracker: {upload_target}")
  81. return upload_target
  82. else:
  83. print(f"[ERROR] Response is missing the \'upload_target\' key: {data}")
  84. else:
  85. print(f"[ERROR] Unable to get an upload target from tracker. Status: {code}")
  86. def request_all_upload_targets() -> Optional[List[str]]:
  87. req = mysession.get(
  88. # "https://httpbin.org/get",
  89. f"{TRACKER_ENDPOINT}/upload",
  90. )
  91. code = req.status_code
  92. if code == 200:
  93. data = req.json()
  94. print(f"[INFO] Received all upload targets from tracker: {data}")
  95. return data
  96. else:
  97. print(f"[ERROR] Unable to get all upload targets from tracker. Status: {code}")
  98. # `item_name` includes type prefix (video:id, playlist:id, etc)
  99. def mark_item_as_done(item_name: str, item_size_bytes: int) -> bool:
  100. data = {
  101. # TODO: Ask Fusl what this should be
  102. # https://www.archiveteam.org/index.php?title=Dev/Seesaw
  103. # ^ says it would be filled in by the Seesaw library
  104. "downloader": TRACKER_USERNAME,
  105. "version": VERSION,
  106. "item": item_name,
  107. "bytes": {
  108. "data": item_size_bytes
  109. }
  110. }
  111. req = mysession.post(f"{TRACKER_ENDPOINT}/done", json=data)
  112. code = req.status_code
  113. if code == 200:
  114. print(f"[INFO] Marked item \'{item_name}\' as done")
  115. return True
  116. elif code > 399 and code < 500:
  117. print(f"[ERROR] Unable to mark item as done. Status: {code}")
  118. elif code > 499 and code < 600:
  119. # TODO: retry here
  120. pass
  121. else:
  122. print(f"[ERROR] Unknown response code while marking item \'{item_name}\' as done: {code}")
  123. return False
  124. # if __name__ == "__main__":
  125. # print(add_item_to_tracker(ItemType.Channel, "test6"))
  126. # print(request_item_from_tracker())
  127. # print(request_upload_target())
  128. # print(request_all_upload_targets())
  129. # print(mark_item_as_done("test4", 200))