archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

174 lines
5.0 KiB

  1. from typing import Optional, List
  2. from enum import Enum, auto
  3. import requests
  4. # TODO: Implement backoff for 500 response codes
  5. # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
  6. VERSION = "20200921.01"
  7. TRACKER_ID = "ext-yt-communitycontribs"
  8. TRACKER_HOST = "trackerproxy.meo.ws"
  9. BACKFEED_HOST = "blackbird-amqp.meo.ws:23038"
  10. BACKFEED_ENDPOINT = f"http://{BACKFEED_HOST}/{TRACKER_ID}-kj57sxhhzcn2kqjp/"
  11. TRACKER_ENDPOINT = f"http://{TRACKER_HOST}/{TRACKER_ID}"
  12. from os import environ
  13. if "TRACKER_USERNAME" in environ.keys():
  14. TRACKER_USERNAME = environ["TRACKER_USERNAME"]
  15. else:
  16. TRACKER_USERNAME = "Unnamed"
  17. mysession = requests.session()
  18. class ItemType(Enum):
  19. Video = auto()
  20. Channel = auto()
  21. MixPlaylist = auto()
  22. Playlist = auto()
  23. def add_item_to_tracker(item_type: ItemType, item_id: str) -> bool:
  24. """Feed items into the tracker through backfeed (item names will be deduplicated):
  25. # curl -d 'ITEMNAME' -so/dev/null $amqp_endpoint
  26. # Response codes:
  27. # 200 - Item added to tracker
  28. # 409 - Item is already in tracker
  29. # 404 - Project backfeed channel not found
  30. # 400 - Item name has a bad format
  31. """
  32. type_name = item_type.name.lower()
  33. item_name = f"{type_name}:{item_id}"
  34. req = mysession.post(BACKFEED_ENDPOINT, data=item_name)
  35. code = req.status_code
  36. if code == 200:
  37. print(f"[INFO] Item ID \'{item_name}\' added to tracker successfully")
  38. return True
  39. elif code == 409:
  40. print(f"[INFO] Item ID \'{item_name}\' has already been added to tracker")
  41. return True
  42. elif code == 404:
  43. print(f"[ERROR] Unable to add item ID \'{item_name}\' to tracker. Project backfeed channel not found: {BACKFEED_ENDPOINT}")
  44. elif code == 400:
  45. print(f"[ERROR] Item ID \'{item_name}\' has a bad format")
  46. else:
  47. print(f"[ERROR] Unknown response code adding item \'{item_name}\' to tracker: {code}")
  48. return False
  49. def request_item_from_tracker() -> Optional[str]:
  50. data = {
  51. # TODO: Ask Fusl what this should be
  52. # https://www.archiveteam.org/index.php?title=Dev/Seesaw
  53. # ^ says it would be filled in by the Seesaw library
  54. "downloader": TRACKER_USERNAME,
  55. "api_version": "2",
  56. "version": VERSION
  57. }
  58. req = mysession.post(f"{TRACKER_ENDPOINT}/request", json=data)
  59. code = req.status_code
  60. if code == 200:
  61. data = req.json()
  62. if "item_name" in data:
  63. item_name = data["item_name"]
  64. print(f"[INFO] Received an item from tracker: {item_name}")
  65. return item_name
  66. else:
  67. print(f"[ERROR] Received item is missing the \'item_name\' key: {data}")
  68. else:
  69. print(f"[ERROR] Unable to get an item from tracker. Status: {code}")
  70. def request_upload_target() -> Optional[str]:
  71. req = mysession.get(
  72. # "https://httpbin.org/get",
  73. f"{TRACKER_ENDPOINT}/upload",
  74. )
  75. code = req.status_code
  76. if code == 200:
  77. data = req.json()
  78. if "upload_target" in data:
  79. upload_target = data["upload_target"]
  80. print(f"[INFO] Received an upload target from tracker: {upload_target}")
  81. return upload_target
  82. else:
  83. print(f"[ERROR] Response is missing the \'upload_target\' key: {data}")
  84. else:
  85. print(f"[ERROR] Unable to get an upload target from tracker. Status: {code}")
  86. def request_all_upload_targets() -> Optional[List[str]]:
  87. req = mysession.get(
  88. # "https://httpbin.org/get",
  89. f"{TRACKER_ENDPOINT}/upload",
  90. )
  91. code = req.status_code
  92. if code == 200:
  93. data = req.json()
  94. print(f"[INFO] Received all upload targets from tracker: {data}")
  95. return data
  96. else:
  97. print(f"[ERROR] Unable to get all upload targets from tracker. Status: {code}")
  98. # `item_name` includes type prefix (video:id, playlist:id, etc)
  99. def mark_item_as_done(item_name: str, item_size_bytes: int) -> bool:
  100. data = {
  101. # TODO: Ask Fusl what this should be
  102. # https://www.archiveteam.org/index.php?title=Dev/Seesaw
  103. # ^ says it would be filled in by the Seesaw library
  104. "downloader": TRACKER_USERNAME,
  105. "version": VERSION,
  106. "item": item_name,
  107. "bytes": {
  108. "data": item_size_bytes
  109. }
  110. }
  111. req = mysession.post(f"{TRACKER_ENDPOINT}/done", json=data)
  112. code = req.status_code
  113. if code == 200:
  114. print(f"[INFO] Marked item \'{item_name}\' as done")
  115. return True
  116. elif code > 399 and code < 500:
  117. print(f"[ERROR] Unable to mark item as done. Status: {code}")
  118. elif code > 499 and code < 600:
  119. # TODO: retry here
  120. pass
  121. else:
  122. print(f"[ERROR] Unknown response code while marking item \'{item_name}\' as done: {code}")
  123. return False
  124. # if __name__ == "__main__":
  125. # print(add_item_to_tracker(ItemType.Channel, "test6"))
  126. # print(request_item_from_tracker())
  127. # print(request_upload_target())
  128. # print(request_all_upload_targets())
  129. # print(mark_item_as_done("test4", 200))