archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 

168 lignes
4.9 KiB

  1. from typing import Optional, List
  2. from enum import Enum, auto
  3. import requests
  4. # TODO: Implement backoff for 500 response codes
  5. # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
  6. VERSION = "20200921.01"
  7. TRACKER_ID = "ext-yt-communitycontribs"
  8. TRACKER_HOST = "trackerproxy.meo.ws"
  9. BACKFEED_HOST = "blackbird-amqp.meo.ws:23038"
  10. BACKFEED_ENDPOINT = f"http://{BACKFEED_HOST}/{TRACKER_ID}-kj57sxhhzcn2kqjp/"
  11. TRACKER_ENDPOINT = f"http://{TRACKER_HOST}/{TRACKER_ID}"
  12. mysession = requests.session()
  13. class ItemType(Enum):
  14. Video = auto()
  15. Channel = auto()
  16. MixPlaylist = auto()
  17. Playlist = auto()
  18. def add_item_to_tracker(item_type: ItemType, item_id: str) -> bool:
  19. """Feed items into the tracker through backfeed (item names will be deduplicated):
  20. # curl -d 'ITEMNAME' -so/dev/null $amqp_endpoint
  21. # Response codes:
  22. # 200 - Item added to tracker
  23. # 409 - Item is already in tracker
  24. # 404 - Project backfeed channel not found
  25. # 400 - Item name has a bad format
  26. """
  27. type_name = item_type.name.lower()
  28. item_name = f"{type_name}:{item_id}"
  29. req = mysession.post(BACKFEED_ENDPOINT, data=item_name)
  30. code = req.status_code
  31. if code == 200:
  32. print(f"[INFO] Item ID \'{item_name}\' added to tracker successfully")
  33. return True
  34. elif code == 409:
  35. print(f"[INFO] Item ID \'{item_name}\' has already been added to tracker")
  36. return True
  37. elif code == 404:
  38. print(f"[ERROR] Unable to add item ID \'{item_name}\' to tracker. Project backfeed channel not found: {BACKFEED_ENDPOINT}")
  39. elif code == 400:
  40. print(f"[ERROR] Item ID \'{item_name}\' has a bad format")
  41. else:
  42. print(f"[ERROR] Unknown response code adding item \'{item_name}\' to tracker: {code}")
  43. return False
  44. def request_item_from_tracker() -> Optional[str]:
  45. data = {
  46. # TODO: Ask Fusl what this should be
  47. # https://www.archiveteam.org/index.php?title=Dev/Seesaw
  48. # ^ says it would be filled in by the Seesaw library
  49. "downloader": "Fusl",
  50. "api_version": "2",
  51. "version": VERSION
  52. }
  53. req = mysession.post(f"{TRACKER_ENDPOINT}/request", json=data)
  54. code = req.status_code
  55. if code == 200:
  56. data = req.json()
  57. if "item_name" in data:
  58. item_name = data["item_name"]
  59. print(f"[INFO] Received an item from tracker: {item_name}")
  60. return item_name
  61. else:
  62. print(f"[ERROR] Received item is missing the \'item_name\' key: {data}")
  63. else:
  64. print(f"[ERROR] Unable to get an item from tracker. Status: {code}")
  65. def request_upload_target() -> Optional[str]:
  66. req = mysession.get(
  67. # "https://httpbin.org/get",
  68. f"{TRACKER_ENDPOINT}/upload",
  69. )
  70. code = req.status_code
  71. if code == 200:
  72. data = req.json()
  73. if "upload_target" in data:
  74. upload_target = data["upload_target"]
  75. print(f"[INFO] Received an upload target from tracker: {upload_target}")
  76. return upload_target
  77. else:
  78. print(f"[ERROR] Response is missing the \'upload_target\' key: {data}")
  79. else:
  80. print(f"[ERROR] Unable to get an upload target from tracker. Status: {code}")
  81. def request_all_upload_targets() -> Optional[List[str]]:
  82. req = mysession.get(
  83. # "https://httpbin.org/get",
  84. f"{TRACKER_ENDPOINT}/upload",
  85. )
  86. code = req.status_code
  87. if code == 200:
  88. data = req.json()
  89. print(f"[INFO] Received all upload targets from tracker: {data}")
  90. return data
  91. else:
  92. print(f"[ERROR] Unable to get all upload targets from tracker. Status: {code}")
  93. # `item_name` includes type prefix (video:id, playlist:id, etc)
  94. def mark_item_as_done(item_name: str, item_size_bytes: int) -> bool:
  95. data = {
  96. # TODO: Ask Fusl what this should be
  97. # https://www.archiveteam.org/index.php?title=Dev/Seesaw
  98. # ^ says it would be filled in by the Seesaw library
  99. "downloader": "Fusl",
  100. "version": VERSION,
  101. "item": item_name,
  102. "bytes": {
  103. "data": item_size_bytes
  104. }
  105. }
  106. req = mysession.post(f"{TRACKER_ENDPOINT}/done", json=data)
  107. code = req.status_code
  108. if code == 200:
  109. print(f"[INFO] Marked item \'{item_name}\' as done")
  110. return True
  111. elif code > 399 and code < 500:
  112. print(f"[ERROR] Unable to mark item as done. Status: {code}")
  113. elif code > 499 and code < 600:
  114. # TODO: retry here
  115. pass
  116. else:
  117. print(f"[ERROR] Unknown response code while marking item \'{item_name}\' as done: {code}")
  118. return False
  119. # if __name__ == "__main__":
  120. # print(add_item_to_tracker(ItemType.Channel, "test6"))
  121. # print(request_item_from_tracker())
  122. # print(request_upload_target())
  123. # print(request_all_upload_targets())
  124. # print(mark_item_as_done("test4", 200))