archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 

166 linhas
4.8 KiB

  1. from typing import Optional, List
  2. from enum import Enum, auto
  3. import requests
  4. from requests.adapters import HTTPAdapter
  5. from requests.packages.urllib3.util.retry import Retry
  6. # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
  7. VERSION = "20200921.01"
  8. TRACKER_ID = "ext-yt-communitycontribs"
  9. TRACKER_HOST = "trackerproxy.meo.ws"
  10. BACKFEED_HOST = "blackbird-amqp.meo.ws:23038"
  11. BACKFEED_ENDPOINT = f"http://{BACKFEED_HOST}/{TRACKER_ID}-kj57sxhhzcn2kqjp/"
  12. TRACKER_ENDPOINT = f"http://{TRACKER_HOST}/{TRACKER_ID}"
  13. # https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/
  14. retry_strategy = Retry(
  15. total=4,
  16. backoff_factor=2,
  17. status_forcelist=[x for x in range(500, 600)] + [429],
  18. method_whitelist=["GET", "POST"]
  19. )
  20. adapter = HTTPAdapter(max_retries=retry_strategy)
  21. tracker_session = requests.Session()
  22. tracker_session.mount("https://", adapter)
  23. tracker_session.mount("http://", adapter)
  24. class ItemType(Enum):
  25. Video = auto()
  26. Channel = auto()
  27. MixPlaylist = auto()
  28. Playlist = auto()
  29. def add_item_to_tracker(item_type: ItemType, item_id: str) -> bool:
  30. """Feed items into the tracker through backfeed (item names will be deduplicated):
  31. # curl -d 'ITEMNAME' -so/dev/null $amqp_endpoint
  32. # Response codes:
  33. # 200 - Item added to tracker
  34. # 409 - Item is already in tracker
  35. # 404 - Project backfeed channel not found
  36. # 400 - Item name has a bad format
  37. """
  38. type_name = item_type.name.lower()
  39. item_name = f"{type_name}:{item_id}"
  40. req = tracker_session.post(BACKFEED_ENDPOINT, data=item_name)
  41. code = req.status_code
  42. if code == 200:
  43. print(f"[INFO] Item ID \'{item_name}\' added to tracker successfully")
  44. return True
  45. elif code == 409:
  46. print(f"[INFO] Item ID \'{item_name}\' has already been added to tracker")
  47. return True
  48. elif code == 404:
  49. print(f"[ERROR] Unable to add item ID \'{item_name}\' to tracker. Project backfeed channel not found: {BACKFEED_ENDPOINT}")
  50. elif code == 400:
  51. print(f"[ERROR] Item ID \'{item_name}\' has a bad format")
  52. else:
  53. print(f"[ERROR] Unknown response code adding item \'{item_name}\' to tracker: {code}")
  54. return False
  55. def request_item_from_tracker() -> Optional[str]:
  56. data = {
  57. "downloader": "Fusl",
  58. "api_version": "2",
  59. "version": VERSION
  60. }
  61. req = tracker_session.post(f"{TRACKER_ENDPOINT}/request", json=data)
  62. code = req.status_code
  63. if code == 200:
  64. data = req.json()
  65. if "item_name" in data:
  66. item_name = data["item_name"]
  67. print(f"[INFO] Received an item from tracker: {item_name}")
  68. return item_name
  69. else:
  70. print(f"[ERROR] Received item is missing the \'item_name\' key: {data}")
  71. else:
  72. print(f"[ERROR] Unable to get an item from tracker. Status: {code}")
  73. def request_upload_target() -> Optional[str]:
  74. req = tracker_session.get(f"{TRACKER_ENDPOINT}/upload")
  75. code = req.status_code
  76. if code == 200:
  77. data = req.json()
  78. if "upload_target" in data:
  79. upload_target = data["upload_target"]
  80. print(f"[INFO] Received an upload target from tracker: {upload_target}")
  81. return upload_target
  82. else:
  83. print(f"[ERROR] Response is missing the \'upload_target\' key: {data}")
  84. else:
  85. print(f"[ERROR] Unable to get an upload target from tracker. Status: {code}")
  86. def request_all_upload_targets() -> Optional[List[str]]:
  87. req = tracker_session.get(f"{TRACKER_ENDPOINT}/upload")
  88. code = req.status_code
  89. if code == 200:
  90. data = req.json()
  91. print(f"[INFO] Received all upload targets from tracker: {data}")
  92. return data
  93. else:
  94. print(f"[ERROR] Unable to get all upload targets from tracker. Status: {code}")
  95. # `item_name` includes type prefix (video:id, playlist:id, etc)
  96. def mark_item_as_done(item_name: str, item_size_bytes: int) -> bool:
  97. data = {
  98. "downloader": "Fusl",
  99. "version": VERSION,
  100. "item": item_name,
  101. "bytes": {
  102. "data": item_size_bytes
  103. }
  104. }
  105. req = tracker_session.post(f"{TRACKER_ENDPOINT}/done", json=data)
  106. code = req.status_code
  107. if code == 200:
  108. print(f"[INFO] Marked item \'{item_name}\' as done")
  109. return True
  110. elif code > 399 and code < 500:
  111. print(f"[ERROR] Unable to mark item as done. Status: {code}")
  112. else:
  113. print(f"[ERROR] Unknown response code while marking item \'{item_name}\' as done: {code}")
  114. return False
  115. # if __name__ == "__main__":
  116. # print(add_item_to_tracker(ItemType.Channel, "test10"))
  117. # print(request_item_from_tracker())
  118. # print(request_upload_target())
  119. # print(request_all_upload_targets())
  120. # print(mark_item_as_done("test4", 200))