From 6c657f42d20aad332b51dffa9d3f6af7081faf30 Mon Sep 17 00:00:00 2001 From: tech234a <46801700+tech234a@users.noreply.github.com> Date: Mon, 21 Sep 2020 12:24:20 -0400 Subject: [PATCH 1/2] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 10fad22..369befb 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # YouTube Community Contributions Archiving Worker -Export YouTube community-contributed captioning drafts to SBV files. Export YouTube community-contributed titles and descriptions to JSON (coming soon). +Export YouTube community-contributed captioning drafts to SBV files. Export YouTube community-contributed titles and descriptions to JSON. Export published caption credits to JSON. ## Setup Install the requirements in the requirements.txt file (`pip install -r requirements.txt`). Because the captioning editor is only available to logged-in users, you must specify the values of three session cookies for any Google account (`HSID`, `SSID`, and `SID`). You can get these cookie values by opening the developer tools on any youtube.com webpage, going to the "Application" (Chrome) or "Storage" (Firefox) tab, selecting "Cookies", and copying the required values. 
# Contents of tracker.py as added by patch 2/2, "Implement initial tracker API".
"""Helpers for talking to the project tracker and its backfeed endpoint over HTTP.

Every function logs its outcome with ``print`` and signals failure through its
return value (``False`` / ``None``).  Network-level errors raised by
``requests`` (connection failures, timeouts) are not caught here and propagate
to the caller.
"""
from typing import Optional, List
from enum import Enum, auto
import requests

# TODO: Implement backoff for 500 response codes

# https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
VERSION = "20200921.01"

TRACKER_ID = "ext-yt-communitycontribs"
TRACKER_HOST = "trackerproxy.meo.ws"

BACKFEED_HOST = "blackbird-amqp.meo.ws:23038"

BACKFEED_ENDPOINT = f"http://{BACKFEED_HOST}/{TRACKER_ID}-kj57sxhhzcn2kqjp/"
TRACKER_ENDPOINT = f"http://{TRACKER_HOST}/{TRACKER_ID}"

# Seconds to wait for the tracker before giving up.  Without a timeout,
# `requests` waits indefinitely and a stalled tracker would hang the worker.
REQUEST_TIMEOUT = 60


class ItemType(Enum):
    """Kinds of items that can be fed to the tracker.

    The lower-cased member name is used as the item-name prefix
    (for example ``video:<id>``).
    """

    Video = auto()
    Channel = auto()
    MixPlaylist = auto()
    Playlist = auto()


def add_item_to_tracker(item_type: ItemType, item_id: str) -> bool:
    """Feed an item into the tracker through backfeed.

    Item names are deduplicated by the backfeed endpoint.  Equivalent to::

        curl -d 'ITEMNAME' -so/dev/null $amqp_endpoint

    Response codes:
        200 - Item added to tracker
        409 - Item is already in tracker
        404 - Project backfeed channel not found
        400 - Item name has a bad format

    Args:
        item_type: Kind of item; its lower-cased name becomes the prefix.
        item_id: Identifier appended after the ``<type>:`` prefix.

    Returns:
        True if the item is in the tracker afterwards (200 or 409),
        False for every other response code.
    """
    type_name = item_type.name.lower()
    item_name = f"{type_name}:{item_id}"

    resp = requests.post(BACKFEED_ENDPOINT, data=item_name, timeout=REQUEST_TIMEOUT)
    code = resp.status_code

    if code == 200:
        print(f"[INFO] Item ID '{item_name}' added to tracker successfully")
        return True
    if code == 409:
        # Already queued counts as success: the item is in the tracker.
        print(f"[INFO] Item ID '{item_name}' has already been added to tracker")
        return True

    if code == 404:
        print(f"[ERROR] Unable to add item ID '{item_name}' to tracker. Project backfeed channel not found: {BACKFEED_ENDPOINT}")
    elif code == 400:
        print(f"[ERROR] Item ID '{item_name}' has a bad format")
    else:
        print(f"[ERROR] Unknown response code adding item '{item_name}' to tracker: {code}")

    return False


def request_item_from_tracker() -> Optional[str]:
    """Ask the tracker for the next item to work on.

    Returns:
        The ``item_name`` value from the tracker's JSON response, or None if
        the request failed, the body was not JSON, or the key was missing.
    """
    payload = {
        # TODO: Ask Fusl what this should be
        # https://www.archiveteam.org/index.php?title=Dev/Seesaw
        # ^ says it would be filled in by the Seesaw library
        "downloader": "Fusl",
        "api_version": "2",
        "version": VERSION
    }

    resp = requests.post(f"{TRACKER_ENDPOINT}/request", json=payload, timeout=REQUEST_TIMEOUT)
    code = resp.status_code

    if code != 200:
        print(f"[ERROR] Unable to get an item from tracker. Status: {code}")
        return None

    try:
        data = resp.json()
    except ValueError:
        # A 200 with a non-JSON body is a failure, not a crash.
        print("[ERROR] Tracker response for item request is not valid JSON")
        return None

    # Guard against JSON that is not an object (null, number, ...), on which
    # a bare `in` test would raise TypeError.
    if isinstance(data, dict) and "item_name" in data:
        item_name = data["item_name"]
        print(f"[INFO] Received an item from tracker: {item_name}")
        return item_name

    print(f"[ERROR] Received item is missing the 'item_name' key: {data}")
    return None


def request_upload_target() -> Optional[str]:
    """Ask the tracker where finished items should be uploaded.

    Returns:
        The ``upload_target`` value from the tracker's JSON response, or None
        if the request failed, the body was not JSON, or the key was missing.
    """
    resp = requests.get(
        # "https://httpbin.org/get",
        f"{TRACKER_ENDPOINT}/upload",
        timeout=REQUEST_TIMEOUT,
    )
    code = resp.status_code

    if code != 200:
        print(f"[ERROR] Unable to get an upload target from tracker. Status: {code}")
        return None

    try:
        data = resp.json()
    except ValueError:
        print("[ERROR] Tracker response for upload target is not valid JSON")
        return None

    if isinstance(data, dict) and "upload_target" in data:
        upload_target = data["upload_target"]
        print(f"[INFO] Received an upload target from tracker: {upload_target}")
        return upload_target

    print(f"[ERROR] Response is missing the 'upload_target' key: {data}")
    return None


def request_all_upload_targets() -> Optional[List[str]]:
    """Fetch the tracker's upload-target response and return it undecorated.

    NOTE(review): this requests the same ``/upload`` URL as
    ``request_upload_target`` and returns the decoded JSON unchanged, so the
    value may not actually be a list of strings as the annotation says.
    Confirm the intended endpoint and response shape with the tracker owner.

    Returns:
        The decoded JSON body on a 200 response, or None if the request
        failed or the body was not JSON.
    """
    resp = requests.get(
        # "https://httpbin.org/get",
        f"{TRACKER_ENDPOINT}/upload",
        timeout=REQUEST_TIMEOUT,
    )
    code = resp.status_code

    if code != 200:
        print(f"[ERROR] Unable to get all upload targets from tracker. Status: {code}")
        return None

    try:
        data = resp.json()
    except ValueError:
        print("[ERROR] Tracker response for all upload targets is not valid JSON")
        return None

    print(f"[INFO] Received all upload targets from tracker: {data}")
    return data


# `item_name` includes type prefix (video:id, playlist:id, etc)
def mark_item_as_done(item_name: str, item_size_bytes: int) -> bool:
    """Report a finished item to the tracker.

    Args:
        item_name: Full item name including its type prefix.
        item_size_bytes: Size reported to the tracker under ``bytes.data``.

    Returns:
        True if the tracker answered 200, False otherwise.
    """
    payload = {
        # TODO: Ask Fusl what this should be
        # https://www.archiveteam.org/index.php?title=Dev/Seesaw
        # ^ says it would be filled in by the Seesaw library
        "downloader": "Fusl",
        "version": VERSION,
        "item": item_name,
        "bytes": {
            "data": item_size_bytes
        }
    }

    resp = requests.post(f"{TRACKER_ENDPOINT}/done", json=payload, timeout=REQUEST_TIMEOUT)
    code = resp.status_code

    if code == 200:
        print(f"[INFO] Marked item '{item_name}' as done")
        return True

    if 400 <= code < 500:
        print(f"[ERROR] Unable to mark item as done. Status: {code}")
    elif 500 <= code < 600:
        # TODO: retry here
        # Until retry exists, at least report the failure instead of
        # dropping it silently.
        print(f"[ERROR] Tracker server error while marking item '{item_name}' as done. Status: {code}")
    else:
        print(f"[ERROR] Unknown response code while marking item '{item_name}' as done: {code}")

    return False


if __name__ == "__main__":
    # Manual smoke tests; uncomment the call you want to try.
    # print(add_item_to_tracker(ItemType.Channel, "test6"))
    # print(request_item_from_tracker())
    # print(request_upload_target())
    # print(request_all_upload_targets())
    # print(mark_item_as_done("test4", 200))
    pass  # a block containing only comments is a syntax error