
Merge branch 'master' of github.com:Data-Horde/ytcc-archive

pull/5/head
tech234a committed 3 years ago
commit a4b250f002
2 changed files with 166 additions and 1 deletion
  1. README.md  +1 -1
  2. tracker.py  +165 -0

README.md  +1 -1

@@ -1,5 +1,5 @@
# YouTube Community Contributions Archiving Worker
-Export YouTube community-contributed captioning drafts to SBV files. Export YouTube community-contributed titles and descriptions to JSON (coming soon).
+Export YouTube community-contributed captioning drafts to SBV files. Export YouTube community-contributed titles and descriptions to JSON. Export published caption credits to JSON.
## Setup
Install the requirements in the requirements.txt file (`pip install -r requirements.txt`). Because the captioning editor is only available to logged-in users, you must specify the values of three session cookies for any Google account (`HSID`, `SSID`, and `SID`). You can get these cookie values by opening the developer tools on any youtube.com webpage, going to the "Application" (Chrome) or "Storage" (Firefox) tab, selecting "Cookies", and copying the required values.
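
Since these three cookies are what authenticate the worker's requests, here is a minimal sketch of how they could be attached to an HTTP session with `requests`. The values are placeholders, and the exact way this project reads and uses them may differ:

```python
import requests

# Placeholder values copied from the browser's developer tools.
session = requests.Session()
session.cookies.update({
    "HSID": "<value from browser>",
    "SSID": "<value from browser>",
    "SID": "<value from browser>",
})

# Requests made through this session are sent as the logged-in account,
# which is what the captioning editor endpoints require.
response = session.get("https://www.youtube.com/")
```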


tracker.py  +165 -0

@@ -0,0 +1,165 @@
from typing import Optional, List
from enum import Enum, auto
import requests

# TODO: Implement backoff for 500 response codes

# https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
VERSION = "20200921.01"

TRACKER_ID = "ext-yt-communitycontribs"
TRACKER_HOST = "trackerproxy.meo.ws"

BACKFEED_HOST = "blackbird-amqp.meo.ws:23038"

BACKFEED_ENDPOINT = f"http://{BACKFEED_HOST}/{TRACKER_ID}-kj57sxhhzcn2kqjp/"
TRACKER_ENDPOINT = f"http://{TRACKER_HOST}/{TRACKER_ID}"


class ItemType(Enum):
    Video = auto()
    Channel = auto()
    MixPlaylist = auto()
    Playlist = auto()


def add_item_to_tracker(item_type: ItemType, item_id: str) -> bool:
    """Feed items into the tracker through backfeed (item names will be deduplicated):
    # curl -d 'ITEMNAME' -so/dev/null $amqp_endpoint

    # Response codes:
    # 200 - Item added to tracker
    # 409 - Item is already in tracker
    # 404 - Project backfeed channel not found
    # 400 - Item name has a bad format
    """
    type_name = item_type.name.lower()
    item_name = f"{type_name}:{item_id}"

    req = requests.post(BACKFEED_ENDPOINT, data=item_name)

    code = req.status_code

    if code == 200:
        print(f"[INFO] Item ID '{item_name}' added to tracker successfully")
        return True
    elif code == 409:
        print(f"[INFO] Item ID '{item_name}' has already been added to tracker")
        return True
    elif code == 404:
        print(f"[ERROR] Unable to add item ID '{item_name}' to tracker. Project backfeed channel not found: {BACKFEED_ENDPOINT}")
    elif code == 400:
        print(f"[ERROR] Item ID '{item_name}' has a bad format")
    else:
        print(f"[ERROR] Unknown response code adding item '{item_name}' to tracker: {code}")

    return False


def request_item_from_tracker() -> Optional[str]:
    data = {
        # TODO: Ask Fusl what this should be
        # https://www.archiveteam.org/index.php?title=Dev/Seesaw
        # ^ says it would be filled in by the Seesaw library
        "downloader": "Fusl",
        "api_version": "2",
        "version": VERSION
    }

    req = requests.post(f"{TRACKER_ENDPOINT}/request", json=data)

    code = req.status_code

    if code == 200:
        data = req.json()

        if "item_name" in data:
            item_name = data["item_name"]
            print(f"[INFO] Received an item from tracker: {item_name}")
            return item_name
        else:
            print(f"[ERROR] Received item is missing the 'item_name' key: {data}")
    else:
        print(f"[ERROR] Unable to get an item from tracker. Status: {code}")

def request_upload_target() -> Optional[str]:
    req = requests.get(
        # "https://httpbin.org/get",
        f"{TRACKER_ENDPOINT}/upload",
    )

    code = req.status_code

    if code == 200:
        data = req.json()

        if "upload_target" in data:
            upload_target = data["upload_target"]
            print(f"[INFO] Received an upload target from tracker: {upload_target}")
            return upload_target
        else:
            print(f"[ERROR] Response is missing the 'upload_target' key: {data}")
    else:
        print(f"[ERROR] Unable to get an upload target from tracker. Status: {code}")


def request_all_upload_targets() -> Optional[List[str]]:
    req = requests.get(
        # "https://httpbin.org/get",
        f"{TRACKER_ENDPOINT}/upload",
    )

    code = req.status_code

    if code == 200:
        data = req.json()
        print(f"[INFO] Received all upload targets from tracker: {data}")
        return data
    else:
        print(f"[ERROR] Unable to get all upload targets from tracker. Status: {code}")


# `item_name` includes type prefix (video:id, playlist:id, etc)
def mark_item_as_done(item_name: str, item_size_bytes: int) -> bool:
    data = {
        # TODO: Ask Fusl what this should be
        # https://www.archiveteam.org/index.php?title=Dev/Seesaw
        # ^ says it would be filled in by the Seesaw library
        "downloader": "Fusl",
        "version": VERSION,
        "item": item_name,
        "bytes": {
            "data": item_size_bytes
        }
    }

    req = requests.post(f"{TRACKER_ENDPOINT}/done", json=data)

    code = req.status_code

    if code == 200:
        print(f"[INFO] Marked item '{item_name}' as done")
        return True
    elif 400 <= code < 500:
        print(f"[ERROR] Unable to mark item as done. Status: {code}")
    elif 500 <= code < 600:
        # TODO: retry here
        pass
    else:
        print(f"[ERROR] Unknown response code while marking item '{item_name}' as done: {code}")

    return False
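

# The TODO comments at the top of this file and in mark_item_as_done() leave
# 5xx handling open. A minimal retry-with-backoff sketch follows; it is an
# assumption, not part of this commit, and the helper name, attempt count,
# and delays are illustrative only.
import time


def post_with_backoff(url: str, max_attempts: int = 5, **kwargs) -> requests.Response:
    """POST to `url`, retrying with exponential backoff on 5xx responses."""
    delay = 1
    resp = requests.post(url, **kwargs)
    for _ in range(max_attempts - 1):
        if not (500 <= resp.status_code < 600):
            return resp
        print(f"[WARN] Got {resp.status_code} from {url}, retrying in {delay}s")
        time.sleep(delay)
        delay *= 2
        resp = requests.post(url, **kwargs)
    return resp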


if __name__ == "__main__":
    # Example invocations (uncomment to test against the tracker):
    # print(add_item_to_tracker(ItemType.Channel, "test6"))
    # print(request_item_from_tracker())
    # print(request_upload_target())
    # print(request_all_upload_targets())
    # print(mark_item_as_done("test4", 200))
    pass
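
Taken together, the functions above cover the tracker side of a worker's lifecycle: request an item, do the export, ask where to upload, and report the item as done. Below is a minimal sketch of how they might be wired together, assuming it sits in the same directory as tracker.py; `process_item` is a hypothetical stand-in for this repository's actual export code, and the upload step itself is out of scope:

```python
import time

# Assumes this sketch lives alongside tracker.py.
from tracker import mark_item_as_done, request_item_from_tracker, request_upload_target


def process_item(item_name: str) -> int:
    """Hypothetical placeholder for the export code; returns the item size in bytes."""
    raise NotImplementedError


def worker_loop() -> None:
    while True:
        item_name = request_item_from_tracker()
        if item_name is None:
            # The tracker had no work (or an error was printed); wait and ask again.
            time.sleep(30)
            continue

        size_bytes = process_item(item_name)

        # Where the exported data should go; performing the upload is out of
        # scope for this sketch.
        upload_target = request_upload_target()
        print(f"[INFO] Would upload {item_name} ({size_bytes} bytes) to {upload_target}")

        mark_item_as_done(item_name, size_bytes)


if __name__ == "__main__":
    worker_loop()
```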
