@@ -2,22 +2,32 @@ from time import sleep | |||
from typing import Dict | |||
from json import loads | |||
from switchable_request import get | |||
backend = "http3" | |||
langcodes = {"Afar": "aa", "Abkhazian": "ab", "Afrikaans": "af", "Akan": "ak", "all": "all", "Amharic": "am", "Aragonese": "an", "Arabic": "ar", "Aramaic": "arc", "Algerian Arabic": "arq", "Assamese": "as", "American Sign Language": "ase", "Asturian": "ast", "Avaric": "av", "Aymara": "ay", "Azerbaijani": "az", "Bashkir": "ba", "Belarusian": "be", "Bulgarian": "bg", "Bihari": "bh", "Bislama": "bi", "Bangla": "bn", "Tibetan": "bo", "Breton": "br", "Bosnian": "bs", "Catalan": "ca", "Cebuano": "ceb", "Choctaw": "cho", "Cherokee": "chr", "Corsican": "co", "Czech": "cs", "Church Slavic": "cu", "Welsh": "cy", "Danish": "da", "Danish (Denmark)": "da-DK", "German": "de", "German (Austria)": "de-AT", "German (Switzerland)": "de-CH", "German (Germany)": "de-DE", "Divehi": "dv", "Dzongkha": "dz", "Ewe": "ee", "Greek": "el", "English": "en", "English (United Arab Emirates)": "en-AE", "English (Canada)": "en-CA", "English (United Kingdom)": "en-GB", "English (Ireland)": "en-IE", "English (India)": "en-IN", "English (United States)": "en-US", "Esperanto": "eo", "Spanish": "es", "Spanish (Latin America)": "es-419", "Spanish (Argentina)": "es-AR", "Spanish (Chile)": "es-CL", "Spanish (Colombia)": "es-CO", "Spanish (Costa Rica)": "es-CR", "Spanish (Spain)": "es-ES", "Spanish (Mexico)": "es-MX", "Spanish (Nicaragua)": "es-NI", "Spanish (United States)": "es-US", "Estonian": "et", "Basque": "eu", "Persian": "fa", "Persian (Afghanistan)": "fa-AF", "Persian (Iran)": "fa-IR", "Fulah": "ff", "Finnish": "fi", "Filipino": "fil", "Fijian": "fj", "Faroese": "fo", "French": "fr", "French (Belgium)": "fr-BE", "French (Canada)": "fr-CA", "French (Switzerland)": "fr-CH", "French (France)": "fr-FR", "Western Frisian": "fy", "Irish": "ga", "Scottish Gaelic": "gd", "Galician": "gl", "Guarani": "gn", "Swiss German": "gsw", "Gujarati": "gu", "Hausa": "ha", "Hakka Chinese": "hak", "Hakka Chinese (Taiwan)": "hak-TW", "Hindi": "hi-Latn", "Hmong": "hmn", "Croatian": "hr", "Haitian Creole": "ht", 
"Hungarian": "hu", "Armenian": "hy", "Interlingua": "ia", "Indonesian": "id", "Interlingue": "ie", "Igbo": "ig", "Sichuan Yi": "ii", "Inupiaq": "ik", "Icelandic": "is", "Italian": "it", "Italian (Italy)": "it-IT", "Inuktitut": "iu", "Hebrew": "iw", "Japanese": "ja", "Javanese": "jv", "Georgian": "ka", "Kazakh": "kk", "Kalaallisut": "kl", "Khmer": "km", "Kannada": "kn", "Korean": "ko", "Korean (South Korea)": "ko-KR", "Kanuri": "kr", "Kashmiri": "ks", "Kurdish": "ku", "Kyrgyz": "ky", "Latin": "la", "Luxembourgish": "lb", "Lingala": "ln", "Lao": "lo", "Lithuanian": "lt", "Mizo": "lus", "Latvian": "lv", "Masai": "mas", "Malagasy": "mg", "Maori": "mi", "Miscellaneous languages": "mis", "Macedonian": "mk", "Malayalam": "ml", "Mongolian": "mn", "Manipuri": "mni", "Moldavian": "mo", "Marathi": "mr", "Malay": "ms", "Maltese": "mt", "Burmese": "my", "Nauru": "na", "Min Nan Chinese": "nan", "Min Nan Chinese (Taiwan)": "nan-TW", "Nepali": "ne", "Dutch": "nl", "Dutch (Belgium)": "nl-BE", "Dutch (Netherlands)": "nl-NL", "Norwegian Nynorsk": "nn", "Norwegian": "no", "not": "not", "Navajo": "nv", "Occitan": "oc", "Oromo": "om", "Odia": "or", "Punjabi": "pa", "Polish": "pl", "Polish (Poland)": "pl-PL", "Pashto": "ps", "Portuguese": "pt", "Portuguese (Brazil)": "pt-BR", "Portuguese (Portugal)": "pt-PT", "Quechua": "qu", "Romansh": "rm", "Rundi": "rn", "Romanian": "ro", "Romanian (Moldova)": "ro-MD", "Russian": "ru-Latn", "Russian (Russia)": "ru-RU", "Kinyarwanda": "rw", "Sanskrit": "sa", "Sardinian": "sc", "Sicilian": "scn", "Scots": "sco", "Sindhi": "sd", "Sherdukpen": "sdp", "Northern Sami": "se", "Sango": "sg", "Serbo-Croatian": "sh", "Sinhala": "si", "Slovak": "sk", "Slovenian": "sl", "Samoan": "sm", "Shona": "sn", "Somali": "so", "Albanian": "sq", "Serbian": "sr", "Serbian (Cyrillic)": "sr-Cyrl", "Serbian (Latin)": "sr-Latn", "Swati": "ss", "Southern Sotho": "st", "Sundanese": "su", "Swedish": "sv", "Swahili": "sw", "Tamil": "ta", "Telugu": "te", "Tajik": "tg", "Thai": "th", 
"Tigrinya": "ti", "Turkmen": "tk", "Tagalog": "tl", "Klingon": "tlh", "Tswana": "tn", "Tongan": "to", "Turkish": "tr", "Turkish (Turkey)": "tr-TR", "Tsonga": "ts", "Tatar": "tt", "Twi": "tw", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz", "Vietnamese": "vi", "Volap\\xFCk": "vo", "Wolof": "wo", "Xhosa": "xh", "Yiddish": "yi", "Yoruba": "yo", "Cantonese": "yue", "Cantonese (Hong Kong)": "yue-HK", "Chinese": "zh", "Chinese (China)": "zh-CN", "Chinese (Hong Kong)": "zh-HK", "Chinese (Simplified)": "zh-Hans", "Chinese (Simplified, China)": "zh-Hans-CN", "Chinese (Simplified, Singapore)": "zh-Hans-SG", "Chinese (Traditional)": "zh-Hant", "Chinese (Traditional, Hong Kong)": "zh-Hant-HK", "Chinese (Traditional, Taiwan)": "zh-Hant-TW", "Chinese (Singapore)": "zh-SG", "Chinese (Taiwan)": "zh-TW", "Zulu": "zu", "Hiri Motu": "ho", "Tok Pisin": "tpi", "Voro": "vor"} | |||
def getmetadata(mysession, vid): | |||
def getmetadata(mysession, vid, allheaders): | |||
global backend | |||
params = ( | |||
("v", vid), | |||
) | |||
while True: | |||
wpage = mysession.get("https://www.youtube.com/watch", params=params) | |||
wpage = get("https://www.youtube.com/watch", params=params, mysession=mysession, backend=backend, http3headers=allheaders) | |||
if not """</div><div id="content" class=" content-alignment" role="main"><p class='largeText'>Sorry for the interruption. We have been receiving a large volume of requests from your network.</p> | |||
<p>To continue with your YouTube experience, please fill out the form below.</p>""" in wpage.text and not wpage.status_code == 429 and 'window["ytInitialPlayerResponse"] = ' in wpage.text and 'window["ytInitialData"] = ' in wpage.text: | |||
break | |||
else: | |||
print("Captcha detected, waiting 30 seconds") | |||
sleep(30) | |||
if backend == "requests": | |||
backend = "http3" | |||
print("Captcha detected, switching discovery to HTTP3/QUIC") | |||
else: | |||
print("Captcha detected, waiting 30 seconds") | |||
sleep(30) | |||
wptext = wpage.text | |||
@@ -38,6 +38,10 @@ from time import sleep | |||
# https://docs.python.org/3/library/html.parser.html | |||
from html.parser import HTMLParser | |||
backend = "http3" | |||
from switchable_request import get | |||
class MyHTMLParser(HTMLParser): | |||
def __init__(self): | |||
HTMLParser.__init__(self) | |||
@@ -79,7 +83,8 @@ class MyHTMLParser(HTMLParser): | |||
elif self.get_starttag_text() and self.get_starttag_text().startswith('<div id="original-video-title"'): | |||
self.inittitle += data | |||
def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaptions): | |||
def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaptions, allheaders): | |||
global backend | |||
if mode == "forceedit-metadata": | |||
while needforcemetadata[langcode] == None: #extra logic | |||
print("Awaiting forcemetadata") | |||
@@ -114,7 +119,7 @@ def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaption | |||
("o", "U") | |||
) | |||
page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams) | |||
page = get("https://www.youtube.com/timedtext_editor", params=pparams, mysession=mysession, backend=backend, http3headers=allheaders) | |||
elif mode == "forceedit-metadata": | |||
pparams = ( | |||
("v", vid), | |||
@@ -124,7 +129,7 @@ def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaption | |||
('tab', 'metadata') | |||
) | |||
page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams) | |||
page = get("https://www.youtube.com/timedtext_editor", params=pparams, mysession=mysession, backend=backend, http3headers=allheaders) | |||
elif mode == "forceedit-captions": | |||
pparams = ( | |||
("v", vid), | |||
@@ -137,13 +142,17 @@ def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaption | |||
("o", "U") | |||
) | |||
page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams) | |||
page = get("https://www.youtube.com/timedtext_editor", params=pparams, mysession=mysession, backend=backend, http3headers=allheaders) | |||
if not "accounts.google.com" in page.url and page.status_code != 429 and 'Subtitles/CC' in page.text and 'Title & description' in page.text: | |||
break | |||
else: | |||
print("[Retrying in 30 seconds for rate limit or login failure] Please supply authentication cookie information in config.json or environment variables. See README.md for more information.") | |||
sleep(30) | |||
if backend == "requests": | |||
backend = "http3" | |||
print("Rate limit or login failure, switching export to HTTP3/QUIC...") | |||
else: | |||
print("[Retrying in 30 seconds for rate limit or login failure] Please supply authentication cookie information in config.json or environment variables. See README.md for more information.") | |||
sleep(30) | |||
except: | |||
print("Error in request, retrying in 5 seconds...") | |||
sleep(5) | |||
@@ -0,0 +1,54 @@ | |||
import asyncio | |||
from typing import cast | |||
from urllib.parse import urlparse | |||
from aioquic.h3.connection import H3_ALPN | |||
from aioquic.asyncio.client import connect | |||
from aioquic.quic.configuration import QuicConfiguration | |||
from http3_base import HttpClient, prepare_response, perform_http_request | |||
class HTTP3Response:
    """Minimal requests-like wrapper around one HTTP/3 exchange.

    Exposes the subset of the ``requests.Response`` interface the rest of
    this project uses: ``content``, ``text``, ``headers``, ``status_code``
    and ``ok``.
    """

    def __init__(self, input) -> None:
        # `input` is the (headers, body) pair produced by prepare_response():
        # headers maps bytes -> bytes, body is the raw response payload.
        headers, content = input
        self.content = content
        try:
            self.text = content.decode()
        except UnicodeDecodeError:
            # Body is not valid UTF-8; keep the original best-effort
            # behaviour of logging and exposing an empty string.
            print("Text decoding error")
            self.text = ""
        # Decode header names/values so callers get a plain str dict.
        self.headers = {k.decode(): v.decode() for k, v in headers.items()}
        try:
            self.status_code = int(headers[b":status"])
        except (KeyError, ValueError):
            # Missing or malformed :status pseudo-header.
            print("Status code not included as header, defaulting to 200")
            self.status_code = 200
        # Mirror requests' `ok`: truthy for any non-error (< 400) status.
        self.ok = self.status_code < 400
async def main(address, headers={}):
    """Fetch `address` once over HTTP/3 and return an HTTP3Response.

    Opens a fresh QUIC connection for every call, performs a single GET via
    perform_http_request, and wraps the collected H3 events.
    NOTE(review): `headers={}` is a mutable default; it is never mutated
    here, but callers should still pass headers explicitly.
    """
    parsed = urlparse(address)
    # Advertise the HTTP/3 ALPN identifiers during the TLS handshake.
    configuration = QuicConfiguration(
        is_client=True, alpn_protocols=H3_ALPN
    )
    # Port is hard-coded to 443; assumes the URL carries no explicit port
    # in its netloc — TODO confirm against callers.
    async with connect(parsed.netloc, port=443, configuration=configuration, create_protocol=HttpClient) as client:
        client = cast(HttpClient, client)
        events = await perform_http_request(client=client, url=address, headers=headers)
        return HTTP3Response(prepare_response(events))
def get(url, headers=None, params=()):
    """Synchronous HTTP/3 GET.

    `params` is an iterable of (key, value) pairs appended to the URL as a
    query string. Returns an HTTP3Response.
    """
    if headers is None:
        headers = {}
    # Build the query string by hand; values are assumed to already be
    # URL-safe (video ids, language codes) — no percent-encoding is done.
    pairs = [str(k) + "=" + str(v) for k, v in params]
    pstring = "?" + "&".join(pairs) if pairs else ""
    # A fresh event loop per call keeps this usable from any thread; close
    # it afterwards so we do not leak loops/file descriptors on every call.
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(main(url + pstring, headers=headers))
    finally:
        loop.close()
@@ -0,0 +1,158 @@ | |||
import asyncio | |||
import logging | |||
import time | |||
from collections import deque | |||
from typing import Deque, Dict, Optional | |||
from urllib.parse import urlparse | |||
import aioquic | |||
from aioquic.asyncio.protocol import QuicConnectionProtocol | |||
from aioquic.h3.connection import H3Connection | |||
from aioquic.h3.events import ( | |||
DataReceived, | |||
H3Event, | |||
HeadersReceived, | |||
PushPromiseReceived, | |||
) | |||
from aioquic.quic.events import QuicEvent | |||
logger = logging.getLogger("client") | |||
USER_AGENT = "aioquic/" + aioquic.__version__ | |||
class URL:
    """Split a URL string into the pieces an HTTP/3 request needs."""

    def __init__(self, url: str) -> None:
        parsed = urlparse(url)
        self.authority = parsed.netloc
        # :path pseudo-header value: path plus query string, if any.
        path = parsed.path
        if parsed.query:
            path = path + "?" + parsed.query
        self.full_path = path
        self.scheme = parsed.scheme
class HttpRequest:
    """Plain container describing one HTTP request.

    Holds the method, parsed URL, optional body bytes and extra headers.
    """

    def __init__(
        self, method: str, url: URL, content: bytes = b"", headers: Optional[Dict] = None
    ) -> None:
        self.content = content
        # Avoid the shared-mutable-default pitfall: each request gets its
        # own headers dict unless one is supplied by the caller.
        self.headers = {} if headers is None else headers
        self.method = method
        self.url = url
class HttpClient(QuicConnectionProtocol):
    """aioquic protocol implementing a minimal HTTP/3 request client.

    One instance wraps one QUIC connection; each GET/POST opens a new
    request stream and resolves a future with that stream's H3 events.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Server-push events, keyed by push id (see PushPromiseReceived).
        self.pushes: Dict[int, Deque[H3Event]] = {}
        # Events collected so far for each in-flight request stream.
        self._request_events: Dict[int, Deque[H3Event]] = {}
        # One future per stream, resolved with the full event deque once
        # the stream ends.
        self._request_waiter: Dict[int, asyncio.Future[Deque[H3Event]]] = {}
        self._http = H3Connection(self._quic)

    async def get(self, url: str, headers: Dict = {}) -> Deque[H3Event]:
        """
        Perform a GET request.
        """
        return await self._request(
            HttpRequest(method="GET", url=URL(url), headers=headers)
        )

    async def post(self, url: str, data: bytes, headers: Dict = {}) -> Deque[H3Event]:
        """
        Perform a POST request.
        """
        return await self._request(
            HttpRequest(method="POST", url=URL(url), content=data, headers=headers)
        )

    def http_event_received(self, event: H3Event) -> None:
        # Route headers/body events either to the owning request stream or,
        # if they belong to a server push, to the matching push deque.
        if isinstance(event, (HeadersReceived, DataReceived)):
            stream_id = event.stream_id
            if stream_id in self._request_events:
                # http
                self._request_events[event.stream_id].append(event)
                if event.stream_ended:
                    # Stream complete: hand all buffered events to the waiter.
                    request_waiter = self._request_waiter.pop(stream_id)
                    request_waiter.set_result(self._request_events.pop(stream_id))
            elif event.push_id in self.pushes:
                # push
                self.pushes[event.push_id].append(event)
        elif isinstance(event, PushPromiseReceived):
            # New server push announced: start buffering its events.
            self.pushes[event.push_id] = deque()
            self.pushes[event.push_id].append(event)

    def quic_event_received(self, event: QuicEvent) -> None:
        # pass event to the HTTP layer
        if self._http is not None:
            for http_event in self._http.handle_event(event):
                self.http_event_received(http_event)

    async def _request(self, request: HttpRequest) -> Deque[H3Event]:
        # Send the request headers and body on a fresh stream, then wait
        # for the response events.
        stream_id = self._quic.get_next_available_stream_id()
        self._http.send_headers(
            stream_id=stream_id,
            headers=[
                (b":method", request.method.encode()),
                (b":scheme", request.url.scheme.encode()),
                (b":authority", request.url.authority.encode()),
                (b":path", request.url.full_path.encode()),
                (b"user-agent", USER_AGENT.encode()),
            ]
            + [(k.lower().encode(), v.encode()) for (k, v) in request.headers.items()],
        )
        self._http.send_data(stream_id=stream_id, data=request.content, end_stream=True)
        waiter = self._loop.create_future()
        self._request_events[stream_id] = deque()
        self._request_waiter[stream_id] = waiter
        # Flush pending QUIC datagrams to the network.
        self.transmit()
        # shield() so cancellation of the caller does not cancel the waiter.
        return await asyncio.shield(waiter)
async def perform_http_request(
    client: HttpClient,
    url: str,
    headers: Optional[dict]
) -> Deque[H3Event]:
    """Issue a single GET for `url` on `client` and return the stream's events.

    Logs transfer size and throughput at INFO level. The return annotation
    was corrected: this returns the event deque for one stream, not a dict.
    """
    # Always a GET; set unconditionally so the log line below can never hit
    # an unbound local (previously it was only assigned on one branch).
    method = "GET"
    start = time.time()
    if headers:
        http_events = await client.get(url, headers=headers)
    else:
        http_events = await client.get(url)
    elapsed = time.time() - start
    # Sum body bytes for the throughput log line.
    octets = 0
    for http_event in http_events:
        if isinstance(http_event, DataReceived):
            octets += len(http_event.data)
    # Guard against a zero elapsed time (very fast/mocked responses) which
    # would otherwise raise ZeroDivisionError.
    mbps = octets * 8 / elapsed / 1000000 if elapsed > 0 else 0.0
    logger.info(
        "Response received for %s %s : %d bytes in %.1f s (%.3f Mbps)"
        % (method, urlparse(url).path, octets, elapsed, mbps)
    )
    return http_events
def prepare_response( | |||
http_events: Deque[H3Event] | |||
) -> str: | |||
byteslist = [] | |||
headers = {} | |||
for http_event in http_events: | |||
if isinstance(http_event, HeadersReceived): | |||
headers.update(http_event.headers) | |||
elif isinstance(http_event, DataReceived): | |||
byteslist.append(http_event.data) | |||
return headers, b''.join(byteslist) |
@@ -1,4 +1,5 @@ | |||
requests | |||
beautifulsoup4 | |||
html5lib | |||
youtube_dl | |||
youtube_dl | |||
aioquic |
@@ -0,0 +1,7 @@ | |||
import http3 | |||
def get(url: str, params: tuple = (), backend="requests", mysession=None, http3headers: dict = None):
    """Dispatch a GET through either a requests session or the HTTP/3 client.

    params:       iterable of (key, value) query pairs.
    backend:      "requests" (use `mysession`) or "http3" (use the http3 module).
    http3headers: headers forwarded only on the HTTP/3 path.
    Raises ValueError for an unknown backend instead of silently returning None.
    """
    if http3headers is None:
        # Avoid a shared mutable default dict across calls.
        http3headers = {}
    if backend == "requests":
        # requests.Session.get accepts only the URL positionally; params
        # must go by keyword (the old positional call raised TypeError).
        return mysession.get(url, params=params)
    elif backend == "http3":
        return http3.get(url, headers=http3headers, params=params)
    raise ValueError("Unknown backend: " + str(backend))
@@ -9,7 +9,7 @@ from os.path import isfile | |||
from json import loads | |||
# https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py | |||
VERSION = "20200924.07" | |||
VERSION = "20200924.10" | |||
TRACKER_ID = "ext-yt-communitycontribs" | |||
TRACKER_HOST = "trackerproxy.meo.ws" | |||
@@ -64,6 +64,7 @@ if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]): | |||
assert False | |||
mysession = requests.session() | |||
allheaders = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",} | |||
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}) | |||
validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U") | |||
@@ -109,11 +110,12 @@ def threadrunner(): | |||
elif task == "discovery": | |||
while True: | |||
try: | |||
info = getmetadata(mysession, str(vid).strip()) | |||
info = getmetadata(mysession, str(vid).strip(), allheaders) | |||
break | |||
except BaseException as e: | |||
print(e) | |||
print("Error in retrieving information, waiting 30 seconds and trying again") | |||
#raise | |||
sleep(30) | |||
if info[0] or info[1]: # ccenabled or creditdata | |||
if not isdir("out/"+str(vid).strip()): | |||
@@ -143,11 +145,11 @@ def threadrunner(): | |||
jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist)) | |||
elif task == "subtitles": | |||
subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions) | |||
subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions, allheaders) | |||
elif task == "subtitles-forceedit-captions": | |||
subprrun(mysession, args, vid, "forceedit-captions", needforcemetadata, needforcecaptions) | |||
subprrun(mysession, args, vid, "forceedit-captions", needforcemetadata, needforcecaptions, allheaders) | |||
elif task == "subtitles-forceedit-metadata": | |||
subprrun(mysession, args, vid, "forceedit-metadata", needforcemetadata, needforcecaptions) | |||
subprrun(mysession, args, vid, "forceedit-metadata", needforcemetadata, needforcecaptions, allheaders) | |||
elif task == "channel": | |||
try: | |||
y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False) | |||