Browse Source

Initial HTTP3 Support

http3
tech234a 3 years ago
parent
commit
d303c266be
8 changed files with 257 additions and 16 deletions
  1. +14
    -4
      discovery.py
  2. +15
    -6
      export.py
  3. +54
    -0
      http3.py
  4. +158
    -0
      http3_base.py
  5. +2
    -1
      requirements.txt
  6. +7
    -0
      switchable_request.py
  7. +1
    -1
      tracker.py
  8. +6
    -4
      worker.py

+ 14
- 4
discovery.py View File

@@ -2,22 +2,32 @@ from time import sleep
from typing import Dict
from json import loads
from switchable_request import get
backend = "http3"
langcodes = {"Afar": "aa", "Abkhazian": "ab", "Afrikaans": "af", "Akan": "ak", "all": "all", "Amharic": "am", "Aragonese": "an", "Arabic": "ar", "Aramaic": "arc", "Algerian Arabic": "arq", "Assamese": "as", "American Sign Language": "ase", "Asturian": "ast", "Avaric": "av", "Aymara": "ay", "Azerbaijani": "az", "Bashkir": "ba", "Belarusian": "be", "Bulgarian": "bg", "Bihari": "bh", "Bislama": "bi", "Bangla": "bn", "Tibetan": "bo", "Breton": "br", "Bosnian": "bs", "Catalan": "ca", "Cebuano": "ceb", "Choctaw": "cho", "Cherokee": "chr", "Corsican": "co", "Czech": "cs", "Church Slavic": "cu", "Welsh": "cy", "Danish": "da", "Danish (Denmark)": "da-DK", "German": "de", "German (Austria)": "de-AT", "German (Switzerland)": "de-CH", "German (Germany)": "de-DE", "Divehi": "dv", "Dzongkha": "dz", "Ewe": "ee", "Greek": "el", "English": "en", "English (United Arab Emirates)": "en-AE", "English (Canada)": "en-CA", "English (United Kingdom)": "en-GB", "English (Ireland)": "en-IE", "English (India)": "en-IN", "English (United States)": "en-US", "Esperanto": "eo", "Spanish": "es", "Spanish (Latin America)": "es-419", "Spanish (Argentina)": "es-AR", "Spanish (Chile)": "es-CL", "Spanish (Colombia)": "es-CO", "Spanish (Costa Rica)": "es-CR", "Spanish (Spain)": "es-ES", "Spanish (Mexico)": "es-MX", "Spanish (Nicaragua)": "es-NI", "Spanish (United States)": "es-US", "Estonian": "et", "Basque": "eu", "Persian": "fa", "Persian (Afghanistan)": "fa-AF", "Persian (Iran)": "fa-IR", "Fulah": "ff", "Finnish": "fi", "Filipino": "fil", "Fijian": "fj", "Faroese": "fo", "French": "fr", "French (Belgium)": "fr-BE", "French (Canada)": "fr-CA", "French (Switzerland)": "fr-CH", "French (France)": "fr-FR", "Western Frisian": "fy", "Irish": "ga", "Scottish Gaelic": "gd", "Galician": "gl", "Guarani": "gn", "Swiss German": "gsw", "Gujarati": "gu", "Hausa": "ha", "Hakka Chinese": "hak", "Hakka Chinese (Taiwan)": "hak-TW", "Hindi": "hi-Latn", "Hmong": "hmn", "Croatian": "hr", "Haitian Creole": "ht", 
"Hungarian": "hu", "Armenian": "hy", "Interlingua": "ia", "Indonesian": "id", "Interlingue": "ie", "Igbo": "ig", "Sichuan Yi": "ii", "Inupiaq": "ik", "Icelandic": "is", "Italian": "it", "Italian (Italy)": "it-IT", "Inuktitut": "iu", "Hebrew": "iw", "Japanese": "ja", "Javanese": "jv", "Georgian": "ka", "Kazakh": "kk", "Kalaallisut": "kl", "Khmer": "km", "Kannada": "kn", "Korean": "ko", "Korean (South Korea)": "ko-KR", "Kanuri": "kr", "Kashmiri": "ks", "Kurdish": "ku", "Kyrgyz": "ky", "Latin": "la", "Luxembourgish": "lb", "Lingala": "ln", "Lao": "lo", "Lithuanian": "lt", "Mizo": "lus", "Latvian": "lv", "Masai": "mas", "Malagasy": "mg", "Maori": "mi", "Miscellaneous languages": "mis", "Macedonian": "mk", "Malayalam": "ml", "Mongolian": "mn", "Manipuri": "mni", "Moldavian": "mo", "Marathi": "mr", "Malay": "ms", "Maltese": "mt", "Burmese": "my", "Nauru": "na", "Min Nan Chinese": "nan", "Min Nan Chinese (Taiwan)": "nan-TW", "Nepali": "ne", "Dutch": "nl", "Dutch (Belgium)": "nl-BE", "Dutch (Netherlands)": "nl-NL", "Norwegian Nynorsk": "nn", "Norwegian": "no", "not": "not", "Navajo": "nv", "Occitan": "oc", "Oromo": "om", "Odia": "or", "Punjabi": "pa", "Polish": "pl", "Polish (Poland)": "pl-PL", "Pashto": "ps", "Portuguese": "pt", "Portuguese (Brazil)": "pt-BR", "Portuguese (Portugal)": "pt-PT", "Quechua": "qu", "Romansh": "rm", "Rundi": "rn", "Romanian": "ro", "Romanian (Moldova)": "ro-MD", "Russian": "ru-Latn", "Russian (Russia)": "ru-RU", "Kinyarwanda": "rw", "Sanskrit": "sa", "Sardinian": "sc", "Sicilian": "scn", "Scots": "sco", "Sindhi": "sd", "Sherdukpen": "sdp", "Northern Sami": "se", "Sango": "sg", "Serbo-Croatian": "sh", "Sinhala": "si", "Slovak": "sk", "Slovenian": "sl", "Samoan": "sm", "Shona": "sn", "Somali": "so", "Albanian": "sq", "Serbian": "sr", "Serbian (Cyrillic)": "sr-Cyrl", "Serbian (Latin)": "sr-Latn", "Swati": "ss", "Southern Sotho": "st", "Sundanese": "su", "Swedish": "sv", "Swahili": "sw", "Tamil": "ta", "Telugu": "te", "Tajik": "tg", "Thai": "th", 
"Tigrinya": "ti", "Turkmen": "tk", "Tagalog": "tl", "Klingon": "tlh", "Tswana": "tn", "Tongan": "to", "Turkish": "tr", "Turkish (Turkey)": "tr-TR", "Tsonga": "ts", "Tatar": "tt", "Twi": "tw", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz", "Vietnamese": "vi", "Volap\\xFCk": "vo", "Wolof": "wo", "Xhosa": "xh", "Yiddish": "yi", "Yoruba": "yo", "Cantonese": "yue", "Cantonese (Hong Kong)": "yue-HK", "Chinese": "zh", "Chinese (China)": "zh-CN", "Chinese (Hong Kong)": "zh-HK", "Chinese (Simplified)": "zh-Hans", "Chinese (Simplified, China)": "zh-Hans-CN", "Chinese (Simplified, Singapore)": "zh-Hans-SG", "Chinese (Traditional)": "zh-Hant", "Chinese (Traditional, Hong Kong)": "zh-Hant-HK", "Chinese (Traditional, Taiwan)": "zh-Hant-TW", "Chinese (Singapore)": "zh-SG", "Chinese (Taiwan)": "zh-TW", "Zulu": "zu", "Hiri Motu": "ho", "Tok Pisin": "tpi", "Voro": "vor"}
def getmetadata(mysession, vid):
def getmetadata(mysession, vid, allheaders):
global backend
params = (
("v", vid),
)
while True:
wpage = mysession.get("https://www.youtube.com/watch", params=params)
wpage = get("https://www.youtube.com/watch", params=params, mysession=mysession, backend=backend, http3headers=allheaders)
if not """</div><div id="content" class=" content-alignment" role="main"><p class='largeText'>Sorry for the interruption. We have been receiving a large volume of requests from your network.</p>
<p>To continue with your YouTube experience, please fill out the form below.</p>""" in wpage.text and not wpage.status_code == 429 and 'window["ytInitialPlayerResponse"] = ' in wpage.text and 'window["ytInitialData"] = ' in wpage.text:
break
else:
print("Captcha detected, waiting 30 seconds")
sleep(30)
if backend == "requests":
backend = "http3"
print("Captcha detected, switching discovery to HTTP3/QUIC")
else:
print("Captcha detected, waiting 30 seconds")
sleep(30)
wptext = wpage.text


+ 15
- 6
export.py View File

@@ -38,6 +38,10 @@ from time import sleep
# https://docs.python.org/3/library/html.parser.html
from html.parser import HTMLParser

backend = "http3"

from switchable_request import get

class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
@@ -79,7 +83,8 @@ class MyHTMLParser(HTMLParser):
elif self.get_starttag_text() and self.get_starttag_text().startswith('<div id="original-video-title"'):
self.inittitle += data

def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaptions):
def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaptions, allheaders):
global backend
if mode == "forceedit-metadata":
while needforcemetadata[langcode] == None: #extra logic
print("Awaiting forcemetadata")
@@ -114,7 +119,7 @@ def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaption
("o", "U")
)

page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
page = get("https://www.youtube.com/timedtext_editor", params=pparams, mysession=mysession, backend=backend, http3headers=allheaders)
elif mode == "forceedit-metadata":
pparams = (
("v", vid),
@@ -124,7 +129,7 @@ def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaption
('tab', 'metadata')
)

page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
page = get("https://www.youtube.com/timedtext_editor", params=pparams, mysession=mysession, backend=backend, http3headers=allheaders)
elif mode == "forceedit-captions":
pparams = (
("v", vid),
@@ -137,13 +142,17 @@ def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaption
("o", "U")
)

page = mysession.get("https://www.youtube.com/timedtext_editor", params=pparams)
page = get("https://www.youtube.com/timedtext_editor", params=pparams, mysession=mysession, backend=backend, http3headers=allheaders)

if not "accounts.google.com" in page.url and page.status_code != 429 and 'Subtitles/CC' in page.text and 'Title &amp; description' in page.text:
break
else:
print("[Retrying in 30 seconds for rate limit or login failure] Please supply authentication cookie information in config.json or environment variables. See README.md for more information.")
sleep(30)
if backend == "requests":
backend = "http3"
print("Rate limit or login failure, switching export to HTTP3/QUIC...")
else:
print("[Retrying in 30 seconds for rate limit or login failure] Please supply authentication cookie information in config.json or environment variables. See README.md for more information.")
sleep(30)
except:
print("Error in request, retrying in 5 seconds...")
sleep(5)


+ 54
- 0
http3.py View File

@@ -0,0 +1,54 @@
import asyncio
from typing import cast
from urllib.parse import urlparse
from aioquic.h3.connection import H3_ALPN
from aioquic.asyncio.client import connect
from aioquic.quic.configuration import QuicConfiguration
from http3_base import HttpClient, prepare_response, perform_http_request

class HTTP3Response:
    """Small ``requests``-like wrapper around an HTTP/3 reply.

    Attributes set by the constructor:
        content: the raw response body (bytes).
        text: the body decoded as UTF-8, or ``""`` when it cannot be decoded.
        headers: dict mapping decoded header names to decoded header values.
        status_code: integer taken from the ``:status`` pseudo-header; falls
            back to 200 when that header is missing or not an integer.
        ok: ``True`` when ``status_code`` is below 400.
    """

    def __init__(self, input) -> None:
        # ``input`` is a (headers, content) pair; headers maps bytes -> bytes.
        raw_headers, content = input
        self.content = content

        try:
            self.text = content.decode()
        except (UnicodeDecodeError, AttributeError):
            # Best effort: a binary or malformed body leaves ``text`` empty
            # while ``content`` still holds the raw bytes.
            print("Text decoding error")
            self.text = ""

        # Header bytes are not guaranteed to be valid UTF-8; substitute the
        # replacement character rather than failing the whole response.
        self.headers = {
            name.decode(errors="replace"): value.decode(errors="replace")
            for name, value in raw_headers.items()
        }

        try:
            self.status_code = int(raw_headers[b":status"])
        except (KeyError, ValueError, TypeError):
            print("Status code not included as header, defaulting to 200")
            self.status_code = 200

        self.ok = self.status_code < 400

async def main(address, headers=None):
    """Perform one HTTP/3 GET of *address* and return an ``HTTP3Response``.

    Args:
        address: absolute URL to fetch.
        headers: optional dict of extra request headers (str -> str).

    Returns:
        An ``HTTP3Response`` built from the events collected for the request.
    """
    parsed = urlparse(address)

    configuration = QuicConfiguration(is_client=True, alpn_protocols=H3_ALPN)

    # ``netloc`` may include ":port" or userinfo, which is not a resolvable
    # host name; use the parsed host and honour an explicit port if present.
    host = parsed.hostname
    port = parsed.port or 443

    async with connect(
        host,
        port=port,
        configuration=configuration,
        create_protocol=HttpClient,
    ) as client:
        client = cast(HttpClient, client)

        events = await perform_http_request(client=client, url=address, headers=headers)

        return HTTP3Response(prepare_response(events))

def get(url, headers=None, params=()):
    """Blocking HTTP/3 GET, shaped like a minimal ``requests.get``.

    Args:
        url: absolute URL without a query string.
        headers: optional dict of extra request headers.
        params: query parameters, either a mapping or an iterable of
            ``(key, value)`` pairs; values are converted with ``str``.

    Returns:
        The ``HTTP3Response`` produced by ``main``.
    """
    # Local import so this function does not depend on the module header.
    from urllib.parse import urlencode

    # urlencode percent-escapes keys and values and accepts both dicts and
    # sequences of pairs, unlike manual "k=v" concatenation.
    query = urlencode(params) if params else ""
    full_url = url + "?" + query if query else url

    request_headers = {} if headers is None else headers

    # A fresh loop is used per call; close it so its selector and file
    # descriptors are released instead of accumulating across requests.
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(main(full_url, headers=request_headers))
    finally:
        loop.close()

+ 158
- 0
http3_base.py View File

@@ -0,0 +1,158 @@
import asyncio
import logging
import time
from collections import deque
from typing import Deque, Dict, Optional
from urllib.parse import urlparse

import aioquic
from aioquic.asyncio.protocol import QuicConnectionProtocol
from aioquic.h3.connection import H3Connection
from aioquic.h3.events import (
DataReceived,
H3Event,
HeadersReceived,
PushPromiseReceived,
)
from aioquic.quic.events import QuicEvent

logger = logging.getLogger("client")

USER_AGENT = "aioquic/" + aioquic.__version__


class URL:
    """Split a URL string into the parts used for HTTP/3 pseudo-headers.

    Attributes:
        authority: the network location (``netloc``) of the URL.
        full_path: the path plus ``?query`` when a query is present; never
            empty, a URL without a path yields ``"/"``.
        scheme: the URL scheme.
    """

    def __init__(self, url: str) -> None:
        parsed = urlparse(url)

        self.authority = parsed.netloc
        # The ":path" pseudo-header must not be empty for http(s) URLs, so a
        # bare origin such as "https://host" maps to "/".
        self.full_path = parsed.path or "/"
        if parsed.query:
            self.full_path += "?" + parsed.query
        self.scheme = parsed.scheme


class HttpRequest:
    """Plain container describing one outgoing HTTP request.

    Attributes:
        method: HTTP method name, e.g. ``"GET"``.
        url: the parsed target URL object.
        content: request body bytes (empty by default).
        headers: dict of extra request headers; a new empty dict is created
            per instance when none is supplied.
    """

    def __init__(
        self,
        method: str,
        url: "URL",
        content: bytes = b"",
        headers: Optional[Dict] = None,
    ) -> None:
        self.content = content
        # Never share one default dict between instances.
        self.headers = {} if headers is None else headers
        self.method = method
        self.url = url


class HttpClient(QuicConnectionProtocol):
    """QUIC connection protocol that issues HTTP/3 requests.

    Each request is sent on its own stream; the H3 events received for that
    stream are collected and handed back once the stream has ended.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # push_id -> events received for that server push
        self.pushes: Dict[int, Deque[H3Event]] = {}
        # stream_id -> events collected so far for an in-flight request
        self._request_events: Dict[int, Deque[H3Event]] = {}
        # stream_id -> future resolved with the events when the stream ends
        self._request_waiter: Dict[int, asyncio.Future[Deque[H3Event]]] = {}

        self._http = H3Connection(self._quic)

    async def get(self, url: str, headers: Optional[Dict] = None) -> Deque[H3Event]:
        """
        Perform a GET request.

        ``headers`` is an optional dict of extra request headers (str -> str).
        Returns the H3 events received on the request's stream.
        """
        return await self._request(
            HttpRequest(
                method="GET",
                url=URL(url),
                headers={} if headers is None else headers,
            )
        )

    async def post(
        self, url: str, data: bytes, headers: Optional[Dict] = None
    ) -> Deque[H3Event]:
        """
        Perform a POST request.

        ``data`` is the request body; ``headers`` is an optional dict of extra
        request headers. Returns the H3 events received on the request's stream.
        """
        return await self._request(
            HttpRequest(
                method="POST",
                url=URL(url),
                content=data,
                headers={} if headers is None else headers,
            )
        )

    def http_event_received(self, event: H3Event) -> None:
        """Route an H3 event to the request or push it belongs to."""
        if isinstance(event, (HeadersReceived, DataReceived)):
            stream_id = event.stream_id
            if stream_id in self._request_events:
                # http
                self._request_events[stream_id].append(event)
                if event.stream_ended:
                    # Response complete: wake up the coroutine awaiting it.
                    request_waiter = self._request_waiter.pop(stream_id)
                    request_waiter.set_result(self._request_events.pop(stream_id))

            elif event.push_id in self.pushes:
                # push
                self.pushes[event.push_id].append(event)

        elif isinstance(event, PushPromiseReceived):
            self.pushes[event.push_id] = deque()
            self.pushes[event.push_id].append(event)

    def quic_event_received(self, event: QuicEvent) -> None:
        """Feed a QUIC event to the HTTP layer and dispatch what it yields."""
        # pass event to the HTTP layer
        if self._http is not None:
            for http_event in self._http.handle_event(event):
                self.http_event_received(http_event)

    async def _request(self, request: HttpRequest) -> Deque[H3Event]:
        """Send *request* on a new stream and wait for its complete response."""
        stream_id = self._quic.get_next_available_stream_id()

        extra_headers = [
            (k.lower().encode(), v.encode()) for (k, v) in request.headers.items()
        ]
        headers = [
            (b":method", request.method.encode()),
            (b":scheme", request.url.scheme.encode()),
            (b":authority", request.url.authority.encode()),
            (b":path", request.url.full_path.encode()),
        ]
        # Only add the default user-agent when the caller did not provide
        # one, so the header is never sent twice.
        if not any(name == b"user-agent" for name, _ in extra_headers):
            headers.append((b"user-agent", USER_AGENT.encode()))
        headers += extra_headers

        self._http.send_headers(stream_id=stream_id, headers=headers)
        self._http.send_data(stream_id=stream_id, data=request.content, end_stream=True)

        waiter = self._loop.create_future()
        self._request_events[stream_id] = deque()
        self._request_waiter[stream_id] = waiter
        self.transmit()

        return await asyncio.shield(waiter)


async def perform_http_request(
    client: HttpClient,
    url: str,
    headers: Optional[dict] = None,
) -> Deque[H3Event]:
    """Issue a GET for *url* through *client* and log the transfer rate.

    Args:
        client: connected ``HttpClient`` used to send the request.
        url: absolute URL to fetch.
        headers: optional extra request headers; ignored when empty or None.

    Returns:
        The events returned by ``client.get`` for this request.
    """
    method = "GET"

    # perform request (monotonic clock so the duration cannot go negative)
    start = time.perf_counter()
    if headers:
        http_events = await client.get(url, headers=headers)
    else:
        http_events = await client.get(url)
    elapsed = time.perf_counter() - start

    # print speed
    octets = sum(
        len(http_event.data)
        for http_event in http_events
        if isinstance(http_event, DataReceived)
    )
    # Guard the rate computation against a zero-length measurement.
    mbps = octets * 8 / elapsed / 1000000 if elapsed > 0 else 0.0
    logger.info(
        "Response received for %s %s : %d bytes in %.1f s (%.3f Mbps)",
        method,
        urlparse(url).path,
        octets,
        elapsed,
        mbps,
    )

    return http_events


def prepare_response(
    http_events: Deque[H3Event]
) -> "tuple[dict, bytes]":
    """Collapse a request's H3 events into ``(headers, body)``.

    Args:
        http_events: the events collected for one request.

    Returns:
        A 2-tuple ``(headers, body)``: ``headers`` is a dict built from every
        ``HeadersReceived`` event, and ``body`` is the concatenation of all
        ``DataReceived`` payloads in arrival order.
    """
    body_chunks = []
    headers = {}

    for http_event in http_events:
        if isinstance(http_event, HeadersReceived):
            # dict.update keeps only the last value for a repeated name.
            # NOTE(review): presumably event.headers is a list of
            # (name, value) byte pairs - confirm against aioquic.
            headers.update(http_event.headers)
        elif isinstance(http_event, DataReceived):
            body_chunks.append(http_event.data)

    return headers, b"".join(body_chunks)

+ 2
- 1
requirements.txt View File

@@ -1,4 +1,5 @@
requests
beautifulsoup4
html5lib
youtube_dl
youtube_dl
aioquic

+ 7
- 0
switchable_request.py View File

@@ -0,0 +1,7 @@
import http3
def get(url: str, params: tuple = (), backend="requests", mysession=None, http3headers: dict = None):
    """Perform a GET through the selected backend.

    Args:
        url: absolute URL to fetch.
        params: query parameters as a tuple of ``(key, value)`` pairs.
        backend: ``"requests"`` to use ``mysession``, or ``"http3"`` to use
            the ``http3`` module.
        mysession: session object whose ``get`` is used by the
            ``"requests"`` backend.
        http3headers: headers passed to ``http3.get`` by the ``"http3"``
            backend; they are not used by the ``"requests"`` backend.

    Returns:
        Whatever the chosen backend's ``get`` returns.

    Raises:
        ValueError: if ``backend`` is not a known backend name.
    """
    if backend == "requests":
        # Session.get() takes only the URL positionally; params must be
        # passed by keyword or the call fails with TypeError.
        return mysession.get(url, params=params)
    if backend == "http3":
        return http3.get(url, headers=http3headers or {}, params=params)
    # Fail loudly instead of returning None for a mistyped backend name.
    raise ValueError("Unknown backend: " + repr(backend))

+ 1
- 1
tracker.py View File

@@ -9,7 +9,7 @@ from os.path import isfile
from json import loads

# https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
VERSION = "20200924.07"
VERSION = "20200924.10"

TRACKER_ID = "ext-yt-communitycontribs"
TRACKER_HOST = "trackerproxy.meo.ws"


+ 6
- 4
worker.py View File

@@ -64,6 +64,7 @@ if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
assert False

mysession = requests.session()
allheaders = {"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",}
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})

validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
@@ -109,11 +110,12 @@ def threadrunner():
elif task == "discovery":
while True:
try:
info = getmetadata(mysession, str(vid).strip())
info = getmetadata(mysession, str(vid).strip(), allheaders)
break
except BaseException as e:
print(e)
print("Error in retrieving information, waiting 30 seconds and trying again")
#raise
sleep(30)
if info[0] or info[1]: # ccenabled or creditdata
if not isdir("out/"+str(vid).strip()):
@@ -143,11 +145,11 @@ def threadrunner():
jobs.put(("submitdiscovery", playldisc, tracker.ItemType.Playlist))

elif task == "subtitles":
subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions)
subprrun(mysession, args, vid, "default", needforcemetadata, needforcecaptions, allheaders)
elif task == "subtitles-forceedit-captions":
subprrun(mysession, args, vid, "forceedit-captions", needforcemetadata, needforcecaptions)
subprrun(mysession, args, vid, "forceedit-captions", needforcemetadata, needforcecaptions, allheaders)
elif task == "subtitles-forceedit-metadata":
subprrun(mysession, args, vid, "forceedit-metadata", needforcemetadata, needforcecaptions)
subprrun(mysession, args, vid, "forceedit-metadata", needforcemetadata, needforcecaptions, allheaders)
elif task == "channel":
try:
y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)


Loading…
Cancel
Save