Browse Source

Add fromResponse parameter for URL completion and automatic Referer header

master
JustAnotherArchivist 3 years ago
parent
commit
59ae1183d2
1 changed file with 12 additions and 5 deletions
  1. +12
    -5
      qwarc/__init__.py

+ 12
- 5
qwarc/__init__.py View File

@@ -49,12 +49,15 @@ class Item:
else:
self._baseUrl = yarl.URL(baseUrl)

def _merge_headers(self, headers):
def _merge_headers(self, headers, extraHeaders = []):
d = {} # Preserves order from Python 3.7 (guaranteed) or CPython 3.6 (implementation detail)
keys = {} # casefolded key -> d key
for key, value in self.headers:
d[key] = value
keys[key.casefold()] = key
for key, value in extraHeaders:
d[key] = value
keys[key.casefold()] = key
for key, value in headers:
keyc = key.casefold()
if value is None:
@@ -75,7 +78,7 @@ class Item:
out.append((key, value))
return out

async def fetch(self, url, responseHandler = None, method = 'GET', data = None, headers = [], verify_ssl = True, timeout = 60):
async def fetch(self, url, responseHandler = None, method = 'GET', data = None, headers = [], verify_ssl = True, timeout = 60, fromResponse = None):
'''
HTTP GET or POST a URL

@@ -88,6 +91,7 @@ class Item:
If a header appears multiple times, only the last one is used. To send a header multiple times, pass a tuple of values.
verify_ssl: bool, whether the SSL/TLS certificate should be validated
timeout: int or float, how long the fetch may take at most in total (sending request until finishing reading the response)
fromResponse: ClientResponse or None; if provided, use fromResponse.url for the url completion (instead of self.baseUrl) and add it as a Referer header

Returns response (a ClientResponse object or a qwarc.utils.DummyClientResponse object)
'''
@@ -96,13 +100,16 @@ class Item:

url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
if not url.scheme or not url.host:
if not self.baseUrl:
if fromResponse is not None:
url = fromResponse.url.join(url)
elif not self.baseUrl:
raise ValueError('Incomplete URL and no baseUrl to join it with')
url = self.baseUrl.join(url)
else:
url = self.baseUrl.join(url)
if responseHandler is None:
responseHandler = self.defaultResponseHandler
assert method in ('GET', 'POST'), 'method must be GET or POST'
headers = self._merge_headers(headers)
headers = self._merge_headers(headers, extraHeaders = [('Referer', str(fromResponse.url))] if fromResponse is not None else [])
history = []
attempt = 0
#TODO redirectLevel


Loading…
Cancel
Save