Browse Source

Track redirect depth

master
JustAnotherArchivist 3 years ago
parent
commit
f8f5258197
2 changed files with 8 additions and 7 deletions
  1. +4
    -3
      qwarc/__init__.py
  2. +4
    -4
      qwarc/utils.py

+ 4
- 3
qwarc/__init__.py View File

@@ -106,7 +106,7 @@ class Item:
headers = self._merge_headers(headers, extraHeaders = [('Referer', str(fromResponse.url))] if fromResponse is not None else [])
history = []
attempt = 0
#TODO redirectLevel
redirectLevel = 0
while True:
attempt += 1
response = None
@@ -138,10 +138,10 @@ class Item:
self.stats['requests'] += 1
except (asyncio.TimeoutError, _aiohttp.ClientError) as e:
self.logger.warning(f'Request for {url} failed: {e!r}')
action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = e, item = self)
action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = e, redirectLevel = redirectLevel, item = self)
exc = e # Pass the exception outward for the history
else:
action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = None, item = self)
action, writeToWarc = await responseHandler(url = url, attempt = attempt, response = response, exc = None, redirectLevel = redirectLevel, item = self)
if response and exc is None and writeToWarc:
self.warc.write_client_response(response)
history.append((response, exc))
@@ -159,6 +159,7 @@ class Item:
method = 'GET'
data = None
attempt = 0
redirectLevel += 1
elif action == ACTION_RETRIES_EXCEEDED:
self.logger.error(f'Request for {url} failed {attempt} times')
retResponse.qhistory = tuple(history)


+ 4
- 4
qwarc/utils.py View File

@@ -127,7 +127,7 @@ def generate_range_items(start, stop, step):
yield f'{i}-{min(i + step - 1, stop)}'


async def handle_response_default(*, url, attempt, response, exc, item):
async def handle_response_default(*, url, attempt, response, exc, redirectLevel, item):
'''
The default response handler, which behaves as follows:
- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
@@ -141,14 +141,14 @@ async def handle_response_default(*, url, attempt, response, exc, item):

Note that this handler does not limit the number of retries on errors.

Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), item (qwarc.Item instance)
Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None), redirectLevel (int), item (qwarc.Item instance)
At least one of response and exc is not None.
The redirectLevel indicates how many redirects were followed to get to this url, i.e. it starts out as zero and increases by one for every redirect.
The attempt starts from 1 for every url, i.e. it is reset on redirects. The handler is invoked at most once for each attempt.
Returns: (one of the ACTION_* constants, bool signifying whether to write to WARC or not)
The latter is ignored when exc is not None; responses that triggered an exception are never written to WARC.
'''

#TODO: Document that `attempt` is reset on redirects

if response is None:
await asyncio.sleep(5)
return ACTION_RETRY, True


Loading…
Cancel
Save