
Add retries

master
JustAnotherArchivist 2 years ago
commit 8f7619ff3a
1 changed file with 47 additions and 22 deletions
ia-cdx-search  +47 -22

@@ -5,17 +5,26 @@ import json
 import re
 import shlex
 import sys
+import urllib.error
 import urllib.request
 
 
-def fetch(url):
-	print(f'GET {url}', file = sys.stderr)
-	req = urllib.request.Request(url)
-	with urllib.request.urlopen(req) as r:
-		if r.code != 200:
-			raise RuntimeError(f'Could not fetch {url}')
-		code = r.code
-		o = json.load(r)
+def fetch(url, tries):
+	for i in range(tries):
+		try:
+			print(f'GET {url}', file = sys.stderr)
+			req = urllib.request.Request(url)
+			with urllib.request.urlopen(req) as r:
+				code = r.code
+				print(f'{code} {url}', file = sys.stderr)
+				if code != 200:
+					raise RuntimeError(f'Could not fetch {url}')
+				o = json.load(r)
+			break
+		except (RuntimeError, TimeoutError, urllib.error.URLError, json.JSONDecodeError) as e:
+			print(f'Error retrieving {url}: {type(e).__module__}.{type(e).__name__} {e!s}', file = sys.stderr)
+			if i == tries - 1:
+				raise
 	return url, code, o
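
Note: the retry loop treats HTTP errors, timeouts, connection failures, and malformed JSON alike: each failed attempt is logged to stderr, and only the final attempt re-raises. A minimal sketch of the same idiom, with illustrative names (retry and operation are not part of the script):

    import sys

    def retry(operation, tries):
        for i in range(tries):
            try:
                return operation()
            except Exception as e:
                # Log every failure; give up only on the last attempt.
                print(f'Error: {e!s}', file = sys.stderr)
                if i == tries - 1:
                    raise

    # retry(lambda: 1 / 0, 3) logs three errors, then re-raises the ZeroDivisionError.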


@@ -24,21 +33,21 @@ async def wait_first_and_print(tasks):
 		return
 	task = tasks.popleft()
 	url, code, o = await task
 	print(f'{code} {url}', file = sys.stderr)
 	assert o, 'got empty response'
 	fields = o[0]
 	assert all(len(v) == len(fields) for v in o[1:]), 'got unexpected response format'
 	for row in o[1:]:
 		print(json.dumps(dict(zip(fields, row))))
 	print(f'Completed processing page {task._ia_cdx_page}', file = sys.stderr)
 	return task._ia_cdx_page
 
 
-async def main(query, concurrency = 1, startPage = None, numPages = None):
+async def main(query, concurrency = 1, tries = 1, startPage = None, numPages = None):
 	assert (startPage is None) == (numPages is None)
 	baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}'
 	if startPage is None:
 		url = f'{baseUrl}&showNumPages=true'
-		numPages = int(fetch(url)[2])
+		numPages = int(fetch(url, tries)[2])
 		startPage = 0
 	print(f'{numPages} pages', file = sys.stderr)
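
Note: wait_first_and_print depends on the CDX server's JSON output format, a list of rows in which the first row holds the field names. Roughly, with invented values:

    import json

    o = [
        ['urlkey', 'timestamp', 'original'],
        ['org,example)/', '20010101000000', 'http://example.org/'],
    ]
    fields = o[0]
    for row in o[1:]:
        # Same zip-into-dict step as in the script above.
        print(json.dumps(dict(zip(fields, row))))
    # {"urlkey": "org,example)/", "timestamp": "20010101000000", "original": "http://example.org/"}
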

@@ -51,7 +60,7 @@ async def main(query, concurrency = 1, startPage = None, numPages = None):
 			while len(tasks) >= concurrency:
 				lastGoodPage = await wait_first_and_print(tasks)
 			url = f'{baseUrl}&output=json&page={page}'
-			task = loop.run_in_executor(None, fetch, url)
+			task = loop.run_in_executor(None, fetch, url, tries)
 			task._ia_cdx_page = page
 			tasks.append(task)
 		while len(tasks) > 0:
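
Note: loop.run_in_executor(None, func, *args) forwards the trailing positional arguments to func, so the extra tries argument reaches fetch unchanged. Assuming the names from the hunk above are in scope, an equivalent spelling would be:

    import functools
    # Bind the arguments up front instead of letting run_in_executor pass them through.
    task = loop.run_in_executor(None, functools.partial(fetch, url, tries))
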
@@ -64,16 +73,16 @@ async def main(query, concurrency = 1, startPage = None, numPages = None):
 				pass
 		raise
 	except (RuntimeError, json.JSONDecodeError, AssertionError):
-		concurrencyS = f'{concurrency} ' if concurrency != 1 else ''
-		print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)} {concurrencyS}{lastGoodPage + 1} {numPages}', file = sys.stderr)
+		concurrencyS = f'--concurrency {concurrency} ' if concurrency != 1 else ''
+		triesS = f'--tries {tries} ' if tries != 1 else ''
+		print(f'To resume this search from where it crashed, run: ia-cdx-search {concurrencyS}{triesS}--page {lastGoodPage + 1} --numpages {numPages} {shlex.quote(query)}', file = sys.stderr)
 		raise
 	except (BrokenPipeError, KeyboardInterrupt):
 		pass
 
 
-args = sys.argv[1:]
-if not 1 <= len(args) <= 4 or args[0].lower() in ('-h', '--help') or re.search(r'(^|&)(output|limit|resumekey|showresumekey|page|shownumpages)=', args[0], re.IGNORECASE):
-	print('Usage: ia-cdx-search QUERY [CONCURRENCY] [PAGE NUMPAGES]', file = sys.stderr)
+def usage():
+	print('Usage: ia-cdx-search [--concurrency N] [--tries N] [--page N --numpages N] QUERY', file = sys.stderr)
 	print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
 	print('The output, limit, resumeKey, showResumeKey, page, and showNumPages parameters must not be included.', file = sys.stderr)
 	print('To resume a search that failed for some reason, provide the page number and number of pages through the second argument instead.', file = sys.stderr)
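
Note: the resume hint now uses the new flag syntax, with the query moved to the end and quoted via shlex.quote. For illustration, a search run with --tries 3 that crashes after completing page 11 of 50 would print something like (the query string is just an example):

    To resume this search from where it crashed, run: ia-cdx-search --tries 3 --page 12 --numpages 50 'url=example.org&matchType=domain'
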
@@ -85,10 +94,26 @@ if not 1 <= len(args) <= 4 or args[0].lower() in ('-h', '--help') or re.search(r
 	print(" - Subdirectories: ia-cdex-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)
 	print(' The same caveat applies. The directory must have been retrieved directly without an additional trailing path or query string.', file = sys.stderr)
 	sys.exit(1)
-query = args[0]
+
+
+args = sys.argv[1:]
+if args[0].lower() in ('-h', '--help'):
+	usage()
 kwargs = {}
-if len(args) in (2, 4):
-	kwargs['concurrency'] = int(args[1])
-if len(args) in (3, 4):
-	kwargs['startPage'], kwargs['numPages'] = map(int, args[-2:])
+while args[0].startswith('--'):
+	if args[0] == '--concurrency':
+		kwargs['concurrency'] = int(args[1])
+		args = args[2:]
+	elif args[0] == '--tries':
+		kwargs['tries'] = int(args[1])
+		args = args[2:]
+	elif args[0] == '--page' and args[2].lower() == '--numpages':
+		kwargs['startPage'] = int(args[1])
+		kwargs['numPages'] = int(args[3])
+		args = args[4:]
+	else:
+		break
+if len(args) != 1 or re.search(r'(^|&)(output|limit|resumekey|showresumekey|page|shownumpages)=', args[0], re.IGNORECASE):
+	usage()
+query = args[0]
 asyncio.run(main(query, **kwargs))
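
Note: the new parser consumes flags until the first non-flag argument, so options must precede the query, and --page must be immediately followed by --numpages (the code checks args[2] directly). Illustrative invocations (the query string is just an example):

    ia-cdx-search 'url=example.org&matchType=domain'
    ia-cdx-search --concurrency 4 --tries 3 'url=example.org&matchType=domain'
    ia-cdx-search --page 12 --numpages 50 'url=example.org&matchType=domain'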
