diff --git a/ia-cdx-search b/ia-cdx-search index a02b468..0ca52cc 100755 --- a/ia-cdx-search +++ b/ia-cdx-search @@ -5,17 +5,26 @@ import json import re import shlex import sys +import urllib.error import urllib.request -def fetch(url): - print(f'GET {url}', file = sys.stderr) - req = urllib.request.Request(url) - with urllib.request.urlopen(req) as r: - if r.code != 200: - raise RuntimeError(f'Could not fetch {url}') - code = r.code - o = json.load(r) +def fetch(url, tries): + for i in range(tries): + try: + print(f'GET {url}', file = sys.stderr) + req = urllib.request.Request(url) + with urllib.request.urlopen(req) as r: + code = r.code + print(f'{code} {url}', file = sys.stderr) + if code != 200: + raise RuntimeError(f'Could not fetch {url}') + o = json.load(r) + break + except (RuntimeError, TimeoutError, urllib.error.URLError, json.JSONDecodeError) as e: + print(f'Error retrieving {url}: {type(e).__module__}.{type(e).__name__} {e!s}', file = sys.stderr) + if i == tries - 1: + raise return url, code, o @@ -24,21 +33,21 @@ async def wait_first_and_print(tasks): return task = tasks.popleft() url, code, o = await task - print(f'{code} {url}', file = sys.stderr) assert o, 'got empty response' fields = o[0] assert all(len(v) == len(fields) for v in o[1:]), 'got unexpected response format' for row in o[1:]: print(json.dumps(dict(zip(fields, row)))) + print(f'Completed processing page {task._ia_cdx_page}', file = sys.stderr) return task._ia_cdx_page -async def main(query, concurrency = 1, startPage = None, numPages = None): +async def main(query, concurrency = 1, tries = 1, startPage = None, numPages = None): assert (startPage is None) == (numPages is None) baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}' if startPage is None: url = f'{baseUrl}&showNumPages=true' - numPages = int(fetch(url)[2]) + numPages = int(fetch(url, tries)[2]) startPage = 0 print(f'{numPages} pages', file = sys.stderr) @@ -51,7 +60,7 @@ async def main(query, concurrency = 1, startPage = None, numPages = None): while len(tasks) >= concurrency: lastGoodPage = await wait_first_and_print(tasks) url = f'{baseUrl}&output=json&page={page}' - task = loop.run_in_executor(None, fetch, url) + task = loop.run_in_executor(None, fetch, url, tries) task._ia_cdx_page = page tasks.append(task) while len(tasks) > 0: @@ -64,16 +73,16 @@ async def main(query, concurrency = 1, startPage = None, numPages = None): pass raise except (RuntimeError, json.JSONDecodeError, AssertionError): - concurrencyS = f'{concurrency} ' if concurrency != 1 else '' - print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)} {concurrencyS}{lastGoodPage + 1} {numPages}', file = sys.stderr) + concurrencyS = f'--concurrency {concurrency} ' if concurrency != 1 else '' + triesS = f'--tries {tries} ' if tries != 1 else '' + print(f'To resume this search from where it crashed, run: ia-cdx-search {concurrencyS}{triesS}--page {lastGoodPage + 1} --numpages {numPages} {shlex.quote(query)}', file = sys.stderr) raise except (BrokenPipeError, KeyboardInterrupt): pass -args = sys.argv[1:] -if not 1 <= len(args) <= 4 or args[0].lower() in ('-h', '--help') or re.search(r'(^|&)(output|limit|resumekey|showresumekey|page|shownumpages)=', args[0], re.IGNORECASE): - print('Usage: ia-cdx-search QUERY [CONCURRENCY] [PAGE NUMPAGES]', file = sys.stderr) +def usage(): + print('Usage: ia-cdx-search [--concurrency N] [--tries N] [--page N --numpages N] QUERY', file = sys.stderr) print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr) print('The output, limit, resumeKey, showResumeKey, page, and showNumPages parameters must not be included.', file = sys.stderr) print('To resume a search that failed for some reason, provide the page number and number of pages through the second argument instead.', file = sys.stderr) @@ -85,10 +94,26 @@ if not 1 <= len(args) <= 4 or args[0].lower() in ('-h', '--help') or re.search(r print(" - Subdirectories: ia-cdex-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr) print(' The same caveat applies. The directory must have been retrieved directly without an additional trailing path or query string.', file = sys.stderr) sys.exit(1) -query = args[0] + + +args = sys.argv[1:] +if args[0].lower() in ('-h', '--help'): + usage() kwargs = {} -if len(args) in (2, 4): - kwargs['concurrency'] = int(args[1]) -if len(args) in (3, 4): - kwargs['startPage'], kwargs['numPages'] = map(int, args[-2:]) +while args[0].startswith('--'): + if args[0] == '--concurrency': + kwargs['concurrency'] = int(args[1]) + args = args[2:] + elif args[0] == '--tries': + kwargs['tries'] = int(args[1]) + args = args[2:] + elif args[0] == '--page' and args[2].lower() == '--numpages': + kwargs['startPage'] = int(args[1]) + kwargs['numPages'] = int(args[3]) + args = args[4:] + else: + break +if len(args) != 1 or re.search(r'(^|&)(output|limit|resumekey|showresumekey|page|shownumpages)=', args[0], re.IGNORECASE): + usage() +query = args[0] asyncio.run(main(query, **kwargs))