
Add retries

master
JustAnotherArchivist 2 years ago
commit 8f7619ff3a
1 changed file with 47 additions and 22 deletions
ia-cdx-search  +47 -22

@@ -5,17 +5,26 @@ import json
 import re
 import shlex
 import sys
+import urllib.error
 import urllib.request
 
 
-def fetch(url):
-	print(f'GET {url}', file = sys.stderr)
-	req = urllib.request.Request(url)
-	with urllib.request.urlopen(req) as r:
-		if r.code != 200:
-			raise RuntimeError(f'Could not fetch {url}')
-		code = r.code
-		o = json.load(r)
+def fetch(url, tries):
+	for i in range(tries):
+		try:
+			print(f'GET {url}', file = sys.stderr)
+			req = urllib.request.Request(url)
+			with urllib.request.urlopen(req) as r:
+				code = r.code
+				print(f'{code} {url}', file = sys.stderr)
+				if code != 200:
+					raise RuntimeError(f'Could not fetch {url}')
+				o = json.load(r)
+			break
+		except (RuntimeError, TimeoutError, urllib.error.URLError, json.JSONDecodeError) as e:
+			print(f'Error retrieving {url}: {type(e).__module__}.{type(e).__name__} {e!s}', file = sys.stderr)
+			if i == tries - 1:
+				raise
 	return url, code, o
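
Note: the retry loop treats HTTP errors, timeouts, connection failures, and malformed JSON alike: each failed attempt is logged to stderr, and only the final attempt re-raises. A minimal sketch of the same idiom, with illustrative names (retry and operation are not part of the script):

    import sys

    def retry(operation, tries):
        for i in range(tries):
            try:
                return operation()
            except Exception as e:
                # Log every failure; give up only on the last attempt.
                print(f'Error: {e!s}', file = sys.stderr)
                if i == tries - 1:
                    raise

    # retry(lambda: 1 / 0, 3) logs three errors, then re-raises the ZeroDivisionError.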


@@ -24,21 +33,21 @@ async def wait_first_and_print(tasks):
 		return
 	task = tasks.popleft()
 	url, code, o = await task
 	print(f'{code} {url}', file = sys.stderr)
 	assert o, 'got empty response'
 	fields = o[0]
 	assert all(len(v) == len(fields) for v in o[1:]), 'got unexpected response format'
 	for row in o[1:]:
 		print(json.dumps(dict(zip(fields, row))))
 	print(f'Completed processing page {task._ia_cdx_page}', file = sys.stderr)
 	return task._ia_cdx_page
 
 
-async def main(query, concurrency = 1, startPage = None, numPages = None):
+async def main(query, concurrency = 1, tries = 1, startPage = None, numPages = None):
 	assert (startPage is None) == (numPages is None)
 	baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}'
 	if startPage is None:
 		url = f'{baseUrl}&showNumPages=true'
-		numPages = int(fetch(url)[2])
+		numPages = int(fetch(url, tries)[2])
 		startPage = 0
 	print(f'{numPages} pages', file = sys.stderr)
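
Note: wait_first_and_print depends on the CDX server's JSON output format, a list of rows in which the first row holds the field names. Roughly, with invented values:

    import json

    o = [
        ['urlkey', 'timestamp', 'original'],
        ['org,example)/', '20010101000000', 'http://example.org/'],
    ]
    fields = o[0]
    for row in o[1:]:
        # Same zip-into-dict step as in the script above.
        print(json.dumps(dict(zip(fields, row))))
    # {"urlkey": "org,example)/", "timestamp": "20010101000000", "original": "http://example.org/"}
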

@@ -51,7 +60,7 @@ async def main(query, concurrency = 1, startPage = None, numPages = None):
 			while len(tasks) >= concurrency:
 				lastGoodPage = await wait_first_and_print(tasks)
 			url = f'{baseUrl}&output=json&page={page}'
-			task = loop.run_in_executor(None, fetch, url)
+			task = loop.run_in_executor(None, fetch, url, tries)
 			task._ia_cdx_page = page
 			tasks.append(task)
 		while len(tasks) > 0:
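
Note: loop.run_in_executor(None, func, *args) forwards the trailing positional arguments to func, so the extra tries argument reaches fetch unchanged. Assuming the names from the hunk above are in scope, an equivalent spelling would be:

    import functools
    # Bind the arguments up front instead of letting run_in_executor pass them through.
    task = loop.run_in_executor(None, functools.partial(fetch, url, tries))
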
@@ -64,16 +73,16 @@ async def main(query, concurrency = 1, startPage = None, numPages = None):
 				pass
 		raise
 	except (RuntimeError, json.JSONDecodeError, AssertionError):
-		concurrencyS = f'{concurrency} ' if concurrency != 1 else ''
-		print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)} {concurrencyS}{lastGoodPage + 1} {numPages}', file = sys.stderr)
+		concurrencyS = f'--concurrency {concurrency} ' if concurrency != 1 else ''
+		triesS = f'--tries {tries} ' if tries != 1 else ''
+		print(f'To resume this search from where it crashed, run: ia-cdx-search {concurrencyS}{triesS}--page {lastGoodPage + 1} --numpages {numPages} {shlex.quote(query)}', file = sys.stderr)
 		raise
 	except (BrokenPipeError, KeyboardInterrupt):
 		pass
 
 
-args = sys.argv[1:]
-if not 1 <= len(args) <= 4 or args[0].lower() in ('-h', '--help') or re.search(r'(^|&)(output|limit|resumekey|showresumekey|page|shownumpages)=', args[0], re.IGNORECASE):
-	print('Usage: ia-cdx-search QUERY [CONCURRENCY] [PAGE NUMPAGES]', file = sys.stderr)
+def usage():
+	print('Usage: ia-cdx-search [--concurrency N] [--tries N] [--page N --numpages N] QUERY', file = sys.stderr)
 	print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
 	print('The output, limit, resumeKey, showResumeKey, page, and showNumPages parameters must not be included.', file = sys.stderr)
 	print('To resume a search that failed for some reason, provide the page number and number of pages through the second argument instead.', file = sys.stderr)
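
Note: the resume hint now uses the new flag syntax, with the query moved to the end and quoted via shlex.quote. For illustration, a search run with --tries 3 that crashes after completing page 11 of 50 would print something like (the query string is just an example):

    To resume this search from where it crashed, run: ia-cdx-search --tries 3 --page 12 --numpages 50 'url=example.org&matchType=domain'
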
@@ -85,10 +94,26 @@ if not 1 <= len(args) <= 4 or args[0].lower() in ('-h', '--help') or re.search(r
 	print(" - Subdirectories: ia-cdex-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)
 	print(' The same caveat applies. The directory must have been retrieved directly without an additional trailing path or query string.', file = sys.stderr)
 	sys.exit(1)
-query = args[0]
+
+
+args = sys.argv[1:]
+if args[0].lower() in ('-h', '--help'):
+	usage()
 kwargs = {}
-if len(args) in (2, 4):
-	kwargs['concurrency'] = int(args[1])
-if len(args) in (3, 4):
-	kwargs['startPage'], kwargs['numPages'] = map(int, args[-2:])
+while args[0].startswith('--'):
+	if args[0] == '--concurrency':
+		kwargs['concurrency'] = int(args[1])
+		args = args[2:]
+	elif args[0] == '--tries':
+		kwargs['tries'] = int(args[1])
+		args = args[2:]
+	elif args[0] == '--page' and args[2].lower() == '--numpages':
+		kwargs['startPage'] = int(args[1])
+		kwargs['numPages'] = int(args[3])
+		args = args[4:]
+	else:
+		break
+if len(args) != 1 or re.search(r'(^|&)(output|limit|resumekey|showresumekey|page|shownumpages)=', args[0], re.IGNORECASE):
+	usage()
+query = args[0]
 asyncio.run(main(query, **kwargs))
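
Note: the new parser consumes flags until the first non-flag argument, so options must precede the query, and --page must be immediately followed by --numpages (the code checks args[2] directly). Illustrative invocations (the query string is just an example):

    ia-cdx-search 'url=example.org&matchType=domain'
    ia-cdx-search --concurrency 4 --tries 3 'url=example.org&matchType=domain'
    ia-cdx-search --page 12 --numpages 50 'url=example.org&matchType=domain'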
