The little things give you away... A collection of various small helper scripts
#!/usr/bin/env python3
import asyncio
import collections
import json
import re
import shlex
import sys
import urllib.error
import urllib.request
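# Fetch url synchronously, retrying up to `tries` times on errors. Requests,
# status codes, and errors are logged to stderr; returns a (url, HTTP status
# code, decoded JSON body) tuple.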
def fetch(url, tries):
	for i in range(tries):
		try:
			print(f'GET {url}', file = sys.stderr)
			req = urllib.request.Request(url)
			with urllib.request.urlopen(req) as r:
				code = r.code
				print(f'{code} {url}', file = sys.stderr)
				if code != 200:
					raise RuntimeError(f'Could not fetch {url}')
				o = json.load(r)
				break
		except (RuntimeError, TimeoutError, urllib.error.URLError, json.JSONDecodeError) as e:
			print(f'Error retrieving {url}: {type(e).__module__}.{type(e).__name__} {e!s}', file = sys.stderr)
			if i == tries - 1:
				raise
	return url, code, o
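# Pop and await the oldest pending page task, then print one JSON object per
# CDX row (keyed by the header row) to stdout; returns the completed page number.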
async def wait_first_and_print(tasks):
	if not tasks:
		return
	task = tasks.popleft()
	url, code, o = await task
	assert o, 'got empty response'
	fields = o[0]
	assert all(len(v) == len(fields) for v in o[1:]), 'got unexpected response format'
	for row in o[1:]:
		print(json.dumps(dict(zip(fields, row))))
	print(f'Completed processing page {task._ia_cdx_page}', file = sys.stderr)
	return task._ia_cdx_page
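# Run the paginated CDX search: discover the page count first unless resuming,
# keep up to `concurrency` fetches in flight in the default thread pool, and
# print results in page order. On failure, a ready-to-run resume command is
# printed to stderr.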
async def main(query, concurrency = 1, tries = 1, startPage = None, numPages = None):
	assert (startPage is None) == (numPages is None)
	baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}'
	if startPage is None:
		url = f'{baseUrl}&showNumPages=true'
		numPages = int(fetch(url, tries)[2])
		startPage = 0
	print(f'{numPages} pages', file = sys.stderr)
	loop = asyncio.get_running_loop()
	tasks = collections.deque()
	lastGoodPage = -1
	try:
		try:
			for page in range(startPage, numPages):
				while len(tasks) >= concurrency:
					lastGoodPage = await wait_first_and_print(tasks)
				url = f'{baseUrl}&output=json&page={page}'
				task = loop.run_in_executor(None, fetch, url, tries)
				task._ia_cdx_page = page
				tasks.append(task)
			while len(tasks) > 0:
				lastGoodPage = await wait_first_and_print(tasks)
		except:
			# It isn't possible to actually cancel a task running in a thread, so need to await them and discard any additional errors that occur.
			try:
				await asyncio.gather(*tasks)
			except:
				pass
			raise
	except (RuntimeError, json.JSONDecodeError, AssertionError):
		concurrencyS = f'--concurrency {concurrency} ' if concurrency != 1 else ''
		triesS = f'--tries {tries} ' if tries != 1 else ''
		print(f'To resume this search from where it crashed, run: ia-cdx-search {concurrencyS}{triesS}--page {lastGoodPage + 1} --numpages {numPages} {shlex.quote(query)}', file = sys.stderr)
		raise
	except (BrokenPipeError, KeyboardInterrupt):
		pass
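# Print usage information and exit with a non-zero status.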
def usage():
	print('Usage: ia-cdx-search [--concurrency N] [--tries N] [--page N --numpages N] QUERY', file = sys.stderr)
	print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
	print('The output, limit, resumeKey, showResumeKey, page, and showNumPages parameters must not be included.', file = sys.stderr)
	print('To resume a search that failed for some reason, provide the page number and number of pages via the --page and --numpages options.', file = sys.stderr)
	print('Output is produced in JSONL format with one line per CDX entry.', file = sys.stderr)
	print('', file = sys.stderr)
	print('Examples:', file = sys.stderr)
	print(r" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/'", file = sys.stderr)
	print('   Note that this will only find subdomains whose homepages are in the Wayback Machine. To discover all known subdomains, remove the filter and then extract the domains from the results.', file = sys.stderr)
	print(r" - Subdirectories: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)
	print('   The same caveat applies. The directory must have been retrieved directly without an additional trailing path or query string.', file = sys.stderr)
	sys.exit(1)
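# Minimal CLI parsing: options must precede the single QUERY argument, and
# query parameters that are managed by this script itself are rejected.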
args = sys.argv[1:]
if not args or args[0].lower() in ('-h', '--help'):
	usage()
kwargs = {}
while args and args[0].startswith('--'):
	if args[0] == '--concurrency' and len(args) >= 2:
		kwargs['concurrency'] = int(args[1])
		args = args[2:]
	elif args[0] == '--tries' and len(args) >= 2:
		kwargs['tries'] = int(args[1])
		args = args[2:]
	elif args[0] == '--page' and len(args) >= 4 and args[2].lower() == '--numpages':
		kwargs['startPage'] = int(args[1])
		kwargs['numPages'] = int(args[3])
		args = args[4:]
	else:
		break
if len(args) != 1 or re.search(r'(^|&)(output|limit|resumekey|showresumekey|page|shownumpages)=', args[0], re.IGNORECASE):
	usage()
query = args[0]
asyncio.run(main(query, **kwargs))