The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

50 lines
2.4 KiB

  1. #!/usr/bin/env python3
  2. import json
  3. import re
  4. import shlex
  5. import sys
  6. import urllib.request
  7. if not 2 <= len(sys.argv) <= 3 or sys.argv[1].lower() in ('-h', '--help') or re.search(r'(^|&)(output|limit|resumekey|showresumekey)=', sys.argv[1], re.IGNORECASE):
  8. print('Usage: ia-cdx-search QUERY [RESUMEKEY]', file = sys.stderr)
  9. print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
  10. print('The output, limit, resumeKey, and showResumeKey parameters are added by this script and must not be included.', file = sys.stderr)
  11. print('To resume a search that failed for some reason, provide the resumeKey through the second argument instead.', file = sys.stderr)
  12. print('Output is produces in JSONL format with one line per CDX entry.', file = sys.stderr)
  13. print('', file = sys.stderr)
  14. print('Examples:', file = sys.stderr)
  15. print(" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/'", file = sys.stderr)
  16. print(" - Subdirectories: ia-cdex-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)
  17. sys.exit(1)
  18. query = sys.argv[1]
  19. resumeKey = sys.argv[2:] or ''
  20. resumeKeyP = f'&resumeKey={resumeKey}' if resumeKey else ''
  21. baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}&output=json&limit=100&showResumeKey=true'
  22. url = f'{baseUrl}{resumeKeyP}'
  23. try:
  24. while True:
  25. print(f'GET {url}', file = sys.stderr)
  26. req = urllib.request.Request(url)
  27. with urllib.request.urlopen(req) as r:
  28. if r.getcode() != 200:
  29. raise RuntimeError(f'Could not fetch {url}')
  30. o = json.load(r)
  31. assert o, 'got empty response'
  32. hasResumeKey = len(o) >= 3 and o[-2] == [] and len(o[-1]) == 1
  33. fields = o[0]
  34. endOfDataRows = -2 if hasResumeKey else None
  35. newResumeKey = o[-1][0] if hasResumeKey else False
  36. assert all(len(v) == len(fields) for v in o[1 : endOfDataRows]), 'got unexpected response format'
  37. for row in o[1 : endOfDataRows]:
  38. print(json.dumps(dict(zip(fields, row))))
  39. if not newResumeKey:
  40. break
  41. url = f'{baseUrl}&resumeKey={newResumeKey}'
  42. except (RuntimeError, json.JSONDecodeError, AssertionError):
  43. resumeKeyS = f' {shlex.quote(resumeKey)}' if resumeKey else ''
  44. print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)}{resumeKeyS}', file = sys.stderr)
  45. raise