The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

50 lignes
2.4 KiB

  1. #!/usr/bin/env python3
  2. import json
  3. import re
  4. import shlex
  5. import sys
  6. import urllib.request
  7. if not 2 <= len(sys.argv) <= 3 or sys.argv[1].lower() in ('-h', '--help') or re.search(r'(^|&)(output|limit|resumekey|showresumekey)=', sys.argv[1], re.IGNORECASE):
  8. print('Usage: ia-cdx-search QUERY [RESUMEKEY]', file = sys.stderr)
  9. print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
  10. print('The output, limit, resumeKey, and showResumeKey parameters are added by this script and must not be included.', file = sys.stderr)
  11. print('To resume a search that failed for some reason, provide the resumeKey through the second argument instead.', file = sys.stderr)
  12. print('Output is produces in JSONL format with one line per CDX entry.', file = sys.stderr)
  13. print('', file = sys.stderr)
  14. print('Examples:', file = sys.stderr)
  15. print(" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/'", file = sys.stderr)
  16. print(" - Subdirectories: ia-cdex-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)
  17. sys.exit(1)
  18. query = sys.argv[1]
  19. resumeKey = sys.argv[2:] or ''
  20. resumeKeyP = f'&resumeKey={resumeKey}' if resumeKey else ''
  21. baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}&output=json&limit=100&showResumeKey=true'
  22. url = f'{baseUrl}{resumeKeyP}'
  23. try:
  24. while True:
  25. print(f'GET {url}', file = sys.stderr)
  26. req = urllib.request.Request(url)
  27. with urllib.request.urlopen(req) as r:
  28. if r.getcode() != 200:
  29. raise RuntimeError(f'Could not fetch {url}')
  30. o = json.load(r)
  31. assert o, 'got empty response'
  32. hasResumeKey = len(o) >= 3 and o[-2] == [] and len(o[-1]) == 1
  33. fields = o[0]
  34. endOfDataRows = -2 if hasResumeKey else None
  35. newResumeKey = o[-1][0] if hasResumeKey else False
  36. assert all(len(v) == len(fields) for v in o[1 : endOfDataRows]), 'got unexpected response format'
  37. for row in o[1 : endOfDataRows]:
  38. print(json.dumps(dict(zip(fields, row))))
  39. if not newResumeKey:
  40. break
  41. url = f'{baseUrl}&resumeKey={newResumeKey}'
  42. except (RuntimeError, json.JSONDecodeError, AssertionError):
  43. resumeKeyS = f' {shlex.quote(resumeKey)}' if resumeKey else ''
  44. print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)}{resumeKeyS}', file = sys.stderr)
  45. raise