|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849 |
#!/usr/bin/env python3
"""Search the Internet Archive Wayback Machine CDX server and print the results as JSONL.

Usage: ia-cdx-search QUERY [RESUMEKEY]

QUERY is the raw query string passed to the CDX server. The script adds the
output, limit, resumeKey, and showResumeKey parameters itself, requests the
results page by page, and writes one JSON object per CDX entry to stdout.
Progress ("GET <url>") and diagnostics go to stderr.

If a request or a response check fails, the command line needed to resume from
the last successfully printed page is written to stderr before the error is
re-raised.
"""
import json
import re
import shlex
import sys
import urllib.request


CDX_ENDPOINT = 'https://web.archive.org/cdx/search/cdx'

# Parameters this script controls itself; a QUERY containing any of them is rejected.
RESERVED_PARAMS = re.compile(r'(^|&)(output|limit|resumekey|showresumekey)=', re.IGNORECASE)

USAGE = '\n'.join((
	'Usage: ia-cdx-search QUERY [RESUMEKEY]',
	'Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters',
	'The output, limit, resumeKey, and showResumeKey parameters are added by this script and must not be included.',
	'To resume a search that failed for some reason, provide the resumeKey through the second argument instead.',
	'Output is produced in JSONL format with one line per CDX entry.',
	'',
	'Examples:',
	" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\\.org(?::[0-9]*)?/'",
	" - Subdirectories: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\\.org(?::[0-9]*)?/[^/]*/'",
))


def parseArgs(argv):
	"""Extract (query, resumeKey) from a full argument vector.

	argv is expected in sys.argv form (program name first). Returns None when
	the usage text should be shown instead: wrong argument count, a help flag,
	or a QUERY that contains one of the reserved parameters. resumeKey is the
	empty string when no second argument was given.
	"""
	if not 2 <= len(argv) <= 3:
		return None
	query = argv[1]
	if query.lower() in ('-h', '--help') or RESERVED_PARAMS.search(query):
		return None
	# Index the single optional argument; slicing would hand back a list.
	resumeKey = argv[2] if len(argv) == 3 else ''
	return query, resumeKey


def buildUrl(query, resumeKey):
	"""Return the CDX request URL for query, continuing from resumeKey if it is non-empty.

	The key is appended verbatim.
	NOTE(review): this assumes the server hands out keys that are already URL-safe - confirm.
	"""
	url = f'{CDX_ENDPOINT}?{query}&output=json&limit=100&showResumeKey=true'
	if resumeKey:
		url = f'{url}&resumeKey={resumeKey}'
	return url


def fetchJson(url):
	"""GET url and return the decoded JSON body.

	Raises RuntimeError on a non-200 status. urlopen itself raises
	urllib.error.HTTPError / URLError for HTTP error statuses and connection
	failures, and json.load raises json.JSONDecodeError on a malformed body.
	"""
	req = urllib.request.Request(url)
	with urllib.request.urlopen(req) as r:
		if r.getcode() != 200:
			raise RuntimeError(f'Could not fetch {url}')
		return json.load(r)


def parsePage(o):
	"""Split one decoded CDX response into (entries, newResumeKey).

	o is a list whose first element is the list of field names, followed by
	the data rows. When more results are available the list ends with an empty
	row and a one-element row holding the resume key. entries is a list of
	dicts mapping field name to value; newResumeKey is None on the last page.

	Raises RuntimeError if the response is empty or a data row does not have
	one value per field.
	"""
	if not o:
		raise RuntimeError('got empty response')
	hasResumeKey = len(o) >= 3 and o[-2] == [] and len(o[-1]) == 1
	fields = o[0]
	rows = o[1:-2] if hasResumeKey else o[1:]
	if not all(len(row) == len(fields) for row in rows):
		raise RuntimeError('got unexpected response format')
	newResumeKey = o[-1][0] if hasResumeKey else None
	return [dict(zip(fields, row)) for row in rows], newResumeKey


def main(argv):
	"""Run the search described by argv and return the process exit status."""
	args = parseArgs(argv)
	if args is None:
		print(USAGE, file = sys.stderr)
		return 1
	query, resumeKey = args

	try:
		while True:
			url = buildUrl(query, resumeKey)
			print(f'GET {url}', file = sys.stderr)
			entries, newResumeKey = parsePage(fetchJson(url))
			for entry in entries:
				print(json.dumps(entry))
			if not newResumeKey:
				break
			# Advance only after the page has been printed in full, so that the
			# resume hint below never skips entries.
			resumeKey = newResumeKey
	except Exception:
		# Top-level boundary: tell the user how to continue, then let the
		# original error propagate unchanged.
		resumeKeyS = f' {shlex.quote(resumeKey)}' if resumeKey else ''
		print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)}{resumeKeyS}', file = sys.stderr)
		raise
	return 0


if __name__ == '__main__':
	sys.exit(main(sys.argv))
|