#!/usr/bin/env python3
"""Page through an Internet Archive Wayback CDX search and print the results as JSONL.

Usage: ia-cdx-search QUERY [RESUMEKEY]

Result rows are written to stdout as one JSON object per line; progress
('GET <url>') and, on failure, a command line for resuming the search are
written to stderr.
"""
import http.client
import json
import re
import shlex
import sys
import urllib.request


# Query parameters that this script appends itself and that QUERY must therefore not contain.
_RESERVED_PARAMS = re.compile(r'(^|&)(output|limit|resumekey|showresumekey)=', re.IGNORECASE)

# Raw strings are used for the examples so that the regex backslashes are emitted literally
# without relying on Python's handling of unknown escape sequences.
_USAGE_LINES = (
	'Usage: ia-cdx-search QUERY [RESUMEKEY]',
	'Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters',
	'The output, limit, resumeKey, and showResumeKey parameters are added by this script and must not be included.',
	'To resume a search that failed for some reason, provide the resumeKey through the second argument instead.',
	'Output is produced in JSONL format with one line per CDX entry.',
	'',
	'Examples:',
	r" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/'",
	r" - Subdirectories: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'",
)


def _printUsage():
	"""Write the help text to stderr."""
	for line in _USAGE_LINES:
		print(line, file = sys.stderr)


def _fetchPage(url):
	"""Fetch `url` and return its body decoded as JSON.

	Raises RuntimeError if the response status is not 200; errors raised by
	urllib (OSError subclasses) and json.JSONDecodeError propagate to the caller.
	"""
	req = urllib.request.Request(url)
	with urllib.request.urlopen(req) as r:
		if r.getcode() != 200:
			raise RuntimeError(f'Could not fetch {url}')
		return json.load(r)


def _splitPage(page):
	"""Split one decoded response into (fields, rows, newResumeKey).

	The first entry of `page` is the list of field names. If the response ends
	with an empty list followed by a one-element list, that element is the key
	for the next page; otherwise newResumeKey is None.

	Raises AssertionError if the response is empty or a row does not have
	exactly one value per field.
	"""
	assert page, 'got empty response'
	hasResumeKey = len(page) >= 3 and page[-2] == [] and len(page[-1]) == 1
	fields = page[0]
	# Drop the two trailing marker entries when a resume key is present.
	rows = page[1:-2] if hasResumeKey else page[1:]
	assert all(len(row) == len(fields) for row in rows), 'got unexpected response format'
	newResumeKey = page[-1][0] if hasResumeKey else None
	return fields, rows, newResumeKey


def main(argv = None):
	"""Run the search described by `argv` and return the process exit status.

	`argv` is the full argument vector including the program name
	(defaults to sys.argv): [PROGRAM, QUERY] or [PROGRAM, QUERY, RESUMEKEY].

	Returns 1 after printing the usage text if the arguments are invalid, help
	was requested, or QUERY contains a parameter managed by this script;
	returns 0 once all pages have been printed.

	Network, HTTP, decoding and response-format errors are re-raised after a
	hint on how to resume from the failed page has been printed to stderr.
	"""
	if argv is None:
		argv = sys.argv
	if not 2 <= len(argv) <= 3 or argv[1].lower() in ('-h', '--help') or _RESERVED_PARAMS.search(argv[1]):
		_printUsage()
		return 1

	query = argv[1]
	# Take the single optional argument itself; slicing argv would yield a list
	# that ends up as "['key']" in the URL.
	resumeKey = argv[2] if len(argv) > 2 else ''
	baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}&output=json&limit=100&showResumeKey=true'
	try:
		while True:
			url = f'{baseUrl}&resumeKey={resumeKey}' if resumeKey else baseUrl
			print(f'GET {url}', file = sys.stderr)
			fields, rows, newResumeKey = _splitPage(_fetchPage(url))
			for row in rows:
				print(json.dumps(dict(zip(fields, row))))
			if not newResumeKey:
				break
			# Advance only after the page was printed completely, so that the
			# hint below always names the page that still has to be fetched.
			resumeKey = newResumeKey
	except (OSError, http.client.HTTPException, RuntimeError, json.JSONDecodeError, AssertionError):
		# OSError covers urllib's URLError/HTTPError as well as socket-level failures;
		# HTTPException covers truncated response bodies.
		resumeKeyS = f' {shlex.quote(resumeKey)}' if resumeKey else ''
		print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)}{resumeKeyS}', file = sys.stderr)
		raise
	return 0


if __name__ == '__main__':
	sys.exit(main())