#!/usr/bin/env python3
# Provenance: added in commit 303bb69c37bbf0a0e93e95a192d09e48263d740d
# ("Add ia-cdx-search") by JustAnotherArchivist, 2021-11-10.
"""Page through a Wayback Machine CDX search and print the results as JSONL.

Usage: ia-cdx-search QUERY [RESUMEKEY]

QUERY is the raw query string for the CDX server. The script appends the
output, limit, showResumeKey and (when needed) resumeKey parameters itself,
fetches 100 entries per request, and follows the resume key from each
response until the server stops returning one. Every CDX entry is written to
stdout as one JSON object; progress and errors go to stderr.
"""
import json
import re
import shlex
import sys
import urllib.error
import urllib.request


# Query parameters that this script controls itself and the user must not pass.
reservedParamPattern = re.compile(r'(^|&)(output|limit|resumekey|showresumekey)=', re.IGNORECASE)


def printUsage():
    """Print the usage text to stderr."""
    print('Usage: ia-cdx-search QUERY [RESUMEKEY]', file = sys.stderr)
    print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
    print('The output, limit, resumeKey, and showResumeKey parameters are added by this script and must not be included.', file = sys.stderr)
    print('To resume a search that failed for some reason, provide the resumeKey through the second argument instead.', file = sys.stderr)
    print('Output is produced in JSONL format with one line per CDX entry.', file = sys.stderr)
    print('', file = sys.stderr)
    print('Examples:', file = sys.stderr)
    # The backslash is doubled so the literal "\." is emitted without relying on an invalid escape sequence.
    print(" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\\.org(?::[0-9]*)?/'", file = sys.stderr)
    print(" - Subdirectories: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)


def buildUrl(query, resumeKey = ''):
    """Return the CDX request URL for *query*, continuing at *resumeKey* if it is non-empty.

    The resume key is inserted verbatim, exactly as it was received from the
    server or from the command line.
    """
    url = f'https://web.archive.org/cdx/search/cdx?{query}&output=json&limit=100&showResumeKey=true'
    if resumeKey:
        url += f'&resumeKey={resumeKey}'
    return url


def parsePage(o):
    """Split one decoded JSON response into ``(fields, rows, nextResumeKey)``.

    *o* is expected to be a list of rows whose first row holds the field
    names. A resume key is recognised when the response ends with an empty
    row followed by a single-element row; that element is returned as
    ``nextResumeKey``. Without such a trailer, ``nextResumeKey`` is ``None``.

    Raises RuntimeError if the response is empty or a data row does not have
    the same number of columns as the header. Explicit raises are used
    instead of ``assert`` so the checks survive ``python -O``.
    """
    if not o:
        raise RuntimeError('got empty response')
    hasResumeKey = len(o) >= 3 and o[-2] == [] and len(o[-1]) == 1
    fields = o[0]
    rows = o[1:-2] if hasResumeKey else o[1:]
    if any(len(row) != len(fields) for row in rows):
        raise RuntimeError('got unexpected response format')
    nextResumeKey = o[-1][0] if hasResumeKey else None
    return fields, rows, nextResumeKey


def main(argv):
    """Run the search described by *argv* (in ``sys.argv`` layout) and return the exit status.

    Returns 1 after printing the usage text when the arguments are invalid or
    help was requested, and 0 once all pages have been printed. Any failure
    while fetching or decoding a page is re-raised after a resume hint has
    been written to stderr.
    """
    if not 2 <= len(argv) <= 3 or argv[1].lower() in ('-h', '--help') or reservedParamPattern.search(argv[1]):
        printUsage()
        return 1

    query = argv[1]
    # Take the single optional argument itself; a slice would produce a list
    # and corrupt both the request URL and the resume hint.
    resumeKey = argv[2] if len(argv) == 3 else ''

    try:
        while True:
            url = buildUrl(query, resumeKey)
            print(f'GET {url}', file = sys.stderr)
            with urllib.request.urlopen(urllib.request.Request(url)) as r:
                if r.status != 200:
                    raise RuntimeError(f'Could not fetch {url}')
                o = json.load(r)
            fields, rows, nextResumeKey = parsePage(o)
            for row in rows:
                print(json.dumps(dict(zip(fields, row))))
            if not nextResumeKey:
                break
            # Only advance after the page was printed, so that on failure
            # resumeKey always identifies the page that still needs fetching.
            resumeKey = nextResumeKey
    except Exception:
        # Top-level boundary: urlopen reports HTTP and network errors as
        # urllib.error.URLError/HTTPError (and other OSErrors), so every
        # failure gets the hint before being re-raised unchanged.
        resumeKeyS = f' {shlex.quote(resumeKey)}' if resumeKey else ''
        print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)}{resumeKeyS}', file = sys.stderr)
        raise
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))