From 303bb69c37bbf0a0e93e95a192d09e48263d740d Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Wed, 10 Nov 2021 08:40:13 +0000
Subject: [PATCH] Add ia-cdx-search

---
 ia-cdx-search | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100755 ia-cdx-search

diff --git a/ia-cdx-search b/ia-cdx-search
new file mode 100755
index 0000000..772b1e1
--- /dev/null
+++ b/ia-cdx-search
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+import json
+import re
+import shlex
+import sys
+import urllib.request
+
+
+# Print usage and exit on a wrong argument count, on -h/--help, or when QUERY contains a parameter that this script sets itself.
+if not 2 <= len(sys.argv) <= 3 or sys.argv[1].lower() in ('-h', '--help') or re.search(r'(^|&)(output|limit|resumekey|showresumekey)=', sys.argv[1], re.IGNORECASE):
+	print('Usage: ia-cdx-search QUERY [RESUMEKEY]', file = sys.stderr)
+	print('Please refer to https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server for the relevant query parameters', file = sys.stderr)
+	print('The output, limit, resumeKey, and showResumeKey parameters are added by this script and must not be included.', file = sys.stderr)
+	print('To resume a search that failed for some reason, provide the resumeKey through the second argument instead.', file = sys.stderr)
+	print('Output is produced in JSONL format with one line per CDX entry.\n', file = sys.stderr)  # trailing \n yields the blank separator line
+	print('Examples:', file = sys.stderr)
+	print(r" - Subdomains: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/'", file = sys.stderr)  # raw string: \. is regex text, not a Python escape
+	print(r" - Subdirectories: ia-cdx-search 'url=example.org&collapse=urlkey&fl=original&matchType=domain&filter=original:^https?://[^/]*example\.org(?::[0-9]*)?/[^/]*/'", file = sys.stderr)
+	sys.exit(1)
+
+query = sys.argv[1]
+# sys.argv[2:] is a list; take its single element so the key is interpolated into the URL (and later shell-quoted) as a plain string.
+resumeKey = sys.argv[2] if len(sys.argv) > 2 else ''
+# output, limit and showResumeKey are appended here, which is why the usage check rejects them inside QUERY.
+baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}&output=json&limit=100&showResumeKey=true'
+url = f'{baseUrl}&resumeKey={resumeKey}' if resumeKey else baseUrl
+# Fetch page after page, printing one JSON object per data row, until a response no longer carries a resume key.
+try:
+	while True:
+		print(f'GET {url}', file = sys.stderr)
+		with urllib.request.urlopen(url) as r:
+			if r.getcode() != 200:
+				raise RuntimeError(f'Could not fetch {url}')
+			o = json.load(r)
+		assert o, 'got empty response'
+		# A response is treated as carrying a resume key when it ends with an empty row followed by a one-element row.
+		hasResumeKey = len(o) >= 3 and o[-2] == [] and len(o[-1]) == 1
+		fields, rows = o[0], o[1 : (-2 if hasResumeKey else None)]
+		assert all(len(v) == len(fields) for v in rows), 'got unexpected response format'
+		for row in rows:
+			print(json.dumps(dict(zip(fields, row))))
+		if not hasResumeKey or not o[-1][0]:
+			break
+		resumeKey = o[-1][0]  # remember the key of the page about to be fetched so the resume hint below reflects actual progress
+		url = f'{baseUrl}&resumeKey={resumeKey}'
+except (RuntimeError, json.JSONDecodeError, AssertionError, OSError):  # OSError also covers urllib.error.URLError/HTTPError and socket failures
+	resumeKeyS = f' {shlex.quote(resumeKey)}' if resumeKey else ''
+	print(f'To resume this search from where it crashed, run: ia-cdx-search {shlex.quote(query)}{resumeKeyS}', file = sys.stderr)
+	raise