From 628aeb052f760b9e8ff2d51955cca960f3af9c3d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 18 Nov 2021 03:36:29 +0000 Subject: [PATCH] Handle rate limiting --- ia-cdx-search | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ia-cdx-search b/ia-cdx-search index abe85b3..94579bd 100755 --- a/ia-cdx-search +++ b/ia-cdx-search @@ -6,6 +6,7 @@ import json import re import shlex import sys +import time HOST = 'web.archive.org' @@ -23,6 +24,11 @@ def fetch(url, tries, connection): r = connection.getresponse() status = r.status print(f'{status} {url}', file = sys.stderr) + if status == 302 and r.getheader('Location') in ('https://web.archive.org/429.html', '/429.html'): + # The CDX API is stupid and doesn't return 429s directly... + print('Exceeded rate limit, waiting...', file = sys.stderr) + time.sleep(30) + raise RuntimeError(f'Rate-limited on {url}') if status != 200: raise RuntimeError(f'Could not fetch {url}') data = r.read()