|
|
@@ -6,6 +6,7 @@ import json |
|
|
|
import re |
|
|
|
import shlex |
|
|
|
import sys |
|
|
|
import time |
|
|
|
|
|
|
|
|
|
|
|
# Hostname of the Internet Archive's web.archive.org service.
# NOTE(review): presumably the host that HTTP connections are opened against;
# its use is outside this excerpt -- confirm against the caller.
HOST = 'web.archive.org' |
|
|
@@ -23,6 +24,11 @@ def fetch(url, tries, connection): |
|
|
|
r = connection.getresponse() |
|
|
|
status = r.status |
|
|
|
print(f'{status} {url}', file = sys.stderr) |
|
|
|
if status == 302 and r.getheader('Location') in ('https://web.archive.org/429.html', '/429.html'): |
|
|
|
# The CDX API is stupid and doesn't return 429s directly... |
|
|
|
print('Exceeded rate limit, waiting...', file = sys.stderr) |
|
|
|
time.sleep(30) |
|
|
|
raise RuntimeError(f'Rate-limited on {url}') |
|
|
|
if status != 200: |
|
|
|
raise RuntimeError(f'Could not fetch {url}') |
|
|
|
data = r.read() |
|
|
|