
Switch from urllib to http.client to reuse connections
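
urllib.request.urlopen() sets up a fresh connection for every request, while a single http.client.HTTPSConnection can serve many requests over one keep-alive connection. A minimal standalone sketch of that idea (not the script itself; the CDX query is just an example):

import http.client

conn = http.client.HTTPSConnection('web.archive.org', timeout = 60)
for page in range(3):
	# Each request reuses the same TCP/TLS connection instead of opening a new one.
	conn.request('GET', f'/cdx/search/cdx?url=example.org&output=json&page={page}')
	r = conn.getresponse()
	data = r.read()  # the body must be read in full before the connection can be reused
	print(r.status, len(data))
conn.close()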

master
JustAnotherArchivist 2 years ago
commit d3ea3ce8a0
1 changed file with 48 additions and 24 deletions
ia-cdx-search

@@ -1,53 +1,69 @@
 #!/usr/bin/env python3
 import asyncio
+import collections
+import http.client
 import json
 import re
 import shlex
 import sys
-import urllib.error
-import urllib.request
 
 
-def fetch(url, tries):
+HOST = 'web.archive.org'
+
+
+def make_connection():
+	return http.client.HTTPSConnection(HOST, timeout = 60)
+
+
+def fetch(url, tries, connection):
 	for i in range(tries):
 		try:
 			print(f'GET {url}', file = sys.stderr)
-			req = urllib.request.Request(url)
-			with urllib.request.urlopen(req) as r:
-				code = r.code
-				print(f'{code} {url}', file = sys.stderr)
-				if code != 200:
-					raise RuntimeError(f'Could not fetch {url}')
-				o = json.load(r)
-			break
-		except (RuntimeError, TimeoutError, urllib.error.URLError, json.JSONDecodeError) as e:
+			connection.request('GET', url)
+			r = connection.getresponse()
+			status = r.status
+			print(f'{status} {url}', file = sys.stderr)
+			if status != 200:
+				raise RuntimeError(f'Could not fetch {url}')
+			data = r.read()
+			print(f'Read {len(data)} bytes from {url}', file = sys.stderr)
+			o = json.loads(data)
+			break
+		except (RuntimeError, TimeoutError, http.client.HTTPException, json.JSONDecodeError) as e:
 			print(f'Error retrieving {url}: {type(e).__module__}.{type(e).__name__} {e!s}', file = sys.stderr)
+			connection.close()
+			connection = make_connection()
 			if i == tries - 1:
 				raise
-	return url, code, o
+	return url, status, o, connection
 
 
 async def wait_first_and_print(tasks):
 	if not tasks:
 		return
 	task = tasks.popleft()
-	url, code, o = await task
+	url, code, o, connection = await task
 	assert o, 'got empty response'
 	fields = o[0]
 	assert all(len(v) == len(fields) for v in o[1:]), 'got unexpected response format'
 	for row in o[1:]:
 		print(json.dumps(dict(zip(fields, row))))
 	print(f'Completed processing page {task._ia_cdx_page}', file = sys.stderr)
-	return task._ia_cdx_page
+	return task._ia_cdx_page, connection
 
 
 async def main(query, concurrency = 1, tries = 1, startPage = None, numPages = None):
 	assert (startPage is None) == (numPages is None)
-	baseUrl = f'https://web.archive.org/cdx/search/cdx?{query}'
+	connections = collections.deque()
+	for i in range(concurrency):
+		connections.append(make_connection())
+	baseUrl = f'/cdx/search/cdx?{query}'
 	if startPage is None:
 		url = f'{baseUrl}&showNumPages=true'
-		numPages = int(fetch(url, tries)[2])
+		connection = connections.popleft()
+		_, _, numPages, connection = fetch(url, tries, connection)
+		numPages = int(numPages)
+		connections.append(connection)
 		startPage = 0
 	print(f'{numPages} pages', file = sys.stderr)
 
@@ -58,19 +74,27 @@ async def main(query, concurrency = 1, tries = 1, startPage = None, numPages = None):
 		try:
 			for page in range(startPage, numPages):
 				while len(tasks) >= concurrency:
-					lastGoodPage = await wait_first_and_print(tasks)
+					lastGoodPage, connection = await wait_first_and_print(tasks)
+					connections.append(connection)
 				url = f'{baseUrl}&output=json&page={page}'
-				task = loop.run_in_executor(None, fetch, url, tries)
+				connection = connections.popleft()
+				task = loop.run_in_executor(None, fetch, url, tries, connection)
 				task._ia_cdx_page = page
 				tasks.append(task)
 			while len(tasks) > 0:
-				lastGoodPage = await wait_first_and_print(tasks)
+				lastGoodPage, connection = await wait_first_and_print(tasks)
+				connections.append(connection)
 		except:
 			# It isn't possible to actually cancel a task running in a thread, so need to await them and discard any additional errors that occur.
-			try:
-				await asyncio.gather(*tasks)
-			except:
-				pass
+			for task in tasks:
+				try:
+					_, _, _, connection = await task
+				except:
+					pass
+				else:
+					connections.append(connection)
+			for connection in connections:
+				connection.close()
 			raise
 	except (RuntimeError, json.JSONDecodeError, AssertionError):
 		concurrencyS = f'--concurrency {concurrency} ' if concurrency != 1 else ''
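
For the concurrent part, the commit pairs each in-flight task with the connection it borrowed, so the connection goes back into a deque once the task has been awaited. A simplified, hypothetical sketch of that borrow-and-return pattern (fetch_length() and the example.org paths stand in for the real fetch()):

import asyncio
import collections
import http.client

HOST = 'example.org'  # placeholder host

def fetch_length(path, connection):
	# Runs in a worker thread; returns the borrowed connection alongside the result.
	connection.request('GET', path)
	r = connection.getresponse()
	return len(r.read()), connection

async def run(paths, concurrency = 2):
	loop = asyncio.get_running_loop()
	# One persistent connection per concurrency slot.
	connections = collections.deque(http.client.HTTPSConnection(HOST, timeout = 60) for _ in range(concurrency))
	tasks = collections.deque()
	for path in paths:
		while len(tasks) >= concurrency:
			length, connection = await tasks.popleft()
			connections.append(connection)  # connection is free for the next task
			print(length)
		tasks.append(loop.run_in_executor(None, fetch_length, path, connections.popleft()))
	while tasks:
		length, connection = await tasks.popleft()
		connections.append(connection)
		print(length)
	for connection in connections:
		connection.close()

asyncio.run(run(['/a', '/b', '/c']))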

