# ===== file boundary: qwarc/__init__.py (inferred from imports/usage; diff hunk header and trailing "|" runs are diff-view rendering junk) =====
import qwarc.aiohttp | |||||
from qwarc.const import * | |||||
import qwarc.utils | |||||
import qwarc.warc | |||||
import aiohttp as _aiohttp | |||||
if _aiohttp.__version__ != '2.3.10': | |||||
raise ImportError('aiohttp must be version 2.3.10') | |||||
import asyncio | |||||
import collections | |||||
import concurrent.futures | |||||
import itertools | |||||
import logging | |||||
import os | |||||
import random | |||||
import sqlite3 | |||||
import yarl | |||||
class Item:
	'''A single unit of work: one item value processed via one or more HTTP requests.'''

	# Type tag stored in the items table; subclasses must override this with a unique string.
	itemType = None

	def __init__(self, itemValue, session, headers, warc):
		'''
		itemValue: str, the value identifying this item (e.g. a URL or an ID range)
		session: aiohttp.ClientSession used for this item's requests
		headers: list of (name, value) tuples sent with every request
		warc: qwarc.warc.WARC instance responses are written to
		'''
		self.itemValue = itemValue
		self.session = session
		self.headers = headers
		self.warc = warc
		self.stats = {'tx': 0, 'rx': 0, 'requests': 0}  # Byte/request counters, reported by QWARC.run when the item finishes
		self.childItems = []  # (itemType, itemValue) tuples queued as new TODO items when this item finishes

	async def fetch(self, url, responseHandler = None):
		'''
		HTTP GET a URL

		url: str or yarl.URL
		responseHandler: a callable that determines how the response is handled; defaults to qwarc.utils.handle_response_default (see there for details).

		Returns response (a ClientResponse object or None) and history (a tuple of (response, exception) tuples).
		response can be None and history can be an empty tuple, depending on the circumstances (e.g. timeouts).
		'''

		#TODO: Rewrite using 'async with self.session.get'

		if responseHandler is None:
			# Resolved at call time instead of at class-definition time so qwarc.utils does not need to be importable when this class is defined.
			responseHandler = qwarc.utils.handle_response_default

		url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
		history = []
		attempt = 0
		#TODO redirectLevel
		while True:
			attempt += 1
			response = None
			exc = None
			action = ACTION_RETRY
			writeToWarc = True
			try:
				try:
					with _aiohttp.Timeout(60):
						logging.info('Fetching {}'.format(url))
						response = await self.session.get(url, headers = self.headers, allow_redirects = False)
						try:
							ret = await response.text(errors = 'surrogateescape')
						except:
							# No calling the handleResponse callback here because this is really bad. The not-so-bad exceptions (e.g. an error during reading the response) will be caught further down.
							response.close()
							raise
						else:
							tx = len(response.rawRequestData)
							rx = len(response.rawResponseData)
							logging.info('Fetched {}: {} (tx {}, rx {})'.format(url, response.status, tx, rx))
							self.stats['tx'] += tx
							self.stats['rx'] += rx
							self.stats['requests'] += 1
				except (asyncio.TimeoutError, _aiohttp.ClientError) as e:
					logging.error('Request for {} failed: {!r}'.format(url, e))
					action, writeToWarc = await responseHandler(url, attempt, response, e)
					exc = e # Pass the exception outward for the history
				else:
					action, writeToWarc = await responseHandler(url, attempt, response, None)
				history.append((response, exc))
				if action in (ACTION_SUCCESS, ACTION_IGNORE):
					return response, tuple(history)
				elif action == ACTION_FOLLOW_OR_SUCCESS:
					redirectUrl = response.headers.get('Location') or response.headers.get('URI')
					if not redirectUrl:
						return response, tuple(history)
					url = url.join(yarl.URL(redirectUrl))
					attempt = 0
				elif action == ACTION_RETRY:
					# Nothing to do, just go to the next cycle
					pass
			finally:
				if response:
					if writeToWarc:
						self.warc.write_client_response(response)
					await response.release()

	async def process(self):
		'''Process this item; must be implemented by subclasses.'''
		raise NotImplementedError

	@classmethod
	def generate(cls):
		'''Yield the itemValues inserted into the database by create_db; subclasses override this.'''
		yield from () # Generate no items by default

	@classmethod
	def _gen(cls):
		# Wrap generate()'s values into (type, value, status) rows for the items table.
		for x in cls.generate():
			yield (cls.itemType, x, STATUS_TODO)

	def add_item(self, itemClassOrType, itemValue):
		'''Queue a child item, given either an Item subclass or a type string plus the item value. Duplicates are skipped.'''
		# Bug fix: issubclass() raises TypeError when its first argument is not a class,
		# so guard with isinstance(…, type) to support passing a plain type string.
		if isinstance(itemClassOrType, type) and issubclass(itemClassOrType, Item):
			item = (itemClassOrType.itemType, itemValue)
		else:
			item = (itemClassOrType, itemValue)
		if item not in self.childItems:
			self.childItems.append(item)
class QWARC:
	'''Drives a crawl: pulls items out of the sqlite database, processes them concurrently, and records the results.'''

	def __init__(self, itemClasses, warcBasePath, dbPath, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0):
		'''
		itemClasses: iterable of Item
		warcBasePath: str, base name of the WARC files
		dbPath: str, path to the sqlite3 database file
		concurrency: int, number of concurrently processed items
		memoryLimit: int, gracefully stop when the process uses more than memoryLimit bytes of RSS; 0 disables the memory check
		minFreeDisk: int, pause when there's less than minFreeDisk space on the partition where WARCs are written; 0 disables the disk space check
		warcSizeLimit: int, size of each WARC file; 0 if the WARCs should not be split
		'''
		self._itemClasses = itemClasses
		self._itemTypeMap = {cls.itemType: cls for cls in itemClasses}
		self._warcBasePath = warcBasePath
		self._dbPath = dbPath
		self._concurrency = concurrency
		self._memoryLimit = memoryLimit
		self._minFreeDisk = minFreeDisk
		self._warcSizeLimit = warcSizeLimit

	async def obtain_exclusive_db_lock(self, db):
		'''BEGIN EXCLUSIVE on db, retrying every second while another process holds the lock. Returns the cursor holding the open transaction.'''
		c = db.cursor()
		while True:
			try:
				c.execute('BEGIN EXCLUSIVE')
				break
			except sqlite3.OperationalError as e:
				if str(e) != 'database is locked':
					raise
				await asyncio.sleep(1)
		return c

	def _make_item(self, itemType, itemValue, session, headers, warc):
		'''Instantiate the Item subclass registered for itemType; raises RuntimeError for unknown types.'''
		try:
			itemClass = self._itemTypeMap[itemType]
		except KeyError:
			raise RuntimeError('No such item type: {!r}'.format(itemType))
		return itemClass(itemValue, session, headers, warc)

	async def run(self, loop):
		'''Process items until the database holds no more work, a STOP file appears, or the memory limit is exceeded.'''
		headers = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0')] #TODO: Move elsewhere

		tasks = set()
		sleepTasks = set()
		sessions = [] # aiohttp.ClientSession instances
		freeSessions = collections.deque() # ClientSession instances that are currently free

		for i in range(self._concurrency):
			session = _aiohttp.ClientSession(
				connector = qwarc.aiohttp.TCPConnector(loop = loop),
				request_class = qwarc.aiohttp.ClientRequest,
				response_class = qwarc.aiohttp.ClientResponse,
				skip_auto_headers = ['Accept-Encoding'],
				loop = loop
			)
			sessions.append(session)
			freeSessions.append(session)

		warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit)
		db = sqlite3.connect(self._dbPath, timeout = 1)
		db.isolation_level = None # Transactions are handled manually below.
		db.execute('PRAGMA synchronous = OFF')

		try:
			async def wait_for_free_task():
				# Wait for at least one task to finish, record its results in the DB, and release its session.
				nonlocal tasks
				done, pending = await asyncio.wait(tasks, return_when = concurrent.futures.FIRST_COMPLETED)
				for future in done:
					# TODO Replace all of this with `if future.cancelled():`
					try:
						await future #TODO: Is this actually necessary? asyncio.wait only returns 'done' futures...
					except concurrent.futures.CancelledError as e:
						# Got cancelled, nothing we can do about it, but let's log a warning if it's a process task
						if isinstance(future, asyncio.Task):
							# Bug fix: tasks are tagged 'process' below; the old check for 'process_item' never matched, so the warning never fired.
							if future.taskType == 'process':
								logging.warning('Task for {}:{} cancelled: {!r}'.format(future.itemType, future.itemValue, future))
							elif future.taskType == 'sleep':
								sleepTasks.remove(future)
						continue
					if future.taskType == 'sleep':
						# Dummy task for empty todo list or low disk space, see below.
						sleepTasks.remove(future)
						continue
					item = future.item
					logging.info('{itemType}:{itemValue} done: {requests} requests, {tx} tx, {rx} rx'.format(itemType = future.itemType, itemValue = future.itemValue, **item.stats))
					cursor = await self.obtain_exclusive_db_lock(db)
					try:
						cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_DONE, future.id))
						if item.childItems:
							# Insert in chunks of 100k rows to bound executemany's memory usage.
							it = iter(item.childItems)
							while True:
								values = [(t, v, STATUS_TODO) for t, v in itertools.islice(it, 100000)]
								if not values:
									break
								cursor.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
						cursor.execute('COMMIT')
					except:
						cursor.execute('ROLLBACK')
						raise
					freeSessions.append(item.session)
				tasks = pending

			while True:
				while len(tasks) >= self._concurrency:
					await wait_for_free_task()

				if self._minFreeDisk and qwarc.utils.too_little_disk_space(self._minFreeDisk):
					logging.info('Disk space is low, sleeping')
					# Bug fix: the coroutine must be wrapped in a Task here; otherwise asyncio.wait wraps it
					# itself and the taskType attribute set below is lost on the wrapping Task.
					sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
					sleepTask.taskType = 'sleep'
					tasks.add(sleepTask)
					sleepTasks.add(sleepTask)
					continue

				cursor = await self.obtain_exclusive_db_lock(db)
				try:
					cursor.execute('SELECT id, type, value, status FROM items WHERE status = ? LIMIT 1', (STATUS_TODO,))
					result = cursor.fetchone()
					if not result:
						if cursor.execute('SELECT id, status FROM items WHERE status != ? LIMIT 1', (STATUS_DONE,)).fetchone():
							# There is currently no item to do, but there are still some in progress, so more TODOs may appear in the future.
							# It would be nice if we could just await wait_for_free_task() here, but that doesn't work because those TODOs might be in another process.
							# So instead, we insert a dummy task which just sleeps a bit. Average sleep time is equal to concurrency, i.e. one check per second.
							#TODO: The average sleep time is too large if there are only few sleep tasks; scale with len(sleepTasks)/self._concurrency?
							sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
							sleepTask.taskType = 'sleep'
							tasks.add(sleepTask)
							sleepTasks.add(sleepTask)
							cursor.execute('COMMIT')
							continue
						else:
							# Really nothing to do anymore
							#TODO: Another process may be running create_db, in which case we'd still want to wait...
							# create_db could insert a dummy item which is marked as done when the DB is ready
							cursor.execute('COMMIT')
							break
					id, itemType, itemValue, status = result
					cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_INPROGRESS, id))
					cursor.execute('COMMIT')
				except:
					cursor.execute('ROLLBACK')
					raise

				session = freeSessions.popleft()
				item = self._make_item(itemType, itemValue, session, headers, warc)
				task = asyncio.ensure_future(item.process())
				#TODO: Is there a better way to add custom information to a task/coroutine object?
				task.taskType = 'process'
				task.id = id
				task.itemType = itemType
				task.itemValue = itemValue
				task.item = item
				tasks.add(task)

				if os.path.exists('STOP'):
					logging.info('Gracefully shutting down due to STOP file')
					break
				# Bug fix: these helpers live in qwarc.utils and were referenced unqualified before,
				# which raised a NameError since this module never imports them by name.
				if self._memoryLimit and qwarc.utils.uses_too_much_memory(self._memoryLimit):
					logging.info('Gracefully shutting down due to memory usage (current = {} > limit = {})'.format(qwarc.utils.get_rss(), self._memoryLimit))
					break

			for sleepTask in sleepTasks:
				sleepTask.cancel()
			while len(tasks):
				await wait_for_free_task()
			logging.info('Done')
		except (Exception, KeyboardInterrupt) as e:
			# Kill all tasks
			for task in tasks:
				task.cancel()
			if tasks:
				# Guard: asyncio.wait raises ValueError when called with an empty set.
				await asyncio.wait(tasks, return_when = concurrent.futures.ALL_COMPLETED)
			raise
		finally:
			for session in sessions:
				session.close()
			warc.close()
			db.close()

	def create_db(self):
		'''Create the items table and populate it from the generate() classmethods of all registered item classes.'''
		db = sqlite3.connect(self._dbPath, timeout = 1)
		db.execute('PRAGMA synchronous = OFF')
		with db:
			db.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
			db.execute('CREATE INDEX items_status_idx ON items (status)')

		it = itertools.chain(*(i._gen() for i in self._itemClasses))
		while True:
			# Insert in chunks of 100k rows to bound memory usage for large item sets.
			values = tuple(itertools.islice(it, 100000))
			if not values:
				break
			with db:
				db.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
		# Bug fix: the connection was previously left open (resource leak).
		db.close()
# ===== file boundary: qwarc/aiohttp.py (inferred from `import qwarc.aiohttp` above; diff hunk header removed) =====
import aiohttp | |||||
import aiohttp.client_proto | |||||
import aiohttp.connector | |||||
import functools | |||||
import itertools | |||||
import time | |||||
# aiohttp does not expose the raw data sent over the wire, so we need to get a bit creative... | |||||
# The ResponseHandler handles received data; the writes are done directly on the underlying transport. | |||||
# So ResponseHandler is replaced with a class which keeps all received data in a list, and the transport's write method is replaced with one which sends back all written data to the ResponseHandler. | |||||
# Because the ResponseHandler instance disappears when the connection is closed (ClientResponse.{_response_eof,close,release}), ClientResponse copies the references to the data objects in the ResponseHandler.
# aiohttp also does connection pooling/reuse, so ClientRequest resets the raw data when the request is sent. (This would not work with pipelining, but aiohttp does not support pipelining: https://github.com/aio-libs/aiohttp/issues/1740 ) | |||||
# This code has been developed for aiohttp version 2.3.10. | |||||
#TODO: THERE IS A MEMORY LEAK HERE SOMEWHERE! I spent a whole day trying to find it without success. | |||||
class RawData:
	'''Holds the raw bytes and first-byte timestamps of one HTTP request/response exchange on a connection.'''

	def __init__(self):
		# Timestamps of the first byte written/received; filled in lazily by the transport/protocol hooks.
		self.requestTimestamp = None
		self.responseTimestamp = None
		# Raw chunks in wire order; join them to reconstruct the full request or response bytes.
		self.requestData = []
		self.responseData = []
class ResponseHandler(aiohttp.client_proto.ResponseHandler):
	'''A ResponseHandler which additionally records all bytes received on the connection in a RawData instance.'''

	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)
		self.rawData = None  # RawData for the current exchange; (re)set via reset_raw_data before each request
		self.remoteAddress = None  # (host, port) tuple; set by TCPConnector once the connection is established

	def data_received(self, data):
		# In addition to aiohttp's normal processing, mirror every received chunk (and the time of the first one) into rawData.
		super().data_received(data)
		if not data:
			return
		if self.rawData.responseTimestamp is None:
			self.rawData.responseTimestamp = time.time()
		self.rawData.responseData.append(data)

	def reset_raw_data(self):
		'''Start a fresh RawData capture; called by ClientRequest.send before each request on this (possibly reused) connection.'''
		self.rawData = RawData()
def make_transport_write(transport, protocol):
	'''
	Build a replacement write method for transport.

	The returned (unbound) function mirrors every chunk written to the transport -- and the
	timestamp of the first write -- into protocol.rawData, then delegates to the transport's
	original write method, which is stashed on the transport as _real_write.
	'''
	transport._real_write = transport.write  # keep the original bound write for delegation

	def write(self, data):
		capture = protocol.rawData
		if capture.requestTimestamp is None:
			capture.requestTimestamp = time.time()
		capture.requestData.append(data)
		self._real_write(data)

	return write
class TCPConnector(aiohttp.connector.TCPConnector):
	'''A TCPConnector which installs the raw-data-recording ResponseHandler and wraps each transport's write method to capture sent data.'''

	def __init__(self, *args, loop = None, **kwargs):
		super().__init__(*args, loop = loop, **kwargs)
		# Replace aiohttp's protocol factory with our recording ResponseHandler.
		self._factory = functools.partial(ResponseHandler, loop = loop)

	async def _wrap_create_connection(self, protocolFactory, host, port, *args, **kwargs): #FIXME: Uses internal API
		transport, protocol = await super()._wrap_create_connection(protocolFactory, host, port, *args, **kwargs)
		# Bind the wrapping write function as a method on this transport instance; cf. https://stackoverflow.com/a/28127947 for the __get__ trick.
		transport.write = make_transport_write(transport, protocol).__get__(transport, type(transport))
		protocol.remoteAddress = (host, port)
		return (transport, protocol)
class ClientRequest(aiohttp.client_reqrep.ClientRequest):
	'''A ClientRequest which resets the connection's raw-data capture before sending, so each request records only its own bytes (needed because aiohttp reuses pooled connections).'''

	def send(self, connection):
		connection.protocol.reset_raw_data()
		return super().send(connection)
class ClientResponse(aiohttp.client_reqrep.ClientResponse):
	'''A ClientResponse which exposes the raw request/response bytes, their timestamps, and the remote address of the connection.'''

	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)
		# Copies of the protocol's RawData and remote address; taken in start() because the connection object disappears when the response is closed or released.
		self._rawData = None
		self._remoteAddress = None

	async def start(self, connection, readUntilEof):
		self._rawData = connection.protocol.rawData
		self._remoteAddress = connection.protocol.remoteAddress
		return (await super().start(connection, readUntilEof))

	@property
	def rawRequestTimestamp(self):
		'''Timestamp (time.time()) of the first request byte written, or None if nothing was written'''
		return self._rawData.requestTimestamp

	@property
	def rawRequestData(self):
		'''The raw bytes sent over the wire for this request'''
		return b''.join(self._rawData.requestData)

	@property
	def rawResponseTimestamp(self):
		'''Timestamp (time.time()) of the first response byte received, or None if nothing was received'''
		return self._rawData.responseTimestamp

	@property
	def rawResponseData(self):
		'''The raw bytes received over the wire for this response'''
		return b''.join(self._rawData.responseData)

	@property
	def remoteAddress(self):
		'''(host, port) tuple of the server this response came from'''
		return self._remoteAddress

	def set_history(self, history):
		self._history = history #FIXME: Uses private attribute of aiohttp.client_reqrep.ClientResponse

	def iter_all(self):
		'''Iterate over every response in the redirect history, ending with this response itself.'''
		return itertools.chain(self.history, (self,))

	async def release(self):
		if not self.closed:
			# NOTE(review): reset_raw_data is defined on the protocol (ResponseHandler), not on the Connection;
			# presumably Connection delegates attribute access in aiohttp 2.3.10 — confirm against that version.
			self.connection.reset_raw_data()
		await super().release()
# ===== file boundary: qwarc/cli.py (inferred from the `qwarc.cli:main` entry point in setup.py; diff hunk header removed) =====
import argparse | |||||
import asyncio | |||||
import importlib.util | |||||
import logging | |||||
import os.path | |||||
import qwarc | |||||
import sys | |||||
import time | |||||
def setup_logging(logFilename):
	'''Configure the root logger to emit UTC-timestamped INFO-level messages to both logFilename and stderr.'''
	formatter = logging.Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
	formatter.converter = time.gmtime  # timestamps in UTC, matching the 'Z' suffix in the format string
	rootLogger = logging.getLogger()
	rootLogger.handlers = []  # drop any previously configured handlers
	rootLogger.setLevel(logging.INFO)
	# File handler first, then stderr handler -- same order as before.
	for handler in (logging.FileHandler(logFilename), logging.StreamHandler()):
		handler.setFormatter(formatter)
		rootLogger.addHandler(handler)
def check_files(specFilename, logFilename):
	'''
	Verify the preconditions for a run: the spec file exists, the log file does not, and no STOP file is present in the working directory.

	Returns True if all checks pass, False otherwise (with error messages printed to stderr).
	'''
	success = True
	if not os.path.isfile(specFilename):
		# Bug fix: the filename was never substituted into the message (missing .format call).
		print('Error: "{}" does not exist or is not a regular file'.format(specFilename), file = sys.stderr)
		success = False
	if os.path.exists(logFilename):
		print('Error: "{}" already exists'.format(logFilename), file = sys.stderr)
		success = False
	if os.path.exists('STOP'):
		print('Error: "STOP" exists', file = sys.stderr)
		success = False
	return success
def main():
	'''CLI entry point: parse arguments, load the spec file, create the DB if needed, and run the crawl.'''
	parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
	parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
	parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
	parser.add_argument('--concurrency', type = int, default = 1)
	# Bug fix: type = int on the three limits; without it, user-supplied values stayed strings
	# (only the defaults were ints) and broke the numeric comparisons in qwarc.QWARC.run.
	parser.add_argument('--memorylimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes memory is free; disable if 0', default = 0)
	parser.add_argument('--disklimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes disk space is free; disable if 0', default = 0)
	parser.add_argument('--warcsplit', metavar = 'SIZE', type = int, help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
	parser.add_argument('specfile')

	args = parser.parse_args()

	if not check_files(args.specfile, args.log):
		sys.exit(1)

	setup_logging(args.log)

	# Execute the spec file as a module; its Item subclasses are picked up below via Item.__subclasses__().
	spec = importlib.util.spec_from_file_location('spec', args.specfile)
	specMod = importlib.util.module_from_spec(spec)
	spec.loader.exec_module(specMod)

	a = qwarc.QWARC(
		itemClasses = qwarc.Item.__subclasses__(),
		warcBasePath = args.warc,
		dbPath = args.database,
		concurrency = args.concurrency,
		memoryLimit = args.memorylimit,
		minFreeDisk = args.disklimit,
		warcSizeLimit = args.warcsplit,
	)

	if not os.path.exists(args.database):
		a.create_db()

	loop = asyncio.get_event_loop()
	try:
		loop.run_until_complete(a.run(loop))
	except (Exception, KeyboardInterrupt) as e:
		logging.exception('Unhandled error')
	finally:
		# Close the loop even when run() raised, so pending transports get cleaned up.
		loop.close()
# ===== file boundary: qwarc/const.py (inferred from `from qwarc.const import *`; diff hunk header removed) =====
# Item statuses (the `status` column of the items table)

STATUS_TODO = 0
'''Status of an item that has not been processed yet'''

STATUS_INPROGRESS = 1
'''Status of an item that is currently being processed'''

STATUS_DONE = 2
'''Status of an item that has been processed'''

#TODO: Add a STATUS_ERROR?

# Response handler actions (returned by responseHandler callables; cf. qwarc.utils.handle_response_default)

ACTION_SUCCESS = 0
'''Treat this response as a success'''

ACTION_IGNORE = 1 #TODO Replace with ACTION_SUCCESS since it's really the same thing.
'''Ignore this response'''

ACTION_RETRY = 2
'''Retry the same request'''

ACTION_FOLLOW_OR_SUCCESS = 3
'''If the response contains a Location or URI header, follow it. Otherwise, treat it as a success.'''
#TODO: Rename to ACTION_FOLLOW maybe? However, the current name makes it more clear what qwarc does when there's a redirect without a redirect target...
# ===== file boundary: qwarc/utils.py (inferred from `import qwarc.utils`; diff hunk header removed) =====
from qwarc.const import *
import aiohttp
import asyncio
import os
# Size of a memory page in bytes; /proc/self/statm reports sizes in pages (Linux-specific).
PAGESIZE = os.sysconf('SC_PAGE_SIZE')

def get_rss():
	'''Get the current RSS of this process in bytes'''
	# The second field of statm is the resident set size in pages.
	with open('/proc/self/statm', 'r') as fp:
		return int(fp.readline().split()[1]) * PAGESIZE
def get_disk_free():
	'''Get the current free disk space on the relevant partition in bytes'''
	# Free space available to unprivileged processes on the partition of the working directory.
	stats = os.statvfs('.')
	return stats.f_frsize * stats.f_bavail
def uses_too_much_memory(limit):
	'''
	Check whether the process is using too much memory

	For performance reasons, this actually only checks the memory usage on every 100th call.
	'''

	uses_too_much_memory.callCounter += 1
	# Only check every hundredth call
	if uses_too_much_memory.callCounter % 100 == 0 and get_rss() > limit:
		return True
	return False
# Function attribute used as persistent call counter across invocations (shared by all callers).
uses_too_much_memory.callCounter = 0
def too_little_disk_space(limit):
	'''
	Check whether the disk space is too small

	For performance reasons, this actually only checks the free disk space on every 100th call.
	'''

	too_little_disk_space.callCounter += 1
	# Re-evaluate only on every hundredth call; in between, the last result is returned unchanged.
	if too_little_disk_space.callCounter % 100 == 0:
		too_little_disk_space.currentResult = (get_disk_free() < limit)
	return too_little_disk_space.currentResult
# Persistent state kept as function attributes: call counter and the cached result of the last real check.
too_little_disk_space.callCounter = 0
too_little_disk_space.currentResult = False
# https://stackoverflow.com/a/4665027
def find_all(aStr, sub):
	'''Generator yielding the start positions of every non-overlapping occurrence of sub in aStr.'''
	pos = aStr.find(sub)
	while pos != -1:
		yield pos
		pos = aStr.find(sub, pos + len(sub))

def str_get_between(aStr, a, b):
	'''Get the string after the first occurrence of a in aStr and the first occurrence of b after that of a, or None if there is no such string.'''
	aPos = aStr.find(a)
	if aPos >= 0:
		innerStart = aPos + len(a)
		bPos = aStr.find(b, innerStart)
		if bPos >= 0:
			return aStr[innerStart:bPos]
	return None

def maybe_str_get_between(x, a, b):
	'''Like str_get_between, but returns None if x evaluates to False and converts it to a str before matching.'''
	return str_get_between(str(x), a, b) if x else None

def str_get_all_between(aStr, a, b):
	'''Generator yielding every string between occurrences of a in aStr and the following occurrence of b.'''
	#TODO: This produces half-overlapping matches: str_get_all_between('aabc', 'a', 'c') will yield 'ab' and 'b'.
	# Might need to implement sending an offset to the find_all generator to work around this, or discard aOffset values which are smaller than the previous bPos+len(b).
	for aPos in find_all(aStr, a):
		innerStart = aPos + len(a)
		bPos = aStr.find(b, innerStart)
		if bPos != -1:
			yield aStr[innerStart:bPos]

def maybe_str_get_all_between(x, a, b):
	'''Like str_get_all_between, but yields no elements if x evaluates to False and converts x to a str before matching.'''
	if x:
		yield from str_get_all_between(str(x), a, b)
def generate_range_items(start, stop, step):
	'''
	Generator for items of `step` size between `start` and `stop` (inclusive)

	Yields strings of the form `'a-b'` where `a` and `b` are integers such that `b - a + 1 == step`, `min(a) == start`, and `max(b) == stop`.
	`b - a + 1` may be unequal to `step` on the last item if `(stop - start + 1) % step != 0` (see examples below).
	Note that `a` and `b` can be equal on the last item if `(stop - start) % step == 0` (see examples below).

	Examples:
	- generate_range_items(0, 99, 10) yields '0-9', '10-19', '20-29', ..., '90-99'
	- generate_range_items(0, 42, 10): '0-9', '10-19', '20-29', '30-39', '40-42'
	- generate_range_items(0, 20, 10): '0-9', '10-19', '20-20'
	'''
	for lower in range(start, stop + 1, step):
		# Clamp the upper bound of the last range to stop.
		upper = min(lower + step - 1, stop)
		yield '{}-{}'.format(lower, upper)
async def handle_response_default(url, attempt, response, exc):
	'''
	The default response handler, which behaves as follows:
	- If there is no response (e.g. timeout error), retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 401, 403, 404, 405, or 410, treat it as a permanent error and return.
	- If there was any exception and it is a asyncio.TimeoutError or a aiohttp.ClientError, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.
	- If the response has any of the status codes 200, 204, 206, or 304, treat it as a success and return.
	- If the response has any of the status codes 301, 302, 303, 307, or 308, follow the redirect target if specified or return otherwise.
	- Otherwise, treat as a potentially temporary error and retry the retrieval after a delay of 5 seconds.

	- All responses are written to WARC by default.

	Note that this handler does not limit the number of retries on errors.

	Parameters: url (yarl.URL instance), attempt (int), response (aiohttp.ClientResponse or None), exc (Exception or None)
	At least one of response and exc is not None.
	Returns: (one of the qwarc.const.ACTION_* constants, bool signifying whether to write to WARC or not)
	'''
	#TODO: Document that `attempt` is reset on redirects
	if response is None:
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (401, 403, 404, 405, 410):
		return ACTION_IGNORE, True
	# Bug fix: this module imports aiohttp as `aiohttp`, not `_aiohttp`; the old name raised a NameError whenever this branch was reached.
	if exc is not None and isinstance(exc, (asyncio.TimeoutError, aiohttp.ClientError)):
		await asyncio.sleep(5)
		return ACTION_RETRY, True
	if response.status in (200, 204, 206, 304):
		return ACTION_SUCCESS, True
	if response.status in (301, 302, 303, 307, 308):
		return ACTION_FOLLOW_OR_SUCCESS, True
	await asyncio.sleep(5)
	return ACTION_RETRY, True
async def handle_response_ignore_redirects(url, attempt, response, exc):
	'''A response handler that does not follow redirects, i.e. treats them as a success instead. It behaves as handle_response_default otherwise.'''
	verdict, writeToWarc = await handle_response_default(url, attempt, response, exc)
	# Downgrade a would-be redirect follow into a plain success.
	return (ACTION_SUCCESS if verdict == ACTION_FOLLOW_OR_SUCCESS else verdict), writeToWarc
def handle_response_limit_error_retries(maxRetries):
	'''A response handler that limits the number of retries on errors. It behaves as handle_response_default otherwise.

	Technically, this is actually a response handler factory. This is so that the intuitive use works: fetch(..., responseHandler = handle_response_limit_error_retries(5))
	If you use the same limit many times, you should keep the return value (the response handler) of this method and reuse it to avoid creating a new function every time.
	'''
	async def handler(url, attempt, response, exc):
		verdict, writeToWarc = await handle_response_default(url, attempt, response, exc)
		if verdict == ACTION_RETRY and attempt > maxRetries:
			# Retry budget exhausted; give up on this URL as if it were a permanent error.
			verdict = ACTION_IGNORE
		return verdict, writeToWarc
	return handler
# ===== file boundary: qwarc/warc.py (inferred from `import qwarc.warc`; diff hunk header removed) =====
import fcntl | |||||
import io | |||||
import logging | |||||
import time | |||||
import warcio | |||||
class WARCWriter(warcio.warcwriter.WARCWriter):
	'''A WARCWriter which writes the request record before the corresponding response record.'''

	def _do_write_req_resp(self, req, resp, params): #FIXME: Internal API
		# Write request before response, like wget and wpull; cf. https://github.com/webrecorder/warcio/issues/20
		self._write_warc_record(self.out, req)
		self._write_warc_record(self.out, resp)
class WARC:
	'''Writes request/response record pairs to a series of gzipped WARC files, optionally splitting when a maximum file size is exceeded.'''

	def __init__(self, prefix, maxFileSize):
		'''
		Initialise the WARC writer

		prefix: str, path prefix for WARCs; a dash, a five-digit number, and ".warc.gz" will be appended.
		maxFileSize: int, maximum size of an individual WARC. Use 0 to disable splitting.
		'''
		self._prefix = prefix
		self._counter = 0  # number appended to the prefix; advanced on cycles and on filename collisions
		self._maxFileSize = maxFileSize
		self._closed = True  # set before _cycle so that the initial close() in _cycle is a no-op
		self._file = None
		self._warcWriter = None
		self._cycle()

	def _cycle(self):
		'''Close the current file, open the next file that doesn't exist yet'''
		#TODO: This opens a new file also at the end, which can result in empty WARCs. Should try to reorder this to only open a WARC when writing a record, and to only close the current WARC if the size is exceeded after write_client_response.
		self.close()
		while True:
			filename = '{}-{:05d}.warc.gz'.format(self._prefix, self._counter)
			try:
				# Try to open the file for writing, requiring that it does not exist yet, and attempt to get an exclusive, non-blocking lock on it
				self._file = open(filename, 'xb')
				fcntl.flock(self._file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
			except FileExistsError:
				logging.info('{} already exists, skipping'.format(filename))
				self._counter += 1
			else:
				break
		logging.info('Opened {}'.format(filename))
		self._warcWriter = WARCWriter(self._file, gzip = True)
		self._closed = False
		self._counter += 1

	def write_client_response(self, response):
		'''
		Write the requests and responses stored in a ClientResponse instance to the currently opened WARC.

		A new WARC will be started automatically if the size of the current file exceeds the limit after writing all requests and responses from this `response` to the current WARC.
		'''
		# iter_all yields the whole redirect history followed by the final response.
		for r in response.iter_all():
			requestDate = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(r.rawRequestTimestamp))
			requestRecord = self._warcWriter.create_warc_record(
				str(r.url),
				'request',
				payload = io.BytesIO(r.rawRequestData),
				warc_headers_dict = {
					'WARC-Date': requestDate,
					'WARC-IP-Address': r.remoteAddress[0],
				}
			)
			responseRecord = self._warcWriter.create_warc_record(
				str(r.url),
				'response',
				payload = io.BytesIO(r.rawResponseData),
				warc_headers_dict = {
					# NOTE(review): the response record reuses the request timestamp — presumably intentional so both records of a pair share one WARC-Date; confirm.
					'WARC-Date': requestDate,
					'WARC-IP-Address': r.remoteAddress[0],
				}
			)
			self._warcWriter.write_request_response_pair(requestRecord, responseRecord)

		if self._maxFileSize and self._file.tell() > self._maxFileSize:
			self._cycle()

	def close(self):
		'''Close the currently opened WARC'''
		if not self._closed:
			self._file.close()
			self._warcWriter = None
			self._file = None
			self._closed = True
# ===== file boundary: setup.py (inferred from the setuptools.setup call; diff hunk header removed) =====
import setuptools

# Packaging metadata for qwarc.
# aiohttp is pinned to exactly 2.3.10 because qwarc.aiohttp builds on that version's internal APIs
# (cf. the hard version check in qwarc/__init__.py, which raises ImportError on any other version).
setuptools.setup(
	name = 'qwarc',
	version = '0.0-dev',
	description = 'A framework for quick web archival',
	author = 'JustAnotherArchivist',
	url = 'https://github.com/JustAnotherArchivist/qwarc',
	classifiers = [
		'Development Status :: 3 - Alpha',
		'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
		'Programming Language :: Python :: 3.6',
	],
	packages = ['qwarc'],
	install_requires = ['aiohttp==2.3.10', 'warcio', 'yarl'],
	entry_points = {
		'console_scripts': [
			'qwarc = qwarc.cli:main',
		],
	},
)