|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390 |
- import qwarc.aiohttp
- from qwarc.const import *
- import qwarc.utils
- import qwarc.warc
-
-
- import aiohttp as _aiohttp
- if _aiohttp.__version__ != '2.3.10':
- raise ImportError('aiohttp must be version 2.3.10')
- import asyncio
- import collections
- import concurrent.futures
- import io
- import itertools
- import logging
- import os
- import random
- import sqlite3
- import urllib.parse
- import yarl
-
-
class Item:
    '''
    One unit of work (e.g. a single URL) processed by qwarc.

    Subclasses set itemType, implement process(), and optionally override
    generate() to seed the items database.
    '''

    # Item type identifier; stored in the `type` column of the items table.
    itemType = None

    def __init__(self, qwarcObj, itemValue, session, headers, warc):
        '''
        qwarcObj: the owning QWARC instance; used to flush subitems to the DB
        itemValue: str, this item's value (e.g. the URL to fetch)
        session: aiohttp ClientSession used for all requests made by this item
        headers: list of 2-tuples, default request headers prepended to every fetch
        warc: qwarc.warc.WARC object that client responses get written to
        '''
        self.qwarcObj = qwarcObj
        self.itemValue = itemValue
        self.session = session
        self.headers = headers
        self.warc = warc
        # Byte and request counters accumulated across all fetches of this item.
        self.stats = {'tx': 0, 'rx': 0, 'requests': 0}
        # LoggerAdapter so every log line carries the item type and value.
        self.logger = logging.LoggerAdapter(logging.getLogger(), {'itemType': self.itemType, 'itemValue': self.itemValue})

        # Subitems discovered while processing; flushed to the DB as new TODO items.
        self.childItems = []

    async def fetch(self, url, responseHandler = qwarc.utils.handle_response_default, method = 'GET', data = None, headers = [], verify_ssl = True, timeout = 60):
        '''
        HTTP GET, POST, or HEAD a URL, following redirects and retrying as directed by responseHandler.

        url: str or yarl.URL
        responseHandler: a callable that determines how the response is handled. See qwarc.utils.handle_response_default for details.
        method: str, must be 'GET', 'POST', or 'HEAD'
        data: dict or list/tuple of lists/tuples of length two or bytes or file-like or None, the data to be sent in the request body
        headers: list of 2-tuples, additional headers for this request only
        verify_ssl: bool, whether the server's TLS certificate is verified
        timeout: int or float, timeout in seconds applied to each individual request attempt

        Returns response (a ClientResponse object or None) and history (a tuple of (response, exception) tuples).
        response can be None and history can be an empty tuple, depending on the circumstances (e.g. timeouts).
        '''

        #TODO: Rewrite using 'async with self.session.get'

        url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
        assert method in ('GET', 'POST', 'HEAD'), 'method must be GET, POST, or HEAD'
        headers = self.headers + headers
        #TODO Deduplicate headers with later values overriding earlier ones
        history = []
        attempt = 0
        #TODO redirectLevel
        while True:
            attempt += 1
            response = None
            exc = None
            action = ACTION_RETRY
            writeToWarc = True
            try:
                try:
                    # aiohttp 2.x timeout context manager; covers the request and the body read below.
                    with _aiohttp.Timeout(timeout):
                        self.logger.info(f'Fetching {url}')
                        response = await self.session.request(method, url, data = data, headers = headers, allow_redirects = False, verify_ssl = verify_ssl)
                        try:
                            # Drain the full response body in 1 MiB chunks so the raw data is captured.
                            while True:
                                ret = await response.content.read(1048576)
                                if not ret:
                                    break
                        except:
                            # No calling the handleResponse callback here because this is really bad. The not-so-bad exceptions (e.g. an error during reading the response) will be caught further down.
                            response.close()
                            raise
                        else:
                            # rawRequestData/rawResponseData are provided by qwarc's custom ClientResponse
                            # (see qwarc.aiohttp); seeking to the end yields the tx/rx byte counts.
                            response.rawRequestData.seek(0, io.SEEK_END)
                            tx = response.rawRequestData.tell()
                            response.rawResponseData.seek(0, io.SEEK_END)
                            rx = response.rawResponseData.tell()
                            self.logger.info(f'Fetched {url}: {response.status} (tx {tx}, rx {rx})')
                            self.stats['tx'] += tx
                            self.stats['rx'] += rx
                            self.stats['requests'] += 1
                except (asyncio.TimeoutError, _aiohttp.ClientError) as e:
                    self.logger.warning(f'Request for {url} failed: {e!r}')
                    action, writeToWarc = await responseHandler(url, attempt, response, e)
                    exc = e # Pass the exception outward for the history
                else:
                    action, writeToWarc = await responseHandler(url, attempt, response, None)
                # Only successful (exception-free) responses are written to the WARC, and only if the handler agrees.
                if response and exc is None and writeToWarc:
                    self.warc.write_client_response(response)
                history.append((response, exc))
                retResponse = response if exc is None else None
                if action in (ACTION_SUCCESS, ACTION_IGNORE):
                    return retResponse, tuple(history)
                elif action == ACTION_FOLLOW_OR_SUCCESS:
                    # Follow a redirect if the server sent one; otherwise treat as success.
                    redirectUrl = response.headers.get('Location') or response.headers.get('URI')
                    if not redirectUrl:
                        return retResponse, tuple(history)
                    if any(56448 <= ord(c) <= 56575 for c in redirectUrl):
                        # Surrogate escape characters in the redirect URL, which usually means that the server sent non-ASCII data (e.g. ISO-8859-1).
                        # Revert the encoding, then percent-encode the non-ASCII bytes.
                        redirectUrl = urllib.parse.quote_from_bytes(redirectUrl.encode('utf8', 'surrogateescape'), safe = ''.join(chr(i) for i in range(128)))
                    url = url.join(yarl.URL(redirectUrl))
                    if response.status in (301, 302, 303) and method == 'POST':
                        # Redirected POST is refetched as a bodyless GET (browser-like behaviour).
                        method = 'GET'
                        data = None
                    # New target URL, so the retry counter starts over.
                    attempt = 0
                elif action == ACTION_RETRIES_EXCEEDED:
                    self.logger.error(f'Request for {url} failed {attempt} times')
                    return retResponse, tuple(history)
                elif action == ACTION_RETRY:
                    # Nothing to do, just go to the next cycle
                    pass
            finally:
                # Always release the connection back to the pool, even on retry/redirect.
                if response:
                    await response.release()

    async def process(self):
        '''Process this item; must be implemented by subclasses.'''
        raise NotImplementedError

    @classmethod
    def generate(cls):
        '''Yield the item values to seed the database with for this item class; override in subclasses.'''
        yield from () # Generate no items by default

    @classmethod
    def _gen(cls):
        # Wrap generate()'s values into (type, value, status) rows for the items table.
        for x in cls.generate():
            yield (cls.itemType, x, STATUS_TODO)

    def add_subitem(self, itemClassOrType, itemValue):
        '''
        Queue a new subitem (deduplicated against the local list) to be inserted into the DB on the next flush.

        itemClassOrType: an Item subclass, or an item type value for the else branch
        itemValue: str, the subitem's value
        '''
        # NOTE(review): issubclass raises TypeError when itemClassOrType is not a class
        # (e.g. a plain type string), so the else branch is only reachable for classes
        # that are not Item subclasses — confirm intended usage.
        if issubclass(itemClassOrType, Item):
            item = (itemClassOrType.itemType, itemValue)
        else:
            item = (itemClassOrType, itemValue)
        if item not in self.childItems:
            self.childItems.append(item)

    async def flush_subitems(self):
        '''Write the collected subitems to the database and clear the local list.'''
        await self.qwarcObj.flush_subitems(self)

    def clear_subitems(self):
        '''Discard the collected subitems without writing them to the database.'''
        self.childItems = []

    @classmethod
    def get_subclasses(cls):
        '''Recursively yield all (direct and indirect) subclasses of this class.'''
        for subclass in cls.__subclasses__():
            yield subclass
            yield from subclass.get_subclasses()
-
-
class QWARC:
    '''Drives a crawl: manages the items database, the aiohttp sessions, the WARC writer, and the task loop.'''

    def __init__(self, itemClasses, warcBasePath, dbPath, command, specFile, specDependencies, logFilename, concurrency = 1, memoryLimit = 0, minFreeDisk = 0, warcSizeLimit = 0, warcDedupe = False):
        '''
        itemClasses: iterable of Item
        warcBasePath: str, base name of the WARC files
        dbPath: str, path to the sqlite3 database file
        command: list, the command line used to invoke qwarc
        specFile: str, path to the spec file
        specDependencies: qwarc.utils.SpecDependencies
        logFilename: str, name of the log file written by this process
        concurrency: int, number of concurrently processed items
        memoryLimit: int, gracefully stop when the process uses more than memoryLimit bytes of RSS; 0 disables the memory check
        minFreeDisk: int, pause when there's less than minFreeDisk space on the partition where WARCs are written; 0 disables the disk space check
        warcSizeLimit: int, size of each WARC file; 0 if the WARCs should not be split
        warcDedupe: bool, whether to deduplicate records in the WARC files
        '''

        self._itemClasses = itemClasses
        # Map from itemType string to Item subclass, for instantiating items fetched from the DB.
        self._itemTypeMap = {cls.itemType: cls for cls in itemClasses}
        self._warcBasePath = warcBasePath
        self._dbPath = dbPath
        self._command = command
        self._specFile = specFile
        self._specDependencies = specDependencies
        self._logFilename = logFilename
        self._concurrency = concurrency
        self._memoryLimit = memoryLimit
        self._minFreeDisk = minFreeDisk
        self._warcSizeLimit = warcSizeLimit
        self._warcDedupe = warcDedupe

        self._reset_working_vars()

    def _reset_working_vars(self):
        # Working variables
        self._db = None
        self._tasks = set()
        self._sleepTasks = set()
        self._sessions = [] # aiohttp.ClientSession instances
        self._freeSessions = collections.deque() # ClientSession instances that are currently free
        self._warc = None

    async def obtain_exclusive_db_lock(self):
        '''Begin an EXCLUSIVE sqlite transaction, retrying every second while the DB is locked; returns the cursor.'''
        c = self._db.cursor()
        while True:
            try:
                c.execute('BEGIN EXCLUSIVE')
                break
            except sqlite3.OperationalError as e:
                # Any OperationalError other than lock contention is unexpected and re-raised.
                if str(e) != 'database is locked':
                    raise
                await asyncio.sleep(1)
        return c

    def _make_item(self, itemType, itemValue, session, headers):
        '''Instantiate the Item subclass registered for itemType; raises RuntimeError for unknown types.'''
        try:
            itemClass = self._itemTypeMap[itemType]
        except KeyError:
            raise RuntimeError(f'No such item type: {itemType!r}')
        return itemClass(self, itemValue, session, headers, self._warc)

    async def _wait_for_free_task(self):
        '''
        Wait until at least one running task finishes, then record the outcome of each finished task:
        update the item's DB status, insert its subitems, and return its session to the free pool.
        '''
        if not self._tasks:
            return
        done, pending = await asyncio.wait(self._tasks, return_when = concurrent.futures.FIRST_COMPLETED)
        for future in done:
            newStatus = STATUS_DONE
            # taskType/item/id/itemType/itemValue are custom attributes attached in run().
            if future.taskType == 'sleep':
                self._sleepTasks.remove(future)
            elif future.taskType == 'process':
                item = future.item
            # TODO Replace all of this with `if future.cancelled():`
            try:
                await future #TODO: Is this actually necessary? asyncio.wait only returns 'done' futures...
            except concurrent.futures.CancelledError as e:
                # Got cancelled, nothing we can do about it, but let's log a warning if it's a process task
                if future.taskType == 'process':
                    logging.error(f'Task for {future.itemType}:{future.itemValue} cancelled: {future!r}')
                    newStatus = STATUS_ERROR
            except Exception as e:
                if future.taskType == 'process':
                    logging.error(f'{future.itemType}:{future.itemValue} failed: {e!r} ({item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx)', exc_info = e)
                    newStatus = STATUS_ERROR
            else:
                if future.taskType == 'process':
                    logging.info(f'{future.itemType}:{future.itemValue} done: {item.stats["requests"]} requests, {item.stats["tx"]} tx, {item.stats["rx"]} rx')
            if future.taskType != 'process':
                continue
            # Persist the final status of the processed item.
            cursor = await self.obtain_exclusive_db_lock()
            try:
                cursor.execute('UPDATE items SET status = ? WHERE id = ?', (newStatus, future.id))
                cursor.execute('COMMIT')
            except:
                cursor.execute('ROLLBACK')
                raise
            await self._insert_subitems(item)
            self._freeSessions.append(item.session)
        self._tasks = pending

    async def _insert_subitems(self, item):
        '''Insert item.childItems into the DB as TODO rows, in batches of 100k; duplicates are ignored.'''
        cursor = await self.obtain_exclusive_db_lock()
        try:
            if item.childItems:
                it = iter(item.childItems)
                while True:
                    values = [(t, v, STATUS_TODO) for t, v in itertools.islice(it, 100000)]
                    if not values:
                        break
                    cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)
            cursor.execute('COMMIT')
        except:
            cursor.execute('ROLLBACK')
            raise

    async def run(self, loop):
        '''
        Main crawl loop: set up sessions, WARC, and DB, then repeatedly claim TODO items from
        the DB and process them until none remain or a shutdown condition (STOP file, memory
        limit) is reached.

        loop: the asyncio event loop to run on
        '''
        headers = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0')] #TODO: Move elsewhere

        # One session per concurrent slot, using qwarc's connector/request/response classes
        # so that raw request/response data is available for WARC writing.
        for i in range(self._concurrency):
            session = _aiohttp.ClientSession(
                connector = qwarc.aiohttp.TCPConnector(loop = loop),
                request_class = qwarc.aiohttp.ClientRequest,
                response_class = qwarc.aiohttp.ClientResponse,
                skip_auto_headers = ['Accept-Encoding'],
                loop = loop
            )
            self._sessions.append(session)
            self._freeSessions.append(session)

        self._warc = qwarc.warc.WARC(self._warcBasePath, self._warcSizeLimit, self._warcDedupe, self._command, self._specFile, self._specDependencies, self._logFilename)

        self._db = sqlite3.connect(self._dbPath, timeout = 1)
        self._db.isolation_level = None # Transactions are handled manually below.
        self._db.execute('PRAGMA synchronous = OFF')

        try:
            while True:
                # Keep at most `concurrency` tasks in flight.
                while len(self._tasks) >= self._concurrency:
                    await self._wait_for_free_task()

                if os.path.exists('STOP'):
                    logging.info('Gracefully shutting down due to STOP file')
                    break
                if self._memoryLimit and qwarc.utils.uses_too_much_memory(self._memoryLimit):
                    logging.info(f'Gracefully shutting down due to memory usage (current = {qwarc.utils.get_rss()} > limit = {self._memoryLimit})')
                    break

                if self._minFreeDisk and qwarc.utils.too_little_disk_space(self._minFreeDisk):
                    logging.info('Disk space is low, sleeping')
                    sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
                    sleepTask.taskType = 'sleep'
                    self._tasks.add(sleepTask)
                    self._sleepTasks.add(sleepTask)
                    continue

                # Claim one TODO item from the DB under an exclusive lock and mark it in progress.
                cursor = await self.obtain_exclusive_db_lock()
                try:
                    cursor.execute('SELECT id, type, value, status FROM items WHERE status = ? LIMIT 1', (STATUS_TODO,))
                    result = cursor.fetchone()
                    if not result:
                        if cursor.execute('SELECT id, status FROM items WHERE status != ? LIMIT 1', (STATUS_DONE,)).fetchone():
                            # There is currently no item to do, but there are still some in progress, so more TODOs may appear in the future.
                            # It would be nice if we could just await wait_for_free_task() here, but that doesn't work because those TODOs might be in another process.
                            # So instead, we insert a dummy task which just sleeps a bit. Average sleep time is equal to concurrency, i.e. one check per second.
                            #TODO: The average sleep time is too large if there are only few sleep tasks; scale with len(sleepTasks)/self._concurrency?
                            sleepTask = asyncio.ensure_future(asyncio.sleep(random.uniform(self._concurrency / 2, self._concurrency * 1.5)))
                            sleepTask.taskType = 'sleep'
                            self._tasks.add(sleepTask)
                            self._sleepTasks.add(sleepTask)
                            cursor.execute('COMMIT')
                            continue
                        else:
                            # Really nothing to do anymore
                            #TODO: Another process may be running create_db, in which case we'd still want to wait...
                            # create_db could insert a dummy item which is marked as done when the DB is ready
                            cursor.execute('COMMIT')
                            break
                    id, itemType, itemValue, status = result
                    cursor.execute('UPDATE items SET status = ? WHERE id = ?', (STATUS_INPROGRESS, id))
                    cursor.execute('COMMIT')
                except:
                    cursor.execute('ROLLBACK')
                    raise

                session = self._freeSessions.popleft()
                item = self._make_item(itemType, itemValue, session, headers)
                task = asyncio.ensure_future(item.process())
                #TODO: Is there a better way to add custom information to a task/coroutine object?
                task.taskType = 'process'
                task.id = id
                task.itemType = itemType
                task.itemValue = itemValue
                task.item = item
                self._tasks.add(task)

            # Graceful shutdown: cancel the sleepers, then drain the remaining process tasks.
            for sleepTask in self._sleepTasks:
                sleepTask.cancel()

            while len(self._tasks):
                await self._wait_for_free_task()

            logging.info('Done')
        except (Exception, KeyboardInterrupt) as e:
            # Kill all tasks
            for task in self._tasks:
                task.cancel()
            # NOTE(review): asyncio.wait raises ValueError if self._tasks is empty
            # (e.g. a failure before any task was created) — confirm this path.
            await asyncio.wait(self._tasks, return_when = concurrent.futures.ALL_COMPLETED)

            raise
        finally:
            for session in self._sessions:
                session.close()
            self._warc.close()
            self._db.close()

            self._reset_working_vars()

    async def flush_subitems(self, item):
        '''Insert an item's collected subitems into the DB, then clear them from the item.'''
        await self._insert_subitems(item)
        item.clear_subitems()

    def create_db(self):
        '''Create the items table and indices, then populate it from the item classes' generate() methods in 100k batches.'''
        db = sqlite3.connect(self._dbPath, timeout = 1)
        db.execute('PRAGMA synchronous = OFF')
        with db:
            db.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
            db.execute('CREATE INDEX items_status_idx ON items (status)')
            # The unique index provides the deduplication relied on by INSERT OR IGNORE.
            db.execute('CREATE UNIQUE INDEX items_type_value_idx ON items (type, value)')

        it = itertools.chain(*(i._gen() for i in self._itemClasses))
        while True:
            values = tuple(itertools.islice(it, 100000))
            if not values:
                break
            with db:
                db.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
|