Browse Source

Refactor database creation and item generation: call `Item.generate()` on every qwarc run and dedupe its output, allowing the addition of further items by modifying the spec file

master
JustAnotherArchivist 3 years ago
parent
commit
6bdcfe71f0
2 changed files with 19 additions and 18 deletions
  1. qwarc/__init__.py (+19, -15)
  2. qwarc/cli.py (+0, -3)

qwarc/__init__.py (+19, -15) View File

@@ -152,11 +152,6 @@ class Item:
def generate(cls):
yield from () # Generate no items by default

@classmethod
def _gen(cls):
for x in cls.generate():
yield (cls.itemType, x, STATUS_TODO)

def add_subitem(self, itemClassOrType, itemValue):
if issubclass(itemClassOrType, Item):
item = (itemClassOrType.itemType, itemValue)
@@ -307,6 +302,18 @@ class QWARC:
self._db.isolation_level = None # Transactions are handled manually below.
self._db.execute('PRAGMA synchronous = OFF')

cursor = await self.obtain_exclusive_db_lock()
try:
cursor.execute('SELECT name FROM sqlite_master WHERE type = "table" AND name = "items"')
result = cursor.fetchone()
if not result:
self._create_db(cursor)
self._insert_generated_items(cursor)
cursor.execute('COMMIT')
except:
cursor.execute('ROLLBACK')
raise

try:
while True:
while len(self._tasks) >= self._concurrency:
@@ -393,18 +400,15 @@ class QWARC:
await self._insert_subitems(item)
item.clear_subitems()

def create_db(self):
db = sqlite3.connect(self._dbPath, timeout = 1)
db.execute('PRAGMA synchronous = OFF')
with db:
db.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
db.execute('CREATE INDEX items_status_idx ON items (status)')
db.execute('CREATE UNIQUE INDEX items_type_value_idx ON items (type, value)')
def _create_db(self, cursor):
cursor.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)')
cursor.execute('CREATE INDEX items_status_idx ON items (status)')
cursor.execute('CREATE UNIQUE INDEX items_type_value_idx ON items (type, value)')

it = itertools.chain(*(i._gen() for i in self._itemClasses))
def _insert_generated_items(self, cursor):
it = itertools.chain((cls.itemType, value, STATUS_TODO) for cls in self._itemClasses for value in cls.generate())
while True:
values = tuple(itertools.islice(it, 100000))
if not values:
break
with db:
db.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values)
cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values)

qwarc/cli.py (+0, -3) View File

@@ -78,9 +78,6 @@ def main():
warcSizeLimit = args.warcsplit,
warcDedupe = args.warcdedupe,
)
if not os.path.exists(args.database):
a.create_db()

loop = asyncio.get_event_loop()
try:
loop.run_until_complete(a.run(loop))


Loading…
Cancel
Save