Преглед на файлове

Add indices for files

JustAnotherArchivist преди 1 година
променени са 3 файла, в които са добавени 102 реда и са изтрити 8 реда
  1. +69
  2. +22
  3. +11

+ 69
- 2
codearchiver/core.py Целия файл

@@ -2,8 +2,11 @@ import abc
import collections
#import codearchiver.modules # In get_module_class
import codearchiver.version
import contextlib
import dataclasses
import functools
import logging
import os
import queue
import requests
import time
@@ -59,13 +62,77 @@ class Result:
id: str
'''A unique ID for this result'''

files: list[str] = dataclasses.field(default_factory = list)
'''List of filenames produced by the run'''
files: list[tuple[str, typing.Optional['Index']]] = dataclasses.field(default_factory = list)
'''List of filenames produced by the run, optionally with an index'''

submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
'''List of related submodules and their results'''

class IndexValidationError(ValueError):
    '''Raised when an index contains unrecognised, missing, or wrongly repeated keys'''

@dataclasses.dataclass
class IndexField:
    '''Specification of a single key that may appear in an index'''

    key: str
    '''The key as it appears in index entries'''

    required: bool
    '''Whether the key must occur at least once for the index to validate'''

    repeatable: bool
    '''Whether the key may occur more than once in the same index'''

class Index(list[tuple[str, str]]):
    '''An index (key-value mapping, possibly with repeated keys) of a file produced by a module

    Entries are stored as `(key, value)` tuples in insertion order. Subclasses declare the
    permitted keys through the `fields` class attribute.
    '''

    # Quoted so that the class body does not need `IndexField` to be resolvable at definition time.
    fields: list['IndexField'] = []
    '''The fields for this index'''

    def append(self, *args):
        '''Append an entry, given either as one `(key, value)` tuple or as two positional arguments `key, value`'''

        if len(args) == 1:
            # A single argument is the entry tuple itself, matching `list.append`.
            args = args[0]
        return super().append(args)

    def validate(self):
        '''Check that all keys in the index conform to the specification in `fields`

        Raises `IndexValidationError` if a key is not declared, a required key is missing,
        or a key that is not repeatable occurs more than once.
        '''

        keyCounts = collections.Counter(key for key, _ in self)
        keys = set(keyCounts)
        fields = type(self).fields

        permittedKeys = {field.key for field in fields}
        unrecognisedKeys = keys - permittedKeys
        if unrecognisedKeys:
            raise IndexValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}')

        requiredKeys = {field.key for field in fields if field.required}
        missingRequiredKeys = requiredKeys - keys
        if missingRequiredKeys:
            raise IndexValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}')

        repeatableKeys = {field.key for field in fields if field.repeatable}
        repeatedKeys = {key for key, count in keyCounts.items() if count > 1}
        repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys
        if repeatedUnrepeatableKeys:
            raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')

    def serialise(self) -> str:
        '''Convert the index to a string suitable for e.g. a simple text file storage

        Each entry becomes one `key: value` line terminated by a newline.
        '''

        return ''.join(f'{key}: {value}\n' for key, value in self)

    @classmethod
    def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO]):
        '''Import a serialised index from a filename or file-like object

        A filename is opened (and closed again) here; a file-like object is read but left open
        for the caller. Raises `ValueError` on a line without a `': '` separator.
        '''

        if isinstance(f, (str, bytes, os.PathLike)):
            cm = open(f, 'r')
        else:
            # Already an open file: wrap it so that the `with` below does not close it.
            cm = contextlib.nullcontext(f)
        with cm as fp:
            o = cls()
            for line in fp:
                # Only strip the line terminator; a final line without one must keep its last character.
                line = line.removesuffix('\n')
                # Split on the first separator only so that values may themselves contain ': '.
                key, sep, value = line.partition(': ')
                if not sep:
                    raise ValueError(f'Malformed index line: {line!r}')
                o.append(key, value)
        return o

class HttpError(Exception):
'''An HTTP request failed too many times.'''

+ 22
- 1
codearchiver/modules/git.py Целия файл

@@ -1,6 +1,7 @@
import codearchiver.core
import codearchiver.subprocess
import datetime
import functools
import logging
import os.path
import shutil
@@ -10,6 +11,14 @@ import subprocess
logger = logging.getLogger(__name__)

class GitIndex(codearchiver.core.Index):
    '''Index of a Git bundle, listing its refs, its commits, and its root commits'''

    # Every key is required and may occur more than once.
    fields = [
        codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True),
        codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True),
        codearchiver.core.IndexField(key = 'Commit', required = True, repeatable = True),
    ]

class Git(codearchiver.core.Module):
name = 'git'

@@ -51,10 +60,22 @@ class Git(codearchiver.core.Module):
logger.info(f'Bundling into {bundle}')
codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--all'], cwd = directory)

logger.info(f'Collecting repository metadata for index')
_, refs = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--format=format:%H% P'], cwd = directory)

logger.info(f'Removing clone')

return codearchiver.core.Result(id = self._id, files = [bundle])
index = GitIndex()
for line in refs.splitlines():
index.append('Ref', line)
for commitHash, *parents in map(functools.partial(str.split, sep = ' '), commits.splitlines()):
index.append('Commit', commitHash)
if not parents:
index.append('Root commit', commitHash)

return codearchiver.core.Result(id = self._id, files = [(bundle, index)])

def __repr__(self):
    '''Return a constructor-like representation showing the input URL and the extra branches'''
    ownType = type(self)
    qualifiedName = f'{ownType.__module__}.{ownType.__name__}'
    return f'{qualifiedName}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'

+ 11
- 5
codearchiver/storage.py Целия файл

@@ -12,13 +12,13 @@ _logger = logging.getLogger(__name__)

class Storage(abc.ABC):
def put(self, filename: str):
'''Put a local file into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
def put(self, filename: str, index: typing.Optional[codearchiver.core.Index] = None):
'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''

def put_result(self, result: codearchiver.core.Result):
'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.'''
for fn in result.files:
for fn, index in result.files:
self.put(fn, index)
for _, subresult in result.submoduleResults:

@@ -43,10 +43,16 @@ class DirectoryStorage(Storage):
if not self._check_directory():

def put(self, filename):
def put(self, filename, index = None):
_logger.info(f'Moving {filename} to {self._directory}')
shutil.move(filename, self._directory)
if not index:
indexFilename = os.path.join(self._directory, f'{filename}.codearchiver-index')
_logger.info(f'Writing index for {filename} to {indexFilename}')
with open(indexFilename, 'x') as fp:

def open(self, filename):
