'Index' was a misnomer from the start, since it contains critical information for the operation that cannot be reconstructed (e.g. existing refs).
tags/v1.0
@@ -64,37 +64,37 @@ class Result: | |||
id: str | |||
'''A unique ID for this result''' | |||
files: list[tuple[str, typing.Optional['Index']]] = dataclasses.field(default_factory = list) | |||
'''List of filenames produced by the run, optionally with an index''' | |||
files: list[tuple[str, typing.Optional['Metadata']]] = dataclasses.field(default_factory = list) | |||
'''List of filenames produced by the run, optionally with metadata''' | |||
submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list) | |||
'''List of related submodules and their results''' | |||
class IndexValidationError(ValueError): | |||
class MetadataValidationError(ValueError): | |||
pass | |||
@dataclasses.dataclass | |||
class IndexField: | |||
class MetadataField: | |||
key: str | |||
required: bool | |||
repeatable: bool | |||
class Index(list[tuple[str, str]]): | |||
'''An index (key-value mapping, possibly with repeated keys) of a file produced by a module''' | |||
class Metadata(list[tuple[str, str]]): | |||
'''Metadata (key-value mapping, possibly with repeated keys) of a file produced by a module''' | |||
fields: tuple[IndexField] = ( | |||
IndexField('codearchiver version', required = True, repeatable = False), | |||
IndexField('Module', required = True, repeatable = False), | |||
IndexField('ID', required = True, repeatable = False), | |||
IndexField('Input URL', required = True, repeatable = False), | |||
IndexField('Filename', required = True, repeatable = False), | |||
fields: tuple[MetadataField] = ( | |||
MetadataField('codearchiver version', required = True, repeatable = False), | |||
MetadataField('Module', required = True, repeatable = False), | |||
MetadataField('ID', required = True, repeatable = False), | |||
MetadataField('Input URL', required = True, repeatable = False), | |||
MetadataField('Filename', required = True, repeatable = False), | |||
) | |||
'''The fields for this index''' | |||
'''The fields for this metadata collection''' | |||
_allFieldsCache: typing.Optional[tuple[IndexField]] = None | |||
_allFieldsCache: typing.Optional[tuple[MetadataField]] = None | |||
def append(self, *args): | |||
if len(args) == 1: | |||
@@ -104,7 +104,7 @@ class Index(list[tuple[str, str]]): | |||
# This should be a @classmethod, too, but that's deprecated since Python 3.11. | |||
@property | |||
def _allFields(self): | |||
'''All fields known by this index, own ones and all from superclasses''' | |||
'''All fields known by this metadata collection, own ones and all from superclasses''' | |||
if type(self)._allFieldsCache is None: | |||
fields = [] | |||
@@ -114,7 +114,7 @@ class Index(list[tuple[str, str]]): | |||
return type(self)._allFieldsCache | |||
def validate(self): | |||
'''Check that all keys and values in the index conform to the specification''' | |||
'''Check that all keys and values conform to the specification''' | |||
keyCounts = collections.Counter(key for key, _ in self) | |||
keys = set(keyCounts) | |||
@@ -122,29 +122,29 @@ class Index(list[tuple[str, str]]): | |||
permittedKeys = set(field.key for field in self._allFields) | |||
unrecognisedKeys = keys - permittedKeys | |||
if unrecognisedKeys: | |||
raise IndexValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}') | |||
raise MetadataValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}') | |||
requiredKeys = set(field.key for field in self._allFields if field.required) | |||
missingRequiredKeys = requiredKeys - keys | |||
if missingRequiredKeys: | |||
raise IndexValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}') | |||
raise MetadataValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}') | |||
repeatableKeys = set(field.key for field in self._allFields if field.repeatable) | |||
repeatedKeys = set(key for key, count in keyCounts.items() if count > 1) | |||
repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys | |||
if repeatedUnrepeatableKeys: | |||
raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}') | |||
raise MetadataValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}') | |||
def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool: | |||
''' | |||
Check whether the criteria match this index | |||
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the index. | |||
Check whether the criteria match this metadata collection | |||
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the metadata. | |||
Multiple criteria may use the same key to perform an AND search. | |||
The index is a match if all criteria match. | |||
The metadata is a match if all criteria match. | |||
''' | |||
criteria = criteria.copy() | |||
_logger.debug(f'Searching index for {criteria!r}') | |||
_logger.debug(f'Searching metadata for {criteria!r}') | |||
keysOfInterest = set(key for key, _ in criteria) | |||
for key, value in self: | |||
if key not in keysOfInterest: | |||
@@ -169,14 +169,14 @@ class Index(list[tuple[str, str]]): | |||
return not bool(criteria) | |||
def serialise(self) -> str: | |||
'''Convert the index to a string suitable for e.g. a simple text file storage''' | |||
'''Convert the metadata to a string suitable for e.g. a simple text file storage''' | |||
self.validate() | |||
return ''.join(f'{key}: {value}\n' for key, value in self) | |||
@classmethod | |||
def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True): | |||
'''Import a serialised index from a filename or file-like object''' | |||
'''Import a serialised metadata from a filename or file-like object''' | |||
if isinstance(f, (str, bytes, os.PathLike)): | |||
cm = open(f, 'r') | |||
@@ -355,8 +355,8 @@ class Module(metaclass = ModuleMeta): | |||
name: typing.Optional[str] = None | |||
'''The name of the module. Modules without a name are ignored. Names must be unique and may only contain a-z and hyphens.''' | |||
IndexClass: typing.Optional[typing.Type[Index]] = None | |||
'''The Index class corresponding to this module, if any.''' | |||
MetadataClass: typing.Optional[typing.Type[Metadata]] = None | |||
'''The Metadata class corresponding to this module, if any.''' | |||
@staticmethod | |||
def matches(inputUrl: InputURL) -> bool: | |||
@@ -376,12 +376,12 @@ class Module(metaclass = ModuleMeta): | |||
def process(self) -> Result: | |||
'''Perform the relevant retrieval(s)''' | |||
def create_index(self, filename: str) -> Index: | |||
'''Create a basic Index instance appropriate for this module''' | |||
def create_metadata(self, filename: str) -> Metadata: | |||
'''Create a basic Metadata instance appropriate for this module''' | |||
if type(self).IndexClass is None or type(self).name is None: | |||
raise RuntimeError('Module lacks an IndexClass or a name; cannot create index') | |||
idx = type(self).IndexClass() | |||
if type(self).MetadataClass is None or type(self).name is None: | |||
raise RuntimeError('Module lacks an MetadataClass or a name; cannot create metadata') | |||
idx = type(self).MetadataClass() | |||
idx.append('codearchiver version', codearchiver.version.__version__) | |||
idx.append('Module', type(self).name) | |||
idx.append('ID', self._id) | |||
@@ -11,19 +11,19 @@ import subprocess | |||
_logger = logging.getLogger(__name__) | |||
class GitIndex(codearchiver.core.Index): | |||
class GitMetadata(codearchiver.core.Metadata): | |||
fields = ( | |||
codearchiver.core.IndexField(key = 'Git version', required = True, repeatable = False), | |||
codearchiver.core.IndexField(key = 'Based on bundle', required = False, repeatable = True), | |||
codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True), | |||
codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True), | |||
codearchiver.core.IndexField(key = 'Commit', required = False, repeatable = True), | |||
codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False), | |||
codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True), | |||
codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True), | |||
codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True), | |||
codearchiver.core.MetadataField(key = 'Commit', required = False, repeatable = True), | |||
) | |||
class Git(codearchiver.core.Module): | |||
name = 'git' | |||
IndexClass = GitIndex | |||
MetadataClass = GitMetadata | |||
@staticmethod | |||
def matches(inputUrl): | |||
@@ -63,7 +63,7 @@ class Git(codearchiver.core.Module): | |||
_logger.error(f'Failed to fetch {commit}') | |||
# This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored. | |||
_logger.info(f'Collecting repository metadata for index') | |||
_logger.info('Collecting repository metadata') | |||
_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) | |||
_, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory) | |||
commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines())) | |||
@@ -75,12 +75,12 @@ class Git(codearchiver.core.Module): | |||
oldCommits = {} # dict to keep the order reasonable | |||
basedOnBundles = {} # ditto | |||
if self._storage: | |||
for oldBundle in self._storage.search_indices([('Root commit', c) for c in rootCommits]): | |||
for oldBundle in self._storage.search_metadata([('Root commit', c) for c in rootCommits]): | |||
if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach? | |||
continue | |||
_logger.info(f'Previous bundle: {oldBundle!r}') | |||
with self._storage.open_index(oldBundle) as fp: | |||
idx = GitIndex.deserialise(fp) | |||
with self._storage.open_metadata(oldBundle) as fp: | |||
idx = GitMetadata.deserialise(fp) | |||
for key, value in idx: | |||
if key == 'Commit' and value in commitSet: | |||
oldCommits[value] = True | |||
@@ -106,19 +106,19 @@ class Git(codearchiver.core.Module): | |||
_logger.info(f'Removing clone') | |||
shutil.rmtree(directory) | |||
index = self.create_index(bundle) | |||
index.append('Git version', gitVersion) | |||
metadata = self.create_metadata(bundle) | |||
metadata.append('Git version', gitVersion) | |||
for oldBundle in basedOnBundles: | |||
index.append('Based on bundle', oldBundle) | |||
metadata.append('Based on bundle', oldBundle) | |||
for line in refs.splitlines(): | |||
index.append('Ref', line) | |||
metadata.append('Ref', line) | |||
for commitHash, *parents in commits: | |||
if commitHash not in oldCommits: | |||
index.append('Commit', commitHash) | |||
metadata.append('Commit', commitHash) | |||
if not parents: | |||
index.append('Root commit', commitHash) | |||
metadata.append('Root commit', commitHash) | |||
return codearchiver.core.Result(id = self._id, files = [(bundle, index)]) | |||
return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)]) | |||
def __repr__(self): | |||
return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})' |
@@ -14,28 +14,28 @@ _logger = logging.getLogger(__name__) | |||
class Storage(abc.ABC): | |||
@abc.abstractmethod | |||
def put(self, filename: str, index: typing.Optional['codearchiver.core.Index'] = None): | |||
'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.''' | |||
def put(self, filename: str, metadata: typing.Optional['codearchiver.core.Metadata'] = None): | |||
'''Put a local file and (if provided) its metadata into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.''' | |||
def put_result(self, result: 'codearchiver.core.Result'): | |||
'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.''' | |||
for fn, index in result.files: | |||
self.put(fn, index) | |||
for fn, metadata in result.files: | |||
self.put(fn, metadata) | |||
for _, subresult in result.submoduleResults: | |||
self.put_result(subresult) | |||
@abc.abstractmethod | |||
def search_indices(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]: | |||
def search_metadata(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]: | |||
''' | |||
Search all indices in storage by criteria. | |||
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in a file's index. | |||
Search all metadata in storage by criteria. | |||
Refer to `codearchiver.core.Metadata.matches` for the semantics of `criteria`. | |||
Yields all filenames where all criteria match. | |||
''' | |||
@abc.abstractmethod | |||
@contextlib.contextmanager | |||
def open_index(self, filename: str) -> typing.TextIO: | |||
'''Open the index for a file in serialised form.''' | |||
def open_metadata(self, filename: str) -> typing.TextIO: | |||
'''Open the metadata for a file in serialised form.''' | |||
@abc.abstractmethod | |||
@contextlib.contextmanager | |||
@@ -58,35 +58,35 @@ class DirectoryStorage(Storage): | |||
if not self._check_directory(): | |||
os.makedirs(self._directory) | |||
def put(self, filename, index = None): | |||
def put(self, filename, metadata = None): | |||
self._ensure_directory() | |||
#FIXME: Race condition | |||
if os.path.exists((targetFilename := os.path.join(self._directory, os.path.basename(filename)))): | |||
raise FileExistsError(f'{targetFilename} already exists') | |||
_logger.info(f'Moving {filename} to {self._directory}') | |||
shutil.move(filename, self._directory) | |||
if not index: | |||
if not metadata: | |||
return | |||
indexFilename = os.path.join(self._directory, f'{filename}.codearchiver-index') | |||
metadataFilename = os.path.join(self._directory, f'{filename}.codearchiver-metadata') | |||
# No need to check for existence here thanks to the 'x' mode | |||
_logger.info(f'Writing index for {filename} to {indexFilename}') | |||
with open(indexFilename, 'x') as fp: | |||
fp.write(index.serialise()) | |||
def search_indices(self, criteria): | |||
_logger.info(f'Searching indices by criteria: {criteria!r}') | |||
for indexFilename in glob.glob('*.codearchiver-index', root_dir = self._directory): | |||
_logger.info(f'Searching index {indexFilename}') | |||
with self.open(indexFilename, 'r') as fp: | |||
idx = codearchiver.core.Index.deserialise(fp, validate = False) | |||
_logger.info(f'Writing metadata for {filename} to {metadataFilename}') | |||
with open(metadataFilename, 'x') as fp: | |||
fp.write(metadata.serialise()) | |||
def search_metadata(self, criteria): | |||
_logger.info(f'Searching metadata by criteria: {criteria!r}') | |||
for metadataFilename in glob.glob('*.codearchiver-metadata', root_dir = self._directory): | |||
_logger.info(f'Searching metadata {metadataFilename}') | |||
with self.open(metadataFilename, 'r') as fp: | |||
idx = codearchiver.core.Metadata.deserialise(fp, validate = False) | |||
if idx.matches(criteria): | |||
_logger.info(f'Found index match {indexFilename}') | |||
yield indexFilename.rsplit('.', 1)[0] | |||
_logger.info('Done searching indices') | |||
_logger.info(f'Found metadata match {metadataFilename}') | |||
yield metadataFilename.rsplit('.', 1)[0] | |||
_logger.info('Done searching metadata') | |||
@contextlib.contextmanager | |||
def open_index(self, filename): | |||
with self.open(f'{filename}.codearchiver-index', 'r') as fp: | |||
def open_metadata(self, filename): | |||
with self.open(f'{filename}.codearchiver-metadata', 'r') as fp: | |||
yield fp | |||
@contextlib.contextmanager | |||