From fa4b60225c26ab4da1709eda5b47dde8d58caa04 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 10 Mar 2023 01:16:25 +0000 Subject: [PATCH] =?UTF-8?q?Index=20=E2=86=92=20Metadata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'Index' was a misnomer from the start since it contains critical information for the operation that can't be reconstructed (e.g. existing refs). --- codearchiver/core.py | 64 ++++++++++++++++++------------------- codearchiver/modules/git.py | 36 ++++++++++----------- codearchiver/storage.py | 54 +++++++++++++++---------------- 3 files changed, 77 insertions(+), 77 deletions(-) diff --git a/codearchiver/core.py b/codearchiver/core.py index e3f4c00..30047cd 100644 --- a/codearchiver/core.py +++ b/codearchiver/core.py @@ -64,37 +64,37 @@ class Result: id: str '''A unique ID for this result''' - files: list[tuple[str, typing.Optional['Index']]] = dataclasses.field(default_factory = list) - '''List of filenames produced by the run, optionally with an index''' + files: list[tuple[str, typing.Optional['Metadata']]] = dataclasses.field(default_factory = list) + '''List of filenames produced by the run, optionally with metadata''' submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list) '''List of related submodules and their results''' -class IndexValidationError(ValueError): +class MetadataValidationError(ValueError): pass @dataclasses.dataclass -class IndexField: +class MetadataField: key: str required: bool repeatable: bool -class Index(list[tuple[str, str]]): - '''An index (key-value mapping, possibly with repeated keys) of a file produced by a module''' +class Metadata(list[tuple[str, str]]): + '''Metadata (key-value mapping, possibly with repeated keys) of a file produced by a module''' - fields: tuple[IndexField] = ( - IndexField('codearchiver version', required = True, repeatable = False), - IndexField('Module', required = True, repeatable = False), - IndexField('ID', required = True, repeatable = False), - IndexField('Input URL', required = True, repeatable = False), - IndexField('Filename', required = True, repeatable = False), + fields: tuple[MetadataField] = ( + MetadataField('codearchiver version', required = True, repeatable = False), + MetadataField('Module', required = True, repeatable = False), + MetadataField('ID', required = True, repeatable = False), + MetadataField('Input URL', required = True, repeatable = False), + MetadataField('Filename', required = True, repeatable = False), ) - '''The fields for this index''' + '''The fields for this metadata collection''' - _allFieldsCache: typing.Optional[tuple[IndexField]] = None + _allFieldsCache: typing.Optional[tuple[MetadataField]] = None def append(self, *args): if len(args) == 1: @@ -104,7 +104,7 @@ class Index(list[tuple[str, str]]): # This should be a @classmethod, too, but that's deprecated since Python 3.11. @property def _allFields(self): - '''All fields known by this index, own ones and all from superclasses''' + '''All fields known by this metadata collection, own ones and all from superclasses''' if type(self)._allFieldsCache is None: fields = [] @@ -114,7 +114,7 @@ class Index(list[tuple[str, str]]): return type(self)._allFieldsCache def validate(self): - '''Check that all keys and values in the index conform to the specification''' + '''Check that all keys and values conform to the specification''' keyCounts = collections.Counter(key for key, _ in self) keys = set(keyCounts) @@ -122,29 +122,29 @@ class Index(list[tuple[str, str]]): permittedKeys = set(field.key for field in self._allFields) unrecognisedKeys = keys - permittedKeys if unrecognisedKeys: - raise IndexValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}') + raise MetadataValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}') requiredKeys = set(field.key for field in self._allFields if field.required) missingRequiredKeys = requiredKeys - keys if missingRequiredKeys: - raise IndexValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}') + raise MetadataValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}') repeatableKeys = set(field.key for field in self._allFields if field.repeatable) repeatedKeys = set(key for key, count in keyCounts.items() if count > 1) repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys if repeatedUnrepeatableKeys: - raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}') + raise MetadataValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}') def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool: ''' - Check whether the criteria match this index - Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the index. + Check whether the criteria match this metadata collection + Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the metadata. Multiple criteria may use the same key to perform an AND search. - The index is a match if all criteria match. + The metadata is a match if all criteria match. ''' criteria = criteria.copy() - _logger.debug(f'Searching index for {criteria!r}') + _logger.debug(f'Searching metadata for {criteria!r}') keysOfInterest = set(key for key, _ in criteria) for key, value in self: if key not in keysOfInterest: @@ -169,14 +169,14 @@ class Index(list[tuple[str, str]]): return not bool(criteria) def serialise(self) -> str: - '''Convert the index to a string suitable for e.g. a simple text file storage''' + '''Convert the metadata to a string suitable for e.g. a simple text file storage''' self.validate() return ''.join(f'{key}: {value}\n' for key, value in self) @classmethod def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True): - '''Import a serialised index from a filename or file-like object''' + '''Import a serialised metadata from a filename or file-like object''' if isinstance(f, (str, bytes, os.PathLike)): cm = open(f, 'r') @@ -355,8 +355,8 @@ class Module(metaclass = ModuleMeta): name: typing.Optional[str] = None '''The name of the module. Modules without a name are ignored. Names must be unique and may only contain a-z and hyphens.''' - IndexClass: typing.Optional[typing.Type[Index]] = None - '''The Index class corresponding to this module, if any.''' + MetadataClass: typing.Optional[typing.Type[Metadata]] = None + '''The Metadata class corresponding to this module, if any.''' @staticmethod def matches(inputUrl: InputURL) -> bool: @@ -376,12 +376,12 @@ class Module(metaclass = ModuleMeta): def process(self) -> Result: '''Perform the relevant retrieval(s)''' - def create_index(self, filename: str) -> Index: - '''Create a basic Index instance appropriate for this module''' + def create_metadata(self, filename: str) -> Metadata: + '''Create a basic Metadata instance appropriate for this module''' - if type(self).IndexClass is None or type(self).name is None: - raise RuntimeError('Module lacks an IndexClass or a name; cannot create index') - idx = type(self).IndexClass() + if type(self).MetadataClass is None or type(self).name is None: + raise RuntimeError('Module lacks an MetadataClass or a name; cannot create metadata') + idx = type(self).MetadataClass() idx.append('codearchiver version', codearchiver.version.__version__) idx.append('Module', type(self).name) idx.append('ID', self._id) diff --git a/codearchiver/modules/git.py b/codearchiver/modules/git.py index 37e14c2..19d7889 100644 --- a/codearchiver/modules/git.py +++ b/codearchiver/modules/git.py @@ -11,19 +11,19 @@ import subprocess _logger = logging.getLogger(__name__) -class GitIndex(codearchiver.core.Index): +class GitMetadata(codearchiver.core.Metadata): fields = ( - codearchiver.core.IndexField(key = 'Git version', required = True, repeatable = False), - codearchiver.core.IndexField(key = 'Based on bundle', required = False, repeatable = True), - codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True), - codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True), - codearchiver.core.IndexField(key = 'Commit', required = False, repeatable = True), + codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False), + codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True), + codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True), + codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True), + codearchiver.core.MetadataField(key = 'Commit', required = False, repeatable = True), ) class Git(codearchiver.core.Module): name = 'git' - IndexClass = GitIndex + MetadataClass = GitMetadata @staticmethod def matches(inputUrl): @@ -63,7 +63,7 @@ class Git(codearchiver.core.Module): _logger.error(f'Failed to fetch {commit}') # This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored. - _logger.info(f'Collecting repository metadata for index') + _logger.info('Collecting repository metadata') _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) _, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory) commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines())) @@ -75,12 +75,12 @@ class Git(codearchiver.core.Module): oldCommits = {} # dict to keep the order reasonable basedOnBundles = {} # ditto if self._storage: - for oldBundle in self._storage.search_indices([('Root commit', c) for c in rootCommits]): + for oldBundle in self._storage.search_metadata([('Root commit', c) for c in rootCommits]): if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach? continue _logger.info(f'Previous bundle: {oldBundle!r}') - with self._storage.open_index(oldBundle) as fp: - idx = GitIndex.deserialise(fp) + with self._storage.open_metadata(oldBundle) as fp: + idx = GitMetadata.deserialise(fp) for key, value in idx: if key == 'Commit' and value in commitSet: oldCommits[value] = True @@ -106,19 +106,19 @@ class Git(codearchiver.core.Module): _logger.info(f'Removing clone') shutil.rmtree(directory) - index = self.create_index(bundle) - index.append('Git version', gitVersion) + metadata = self.create_metadata(bundle) + metadata.append('Git version', gitVersion) for oldBundle in basedOnBundles: - index.append('Based on bundle', oldBundle) + metadata.append('Based on bundle', oldBundle) for line in refs.splitlines(): - index.append('Ref', line) + metadata.append('Ref', line) for commitHash, *parents in commits: if commitHash not in oldCommits: - index.append('Commit', commitHash) + metadata.append('Commit', commitHash) if not parents: - index.append('Root commit', commitHash) + metadata.append('Root commit', commitHash) - return codearchiver.core.Result(id = self._id, files = [(bundle, index)]) + return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)]) def __repr__(self): return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})' diff --git a/codearchiver/storage.py b/codearchiver/storage.py index 8a3bcbd..23f9dd3 100644 --- a/codearchiver/storage.py +++ b/codearchiver/storage.py @@ -14,28 +14,28 @@ _logger = logging.getLogger(__name__) class Storage(abc.ABC): @abc.abstractmethod - def put(self, filename: str, index: typing.Optional['codearchiver.core.Index'] = None): - '''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.''' + def put(self, filename: str, metadata: typing.Optional['codearchiver.core.Metadata'] = None): + '''Put a local file and (if provided) its metadata into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.''' def put_result(self, result: 'codearchiver.core.Result'): '''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.''' - for fn, index in result.files: - self.put(fn, index) + for fn, metadata in result.files: + self.put(fn, metadata) for _, subresult in result.submoduleResults: self.put_result(subresult) @abc.abstractmethod - def search_indices(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]: + def search_metadata(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]: ''' - Search all indices in storage by criteria. - Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in a file's index. + Search all metadata in storage by criteria. + Refer to `codearchiver.core.Metadata.matches` for the semantics of `criteria`. Yields all filenames where all criteria match. ''' @abc.abstractmethod @contextlib.contextmanager - def open_index(self, filename: str) -> typing.TextIO: - '''Open the index for a file in serialised form.''' + def open_metadata(self, filename: str) -> typing.TextIO: + '''Open the metadata for a file in serialised form.''' @abc.abstractmethod @contextlib.contextmanager @@ -58,35 +58,35 @@ class DirectoryStorage(Storage): if not self._check_directory(): os.makedirs(self._directory) - def put(self, filename, index = None): + def put(self, filename, metadata = None): self._ensure_directory() #FIXME: Race condition if os.path.exists((targetFilename := os.path.join(self._directory, os.path.basename(filename)))): raise FileExistsError(f'{targetFilename} already exists') _logger.info(f'Moving {filename} to {self._directory}') shutil.move(filename, self._directory) - if not index: + if not metadata: return - indexFilename = os.path.join(self._directory, f'{filename}.codearchiver-index') + metadataFilename = os.path.join(self._directory, f'{filename}.codearchiver-metadata') # No need to check for existence here thanks to the 'x' mode - _logger.info(f'Writing index for {filename} to {indexFilename}') - with open(indexFilename, 'x') as fp: - fp.write(index.serialise()) - - def search_indices(self, criteria): - _logger.info(f'Searching indices by criteria: {criteria!r}') - for indexFilename in glob.glob('*.codearchiver-index', root_dir = self._directory): - _logger.info(f'Searching index {indexFilename}') - with self.open(indexFilename, 'r') as fp: - idx = codearchiver.core.Index.deserialise(fp, validate = False) + _logger.info(f'Writing metadata for {filename} to {metadataFilename}') + with open(metadataFilename, 'x') as fp: + fp.write(metadata.serialise()) + + def search_metadata(self, criteria): + _logger.info(f'Searching metadata by criteria: {criteria!r}') + for metadataFilename in glob.glob('*.codearchiver-metadata', root_dir = self._directory): + _logger.info(f'Searching metadata {metadataFilename}') + with self.open(metadataFilename, 'r') as fp: + idx = codearchiver.core.Metadata.deserialise(fp, validate = False) if idx.matches(criteria): - _logger.info(f'Found index match {indexFilename}') - yield indexFilename.rsplit('.', 1)[0] - _logger.info('Done searching indices') + _logger.info(f'Found metadata match {metadataFilename}') + yield metadataFilename.rsplit('.', 1)[0] + _logger.info('Done searching metadata') @contextlib.contextmanager - def open_index(self, filename): - with self.open(f'{filename}.codearchiver-index', 'r') as fp: + def open_metadata(self, filename): + with self.open(f'{filename}.codearchiver-metadata', 'r') as fp: yield fp @contextlib.contextmanager