'Index' was a misnomer from the start since it contains critical information for the operation that can't be reconstructed (e.g. existing refs).
tags/v1.0
@@ -64,37 +64,37 @@ class Result: | |||||
id: str | id: str | ||||
'''A unique ID for this result''' | '''A unique ID for this result''' | ||||
files: list[tuple[str, typing.Optional['Index']]] = dataclasses.field(default_factory = list) | |||||
'''List of filenames produced by the run, optionally with an index''' | |||||
files: list[tuple[str, typing.Optional['Metadata']]] = dataclasses.field(default_factory = list) | |||||
'''List of filenames produced by the run, optionally with metadata''' | |||||
submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list) | submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list) | ||||
'''List of related submodules and their results''' | '''List of related submodules and their results''' | ||||
class IndexValidationError(ValueError): | |||||
class MetadataValidationError(ValueError): | |||||
pass | pass | ||||
@dataclasses.dataclass | @dataclasses.dataclass | ||||
class IndexField: | |||||
class MetadataField: | |||||
key: str | key: str | ||||
required: bool | required: bool | ||||
repeatable: bool | repeatable: bool | ||||
class Index(list[tuple[str, str]]): | |||||
'''An index (key-value mapping, possibly with repeated keys) of a file produced by a module''' | |||||
class Metadata(list[tuple[str, str]]): | |||||
'''Metadata (key-value mapping, possibly with repeated keys) of a file produced by a module''' | |||||
fields: tuple[IndexField] = ( | |||||
IndexField('codearchiver version', required = True, repeatable = False), | |||||
IndexField('Module', required = True, repeatable = False), | |||||
IndexField('ID', required = True, repeatable = False), | |||||
IndexField('Input URL', required = True, repeatable = False), | |||||
IndexField('Filename', required = True, repeatable = False), | |||||
fields: tuple[MetadataField] = ( | |||||
MetadataField('codearchiver version', required = True, repeatable = False), | |||||
MetadataField('Module', required = True, repeatable = False), | |||||
MetadataField('ID', required = True, repeatable = False), | |||||
MetadataField('Input URL', required = True, repeatable = False), | |||||
MetadataField('Filename', required = True, repeatable = False), | |||||
) | ) | ||||
'''The fields for this index''' | |||||
'''The fields for this metadata collection''' | |||||
_allFieldsCache: typing.Optional[tuple[IndexField]] = None | |||||
_allFieldsCache: typing.Optional[tuple[MetadataField]] = None | |||||
def append(self, *args): | def append(self, *args): | ||||
if len(args) == 1: | if len(args) == 1: | ||||
@@ -104,7 +104,7 @@ class Index(list[tuple[str, str]]): | |||||
# This should be a @classmethod, too, but that's deprecated since Python 3.11. | # This should be a @classmethod, too, but that's deprecated since Python 3.11. | ||||
@property | @property | ||||
def _allFields(self): | def _allFields(self): | ||||
'''All fields known by this index, own ones and all from superclasses''' | |||||
'''All fields known by this metadata collection, own ones and all from superclasses''' | |||||
if type(self)._allFieldsCache is None: | if type(self)._allFieldsCache is None: | ||||
fields = [] | fields = [] | ||||
@@ -114,7 +114,7 @@ class Index(list[tuple[str, str]]): | |||||
return type(self)._allFieldsCache | return type(self)._allFieldsCache | ||||
def validate(self): | def validate(self): | ||||
'''Check that all keys and values in the index conform to the specification''' | |||||
'''Check that all keys and values conform to the specification''' | |||||
keyCounts = collections.Counter(key for key, _ in self) | keyCounts = collections.Counter(key for key, _ in self) | ||||
keys = set(keyCounts) | keys = set(keyCounts) | ||||
@@ -122,29 +122,29 @@ class Index(list[tuple[str, str]]): | |||||
permittedKeys = set(field.key for field in self._allFields) | permittedKeys = set(field.key for field in self._allFields) | ||||
unrecognisedKeys = keys - permittedKeys | unrecognisedKeys = keys - permittedKeys | ||||
if unrecognisedKeys: | if unrecognisedKeys: | ||||
raise IndexValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}') | |||||
raise MetadataValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}') | |||||
requiredKeys = set(field.key for field in self._allFields if field.required) | requiredKeys = set(field.key for field in self._allFields if field.required) | ||||
missingRequiredKeys = requiredKeys - keys | missingRequiredKeys = requiredKeys - keys | ||||
if missingRequiredKeys: | if missingRequiredKeys: | ||||
raise IndexValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}') | |||||
raise MetadataValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}') | |||||
repeatableKeys = set(field.key for field in self._allFields if field.repeatable) | repeatableKeys = set(field.key for field in self._allFields if field.repeatable) | ||||
repeatedKeys = set(key for key, count in keyCounts.items() if count > 1) | repeatedKeys = set(key for key, count in keyCounts.items() if count > 1) | ||||
repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys | repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys | ||||
if repeatedUnrepeatableKeys: | if repeatedUnrepeatableKeys: | ||||
raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}') | |||||
raise MetadataValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}') | |||||
def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool: | def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool: | ||||
''' | ''' | ||||
Check whether the criteria match this index | |||||
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the index. | |||||
Check whether the criteria match this metadata collection | |||||
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the metadata. | |||||
Multiple criteria may use the same key to perform an AND search. | Multiple criteria may use the same key to perform an AND search. | ||||
The index is a match if all criteria match. | |||||
The metadata is a match if all criteria match. | |||||
''' | ''' | ||||
criteria = criteria.copy() | criteria = criteria.copy() | ||||
_logger.debug(f'Searching index for {criteria!r}') | |||||
_logger.debug(f'Searching metadata for {criteria!r}') | |||||
keysOfInterest = set(key for key, _ in criteria) | keysOfInterest = set(key for key, _ in criteria) | ||||
for key, value in self: | for key, value in self: | ||||
if key not in keysOfInterest: | if key not in keysOfInterest: | ||||
@@ -169,14 +169,14 @@ class Index(list[tuple[str, str]]): | |||||
return not bool(criteria) | return not bool(criteria) | ||||
def serialise(self) -> str: | def serialise(self) -> str: | ||||
'''Convert the index to a string suitable for e.g. a simple text file storage''' | |||||
'''Convert the metadata to a string suitable for e.g. a simple text file storage''' | |||||
self.validate() | self.validate() | ||||
return ''.join(f'{key}: {value}\n' for key, value in self) | return ''.join(f'{key}: {value}\n' for key, value in self) | ||||
@classmethod | @classmethod | ||||
def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True): | def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True): | ||||
'''Import a serialised index from a filename or file-like object''' | |||||
'''Import a serialised metadata from a filename or file-like object''' | |||||
if isinstance(f, (str, bytes, os.PathLike)): | if isinstance(f, (str, bytes, os.PathLike)): | ||||
cm = open(f, 'r') | cm = open(f, 'r') | ||||
@@ -355,8 +355,8 @@ class Module(metaclass = ModuleMeta): | |||||
name: typing.Optional[str] = None | name: typing.Optional[str] = None | ||||
'''The name of the module. Modules without a name are ignored. Names must be unique and may only contain a-z and hyphens.''' | '''The name of the module. Modules without a name are ignored. Names must be unique and may only contain a-z and hyphens.''' | ||||
IndexClass: typing.Optional[typing.Type[Index]] = None | |||||
'''The Index class corresponding to this module, if any.''' | |||||
MetadataClass: typing.Optional[typing.Type[Metadata]] = None | |||||
'''The Metadata class corresponding to this module, if any.''' | |||||
@staticmethod | @staticmethod | ||||
def matches(inputUrl: InputURL) -> bool: | def matches(inputUrl: InputURL) -> bool: | ||||
@@ -376,12 +376,12 @@ class Module(metaclass = ModuleMeta): | |||||
def process(self) -> Result: | def process(self) -> Result: | ||||
'''Perform the relevant retrieval(s)''' | '''Perform the relevant retrieval(s)''' | ||||
def create_index(self, filename: str) -> Index: | |||||
'''Create a basic Index instance appropriate for this module''' | |||||
def create_metadata(self, filename: str) -> Metadata: | |||||
'''Create a basic Metadata instance appropriate for this module''' | |||||
if type(self).IndexClass is None or type(self).name is None: | |||||
raise RuntimeError('Module lacks an IndexClass or a name; cannot create index') | |||||
idx = type(self).IndexClass() | |||||
if type(self).MetadataClass is None or type(self).name is None: | |||||
raise RuntimeError('Module lacks an MetadataClass or a name; cannot create metadata') | |||||
idx = type(self).MetadataClass() | |||||
idx.append('codearchiver version', codearchiver.version.__version__) | idx.append('codearchiver version', codearchiver.version.__version__) | ||||
idx.append('Module', type(self).name) | idx.append('Module', type(self).name) | ||||
idx.append('ID', self._id) | idx.append('ID', self._id) | ||||
@@ -11,19 +11,19 @@ import subprocess | |||||
_logger = logging.getLogger(__name__) | _logger = logging.getLogger(__name__) | ||||
class GitIndex(codearchiver.core.Index): | |||||
class GitMetadata(codearchiver.core.Metadata): | |||||
fields = ( | fields = ( | ||||
codearchiver.core.IndexField(key = 'Git version', required = True, repeatable = False), | |||||
codearchiver.core.IndexField(key = 'Based on bundle', required = False, repeatable = True), | |||||
codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True), | |||||
codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True), | |||||
codearchiver.core.IndexField(key = 'Commit', required = False, repeatable = True), | |||||
codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False), | |||||
codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True), | |||||
codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True), | |||||
codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True), | |||||
codearchiver.core.MetadataField(key = 'Commit', required = False, repeatable = True), | |||||
) | ) | ||||
class Git(codearchiver.core.Module): | class Git(codearchiver.core.Module): | ||||
name = 'git' | name = 'git' | ||||
IndexClass = GitIndex | |||||
MetadataClass = GitMetadata | |||||
@staticmethod | @staticmethod | ||||
def matches(inputUrl): | def matches(inputUrl): | ||||
@@ -63,7 +63,7 @@ class Git(codearchiver.core.Module): | |||||
_logger.error(f'Failed to fetch {commit}') | _logger.error(f'Failed to fetch {commit}') | ||||
# This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored. | # This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored. | ||||
_logger.info(f'Collecting repository metadata for index') | |||||
_logger.info('Collecting repository metadata') | |||||
_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) | _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) | ||||
_, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory) | _, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory) | ||||
commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines())) | commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines())) | ||||
@@ -75,12 +75,12 @@ class Git(codearchiver.core.Module): | |||||
oldCommits = {} # dict to keep the order reasonable | oldCommits = {} # dict to keep the order reasonable | ||||
basedOnBundles = {} # ditto | basedOnBundles = {} # ditto | ||||
if self._storage: | if self._storage: | ||||
for oldBundle in self._storage.search_indices([('Root commit', c) for c in rootCommits]): | |||||
for oldBundle in self._storage.search_metadata([('Root commit', c) for c in rootCommits]): | |||||
if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach? | if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach? | ||||
continue | continue | ||||
_logger.info(f'Previous bundle: {oldBundle!r}') | _logger.info(f'Previous bundle: {oldBundle!r}') | ||||
with self._storage.open_index(oldBundle) as fp: | |||||
idx = GitIndex.deserialise(fp) | |||||
with self._storage.open_metadata(oldBundle) as fp: | |||||
idx = GitMetadata.deserialise(fp) | |||||
for key, value in idx: | for key, value in idx: | ||||
if key == 'Commit' and value in commitSet: | if key == 'Commit' and value in commitSet: | ||||
oldCommits[value] = True | oldCommits[value] = True | ||||
@@ -106,19 +106,19 @@ class Git(codearchiver.core.Module): | |||||
_logger.info(f'Removing clone') | _logger.info(f'Removing clone') | ||||
shutil.rmtree(directory) | shutil.rmtree(directory) | ||||
index = self.create_index(bundle) | |||||
index.append('Git version', gitVersion) | |||||
metadata = self.create_metadata(bundle) | |||||
metadata.append('Git version', gitVersion) | |||||
for oldBundle in basedOnBundles: | for oldBundle in basedOnBundles: | ||||
index.append('Based on bundle', oldBundle) | |||||
metadata.append('Based on bundle', oldBundle) | |||||
for line in refs.splitlines(): | for line in refs.splitlines(): | ||||
index.append('Ref', line) | |||||
metadata.append('Ref', line) | |||||
for commitHash, *parents in commits: | for commitHash, *parents in commits: | ||||
if commitHash not in oldCommits: | if commitHash not in oldCommits: | ||||
index.append('Commit', commitHash) | |||||
metadata.append('Commit', commitHash) | |||||
if not parents: | if not parents: | ||||
index.append('Root commit', commitHash) | |||||
metadata.append('Root commit', commitHash) | |||||
return codearchiver.core.Result(id = self._id, files = [(bundle, index)]) | |||||
return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)]) | |||||
def __repr__(self): | def __repr__(self): | ||||
return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})' | return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})' |
@@ -14,28 +14,28 @@ _logger = logging.getLogger(__name__) | |||||
class Storage(abc.ABC): | class Storage(abc.ABC): | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def put(self, filename: str, index: typing.Optional['codearchiver.core.Index'] = None): | |||||
'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.''' | |||||
def put(self, filename: str, metadata: typing.Optional['codearchiver.core.Metadata'] = None): | |||||
'''Put a local file and (if provided) its metadata into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.''' | |||||
def put_result(self, result: 'codearchiver.core.Result'): | def put_result(self, result: 'codearchiver.core.Result'): | ||||
'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.''' | '''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.''' | ||||
for fn, index in result.files: | |||||
self.put(fn, index) | |||||
for fn, metadata in result.files: | |||||
self.put(fn, metadata) | |||||
for _, subresult in result.submoduleResults: | for _, subresult in result.submoduleResults: | ||||
self.put_result(subresult) | self.put_result(subresult) | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def search_indices(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]: | |||||
def search_metadata(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]: | |||||
''' | ''' | ||||
Search all indices in storage by criteria. | |||||
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in a file's index. | |||||
Search all metadata in storage by criteria. | |||||
Refer to `codearchiver.core.Metadata.matches` for the semantics of `criteria`. | |||||
Yields all filenames where all criteria match. | Yields all filenames where all criteria match. | ||||
''' | ''' | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
@contextlib.contextmanager | @contextlib.contextmanager | ||||
def open_index(self, filename: str) -> typing.TextIO: | |||||
'''Open the index for a file in serialised form.''' | |||||
def open_metadata(self, filename: str) -> typing.TextIO: | |||||
'''Open the metadata for a file in serialised form.''' | |||||
@abc.abstractmethod | @abc.abstractmethod | ||||
@contextlib.contextmanager | @contextlib.contextmanager | ||||
@@ -58,35 +58,35 @@ class DirectoryStorage(Storage): | |||||
if not self._check_directory(): | if not self._check_directory(): | ||||
os.makedirs(self._directory) | os.makedirs(self._directory) | ||||
def put(self, filename, index = None): | |||||
def put(self, filename, metadata = None): | |||||
self._ensure_directory() | self._ensure_directory() | ||||
#FIXME: Race condition | #FIXME: Race condition | ||||
if os.path.exists((targetFilename := os.path.join(self._directory, os.path.basename(filename)))): | if os.path.exists((targetFilename := os.path.join(self._directory, os.path.basename(filename)))): | ||||
raise FileExistsError(f'{targetFilename} already exists') | raise FileExistsError(f'{targetFilename} already exists') | ||||
_logger.info(f'Moving {filename} to {self._directory}') | _logger.info(f'Moving {filename} to {self._directory}') | ||||
shutil.move(filename, self._directory) | shutil.move(filename, self._directory) | ||||
if not index: | |||||
if not metadata: | |||||
return | return | ||||
indexFilename = os.path.join(self._directory, f'{filename}.codearchiver-index') | |||||
metadataFilename = os.path.join(self._directory, f'{filename}.codearchiver-metadata') | |||||
# No need to check for existence here thanks to the 'x' mode | # No need to check for existence here thanks to the 'x' mode | ||||
_logger.info(f'Writing index for {filename} to {indexFilename}') | |||||
with open(indexFilename, 'x') as fp: | |||||
fp.write(index.serialise()) | |||||
def search_indices(self, criteria): | |||||
_logger.info(f'Searching indices by criteria: {criteria!r}') | |||||
for indexFilename in glob.glob('*.codearchiver-index', root_dir = self._directory): | |||||
_logger.info(f'Searching index {indexFilename}') | |||||
with self.open(indexFilename, 'r') as fp: | |||||
idx = codearchiver.core.Index.deserialise(fp, validate = False) | |||||
_logger.info(f'Writing metadata for {filename} to {metadataFilename}') | |||||
with open(metadataFilename, 'x') as fp: | |||||
fp.write(metadata.serialise()) | |||||
def search_metadata(self, criteria): | |||||
_logger.info(f'Searching metadata by criteria: {criteria!r}') | |||||
for metadataFilename in glob.glob('*.codearchiver-metadata', root_dir = self._directory): | |||||
_logger.info(f'Searching metadata {metadataFilename}') | |||||
with self.open(metadataFilename, 'r') as fp: | |||||
idx = codearchiver.core.Metadata.deserialise(fp, validate = False) | |||||
if idx.matches(criteria): | if idx.matches(criteria): | ||||
_logger.info(f'Found index match {indexFilename}') | |||||
yield indexFilename.rsplit('.', 1)[0] | |||||
_logger.info('Done searching indices') | |||||
_logger.info(f'Found metadata match {metadataFilename}') | |||||
yield metadataFilename.rsplit('.', 1)[0] | |||||
_logger.info('Done searching metadata') | |||||
@contextlib.contextmanager | @contextlib.contextmanager | ||||
def open_index(self, filename): | |||||
with self.open(f'{filename}.codearchiver-index', 'r') as fp: | |||||
def open_metadata(self, filename): | |||||
with self.open(f'{filename}.codearchiver-metadata', 'r') as fp: | |||||
yield fp | yield fp | ||||
@contextlib.contextmanager | @contextlib.contextmanager | ||||