
Index → Metadata

'Index' was a misnomer from the start since it contains critical information for the operation that can't be reconstructed (e.g. existing refs).
JustAnotherArchivist committed 1 year ago · commit fa4b60225c · tags/v1.0
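For context, each of these records is a list of key-value pairs serialised as `key: value` lines. A hypothetical serialised example for a git bundle (field names taken from the diffs below; values, including the shortened hashes, are made up for illustration):

	codearchiver version: 1.0
	Module: git
	ID: git_example
	Input URL: https://example.org/example.git
	Filename: git_example.bundle
	Git version: 2.39.2
	Ref: 1a2b3c4d refs/heads/main
	Root commit: 5e6f7a8b
	Commit: 1a2b3c4d

The `Ref` and commit lines record repository state at archival time, which is the kind of information the commit message says cannot be reconstructed later.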
3 changed files with 77 additions and 77 deletions:
  1. codearchiver/core.py: +32, -32
  2. codearchiver/modules/git.py: +18, -18
  3. codearchiver/storage.py: +27, -27

codearchiver/core.py: +32, -32

@@ -64,37 +64,37 @@ class Result:
 	id: str
 	'''A unique ID for this result'''
 
-	files: list[tuple[str, typing.Optional['Index']]] = dataclasses.field(default_factory = list)
-	'''List of filenames produced by the run, optionally with an index'''
+	files: list[tuple[str, typing.Optional['Metadata']]] = dataclasses.field(default_factory = list)
+	'''List of filenames produced by the run, optionally with metadata'''
 
 	submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
 	'''List of related submodules and their results'''
 
 
-class IndexValidationError(ValueError):
+class MetadataValidationError(ValueError):
 	pass
 
 
 @dataclasses.dataclass
-class IndexField:
+class MetadataField:
 	key: str
 	required: bool
 	repeatable: bool
 
 
-class Index(list[tuple[str, str]]):
-	'''An index (key-value mapping, possibly with repeated keys) of a file produced by a module'''
+class Metadata(list[tuple[str, str]]):
+	'''Metadata (key-value mapping, possibly with repeated keys) of a file produced by a module'''
 
-	fields: tuple[IndexField] = (
-		IndexField('codearchiver version', required = True, repeatable = False),
-		IndexField('Module', required = True, repeatable = False),
-		IndexField('ID', required = True, repeatable = False),
-		IndexField('Input URL', required = True, repeatable = False),
-		IndexField('Filename', required = True, repeatable = False),
+	fields: tuple[MetadataField] = (
+		MetadataField('codearchiver version', required = True, repeatable = False),
+		MetadataField('Module', required = True, repeatable = False),
+		MetadataField('ID', required = True, repeatable = False),
+		MetadataField('Input URL', required = True, repeatable = False),
+		MetadataField('Filename', required = True, repeatable = False),
 	)
-	'''The fields for this index'''
+	'''The fields for this metadata collection'''
 
-	_allFieldsCache: typing.Optional[tuple[IndexField]] = None
+	_allFieldsCache: typing.Optional[tuple[MetadataField]] = None
 
 	def append(self, *args):
 		if len(args) == 1:
@@ -104,7 +104,7 @@ class Index(list[tuple[str, str]]):
 	# This should be a @classmethod, too, but that's deprecated since Python 3.11.
 	@property
 	def _allFields(self):
-		'''All fields known by this index, own ones and all from superclasses'''
+		'''All fields known by this metadata collection, own ones and all from superclasses'''
 
 		if type(self)._allFieldsCache is None:
 			fields = []
@@ -114,7 +114,7 @@ class Index(list[tuple[str, str]]):
 		return type(self)._allFieldsCache
 
 	def validate(self):
-		'''Check that all keys and values in the index conform to the specification'''
+		'''Check that all keys and values conform to the specification'''
 
 		keyCounts = collections.Counter(key for key, _ in self)
 		keys = set(keyCounts)
@@ -122,29 +122,29 @@ class Index(list[tuple[str, str]]):
permittedKeys = set(field.key for field in self._allFields)
unrecognisedKeys = keys - permittedKeys
if unrecognisedKeys:
raise IndexValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}')
raise MetadataValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}')

requiredKeys = set(field.key for field in self._allFields if field.required)
missingRequiredKeys = requiredKeys - keys
if missingRequiredKeys:
raise IndexValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}')
raise MetadataValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}')

repeatableKeys = set(field.key for field in self._allFields if field.repeatable)
repeatedKeys = set(key for key, count in keyCounts.items() if count > 1)
repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys
if repeatedUnrepeatableKeys:
raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')
raise MetadataValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')

def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool:
'''
Check whether the criteria match this index
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the index.
Check whether the criteria match this metadata collection
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the metadata.
Multiple criteria may use the same key to perform an AND search.
The index is a match if all criteria match.
The metadata is a match if all criteria match.
'''

criteria = criteria.copy()
_logger.debug(f'Searching index for {criteria!r}')
_logger.debug(f'Searching metadata for {criteria!r}')
keysOfInterest = set(key for key, _ in criteria)
for key, value in self:
if key not in keysOfInterest:
@@ -169,14 +169,14 @@ class Index(list[tuple[str, str]]):
 		return not bool(criteria)
 
 	def serialise(self) -> str:
-		'''Convert the index to a string suitable for e.g. a simple text file storage'''
+		'''Convert the metadata to a string suitable for e.g. a simple text file storage'''
 
 		self.validate()
 		return ''.join(f'{key}: {value}\n' for key, value in self)
 
 	@classmethod
 	def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True):
-		'''Import a serialised index from a filename or file-like object'''
+		'''Import serialised metadata from a filename or file-like object'''
 
 		if isinstance(f, (str, bytes, os.PathLike)):
 			cm = open(f, 'r')
@@ -355,8 +355,8 @@ class Module(metaclass = ModuleMeta):
 	name: typing.Optional[str] = None
 	'''The name of the module. Modules without a name are ignored. Names must be unique and may only contain a-z and hyphens.'''
 
-	IndexClass: typing.Optional[typing.Type[Index]] = None
-	'''The Index class corresponding to this module, if any.'''
+	MetadataClass: typing.Optional[typing.Type[Metadata]] = None
+	'''The Metadata class corresponding to this module, if any.'''
 
 	@staticmethod
 	def matches(inputUrl: InputURL) -> bool:
@@ -376,12 +376,12 @@ class Module(metaclass = ModuleMeta):
 	def process(self) -> Result:
 		'''Perform the relevant retrieval(s)'''
 
-	def create_index(self, filename: str) -> Index:
-		'''Create a basic Index instance appropriate for this module'''
+	def create_metadata(self, filename: str) -> Metadata:
+		'''Create a basic Metadata instance appropriate for this module'''
 
-		if type(self).IndexClass is None or type(self).name is None:
-			raise RuntimeError('Module lacks an IndexClass or a name; cannot create index')
-		idx = type(self).IndexClass()
+		if type(self).MetadataClass is None or type(self).name is None:
+			raise RuntimeError('Module lacks a MetadataClass or a name; cannot create metadata')
+		idx = type(self).MetadataClass()
 		idx.append('codearchiver version', codearchiver.version.__version__)
 		idx.append('Module', type(self).name)
 		idx.append('ID', self._id)
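To illustrate the renamed core API as a whole, here is a minimal sketch (not part of the commit) of defining and round-tripping a Metadata subclass. `ExampleMetadata` and its 'Comment' field are hypothetical, and the sketch assumes the post-rename codearchiver package is importable:

	import io
	import codearchiver.core

	class ExampleMetadata(codearchiver.core.Metadata):
		# Hypothetical subclass; per _allFields, it inherits the required base
		# fields ('codearchiver version', 'Module', 'ID', 'Input URL', 'Filename').
		fields = (
			codearchiver.core.MetadataField(key = 'Comment', required = False, repeatable = True),
		)

	md = ExampleMetadata()
	md.append('codearchiver version', '1.0')  # hypothetical values throughout
	md.append('Module', 'example')
	md.append('ID', 'example_1')
	md.append('Input URL', 'https://example.org/example.git')
	md.append('Filename', 'example_1.bundle')
	md.append('Comment', 'optional, repeatable field')
	md.validate()  # raises MetadataValidationError on unknown, missing, or wrongly repeated keys
	serialised = md.serialise()  # 'key: value' lines; validates first
	restored = ExampleMetadata.deserialise(io.StringIO(serialised))  # accepts a path or file-like object

Only fields marked repeatable may occur more than once; `validate` enforces this, which is why the module-specific subclasses below mark fields like 'Ref' and 'Commit' as repeatable.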


codearchiver/modules/git.py: +18, -18

@@ -11,19 +11,19 @@ import subprocess
 _logger = logging.getLogger(__name__)
 
 
-class GitIndex(codearchiver.core.Index):
+class GitMetadata(codearchiver.core.Metadata):
 	fields = (
-		codearchiver.core.IndexField(key = 'Git version', required = True, repeatable = False),
-		codearchiver.core.IndexField(key = 'Based on bundle', required = False, repeatable = True),
-		codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True),
-		codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True),
-		codearchiver.core.IndexField(key = 'Commit', required = False, repeatable = True),
+		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
+		codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
+		codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
+		codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
+		codearchiver.core.MetadataField(key = 'Commit', required = False, repeatable = True),
 	)
 
 
 class Git(codearchiver.core.Module):
 	name = 'git'
-	IndexClass = GitIndex
+	MetadataClass = GitMetadata
 
 	@staticmethod
 	def matches(inputUrl):
@@ -63,7 +63,7 @@ class Git(codearchiver.core.Module):
 			_logger.error(f'Failed to fetch {commit}')
 		# This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
 
-		_logger.info(f'Collecting repository metadata for index')
+		_logger.info('Collecting repository metadata')
 		_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
 		_, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory)
 		commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines()))
@@ -75,12 +75,12 @@ class Git(codearchiver.core.Module):
 		oldCommits = {} # dict to keep the order reasonable
 		basedOnBundles = {} # ditto
 		if self._storage:
-			for oldBundle in self._storage.search_indices([('Root commit', c) for c in rootCommits]):
+			for oldBundle in self._storage.search_metadata([('Root commit', c) for c in rootCommits]):
 				if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach?
 					continue
 				_logger.info(f'Previous bundle: {oldBundle!r}')
-				with self._storage.open_index(oldBundle) as fp:
-					idx = GitIndex.deserialise(fp)
+				with self._storage.open_metadata(oldBundle) as fp:
+					idx = GitMetadata.deserialise(fp)
 				for key, value in idx:
 					if key == 'Commit' and value in commitSet:
 						oldCommits[value] = True
@@ -106,19 +106,19 @@ class Git(codearchiver.core.Module):
 		_logger.info(f'Removing clone')
 		shutil.rmtree(directory)
 
-		index = self.create_index(bundle)
-		index.append('Git version', gitVersion)
+		metadata = self.create_metadata(bundle)
+		metadata.append('Git version', gitVersion)
 		for oldBundle in basedOnBundles:
-			index.append('Based on bundle', oldBundle)
+			metadata.append('Based on bundle', oldBundle)
 		for line in refs.splitlines():
-			index.append('Ref', line)
+			metadata.append('Ref', line)
 		for commitHash, *parents in commits:
 			if commitHash not in oldCommits:
-				index.append('Commit', commitHash)
+				metadata.append('Commit', commitHash)
 			if not parents:
-				index.append('Root commit', commitHash)
+				metadata.append('Root commit', commitHash)
 
-		return codearchiver.core.Result(id = self._id, files = [(bundle, index)])
+		return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])
 
 	def __repr__(self):
 		return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'
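The `search_metadata` call above relies on the matching semantics documented in core.py: within one criterion, any of the listed values may match (OR), while separate criteria must all match (AND). A small sketch of those semantics (not part of the commit; placeholder values, and note that `matches` does no validation, so the bare base class can be used for illustration):

	import codearchiver.core

	md = codearchiver.core.Metadata()
	md.append('Module', 'git')
	md.append('Root commit', 'commit-a')  # placeholder hashes
	md.append('Root commit', 'commit-b')

	md.matches([('Module', 'git')])                        # True
	md.matches([('Root commit', ('commit-a', 'commit-z'))])  # True: OR within one criterion
	md.matches([('Module', 'git'), ('Module', 'hg')])        # False: AND across criteria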

codearchiver/storage.py: +27, -27

@@ -14,28 +14,28 @@ _logger = logging.getLogger(__name__)
 
 class Storage(abc.ABC):
 	@abc.abstractmethod
-	def put(self, filename: str, index: typing.Optional['codearchiver.core.Index'] = None):
-		'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
+	def put(self, filename: str, metadata: typing.Optional['codearchiver.core.Metadata'] = None):
+		'''Put a local file and (if provided) its metadata into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
 
 	def put_result(self, result: 'codearchiver.core.Result'):
 		'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.'''
-		for fn, index in result.files:
-			self.put(fn, index)
+		for fn, metadata in result.files:
+			self.put(fn, metadata)
 		for _, subresult in result.submoduleResults:
 			self.put_result(subresult)
 
 	@abc.abstractmethod
-	def search_indices(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]:
+	def search_metadata(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]:
 		'''
-		Search all indices in storage by criteria.
-		Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in a file's index.
+		Search all metadata in storage by criteria.
+		Refer to `codearchiver.core.Metadata.matches` for the semantics of `criteria`.
 		Yields all filenames where all criteria match.
 		'''
 
 	@abc.abstractmethod
 	@contextlib.contextmanager
-	def open_index(self, filename: str) -> typing.TextIO:
-		'''Open the index for a file in serialised form.'''
+	def open_metadata(self, filename: str) -> typing.TextIO:
-		'''Open the metadata for a file in serialised form.'''
 
 	@abc.abstractmethod
 	@contextlib.contextmanager
@@ -58,35 +58,35 @@ class DirectoryStorage(Storage):
 		if not self._check_directory():
 			os.makedirs(self._directory)
 
-	def put(self, filename, index = None):
+	def put(self, filename, metadata = None):
 		self._ensure_directory()
 		#FIXME: Race condition
 		if os.path.exists((targetFilename := os.path.join(self._directory, os.path.basename(filename)))):
 			raise FileExistsError(f'{targetFilename} already exists')
 		_logger.info(f'Moving {filename} to {self._directory}')
 		shutil.move(filename, self._directory)
-		if not index:
+		if not metadata:
 			return
-		indexFilename = os.path.join(self._directory, f'{filename}.codearchiver-index')
+		metadataFilename = os.path.join(self._directory, f'{filename}.codearchiver-metadata')
 		# No need to check for existence here thanks to the 'x' mode
-		_logger.info(f'Writing index for {filename} to {indexFilename}')
-		with open(indexFilename, 'x') as fp:
-			fp.write(index.serialise())
-
-	def search_indices(self, criteria):
-		_logger.info(f'Searching indices by criteria: {criteria!r}')
-		for indexFilename in glob.glob('*.codearchiver-index', root_dir = self._directory):
-			_logger.info(f'Searching index {indexFilename}')
-			with self.open(indexFilename, 'r') as fp:
-				idx = codearchiver.core.Index.deserialise(fp, validate = False)
+		_logger.info(f'Writing metadata for {filename} to {metadataFilename}')
+		with open(metadataFilename, 'x') as fp:
+			fp.write(metadata.serialise())
+
+	def search_metadata(self, criteria):
+		_logger.info(f'Searching metadata by criteria: {criteria!r}')
+		for metadataFilename in glob.glob('*.codearchiver-metadata', root_dir = self._directory):
+			_logger.info(f'Searching metadata {metadataFilename}')
+			with self.open(metadataFilename, 'r') as fp:
+				idx = codearchiver.core.Metadata.deserialise(fp, validate = False)
 			if idx.matches(criteria):
-				_logger.info(f'Found index match {indexFilename}')
-				yield indexFilename.rsplit('.', 1)[0]
-		_logger.info('Done searching indices')
+				_logger.info(f'Found metadata match {metadataFilename}')
+				yield metadataFilename.rsplit('.', 1)[0]
+		_logger.info('Done searching metadata')
 
 	@contextlib.contextmanager
-	def open_index(self, filename):
-		with self.open(f'{filename}.codearchiver-index', 'r') as fp:
+	def open_metadata(self, filename):
+		with self.open(f'{filename}.codearchiver-metadata', 'r') as fp:
 			yield fp
 
 	@contextlib.contextmanager
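Finally, a brief usage sketch for the storage side (not part of the commit). It assumes `DirectoryStorage` is constructed with the target directory path, that `example_1.bundle` exists locally, and that `md` is the hypothetical metadata object from the core.py sketch above:

	import codearchiver.core
	import codearchiver.storage

	storage = codearchiver.storage.DirectoryStorage('/tmp/archive')  # assumed constructor argument
	storage.put('example_1.bundle', md)  # moves the file, writes example_1.bundle.codearchiver-metadata
	for name in storage.search_metadata([('Module', 'example')]):
		# search_metadata yields the stored filename with the metadata suffix stripped
		with storage.open_metadata(name) as fp:
			found = codearchiver.core.Metadata.deserialise(fp, validate = False)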

