Bladeren bron

Index → Metadata

'Index' was a misnomer from the start: what it names holds information that is critical to the operation and cannot be reconstructed (e.g. existing refs).
tags/v1.0
JustAnotherArchivist 1 jaar geleden
bovenliggende
commit
fa4b60225c
3 gewijzigde bestanden met toevoegingen van 77 en 77 verwijderingen
  1. +32
    -32
      codearchiver/core.py
  2. +18
    -18
      codearchiver/modules/git.py
  3. +27
    -27
      codearchiver/storage.py

+ 32
- 32
codearchiver/core.py Bestand weergeven

@@ -64,37 +64,37 @@ class Result:
id: str id: str
'''A unique ID for this result''' '''A unique ID for this result'''


files: list[tuple[str, typing.Optional['Index']]] = dataclasses.field(default_factory = list)
'''List of filenames produced by the run, optionally with an index'''
files: list[tuple[str, typing.Optional['Metadata']]] = dataclasses.field(default_factory = list)
'''List of filenames produced by the run, optionally with metadata'''


submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list) submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
'''List of related submodules and their results''' '''List of related submodules and their results'''




class IndexValidationError(ValueError):
class MetadataValidationError(ValueError):
pass pass




@dataclasses.dataclass @dataclasses.dataclass
class IndexField:
class MetadataField:
key: str key: str
required: bool required: bool
repeatable: bool repeatable: bool




class Index(list[tuple[str, str]]):
'''An index (key-value mapping, possibly with repeated keys) of a file produced by a module'''
class Metadata(list[tuple[str, str]]):
'''Metadata (key-value mapping, possibly with repeated keys) of a file produced by a module'''


fields: tuple[IndexField] = (
IndexField('codearchiver version', required = True, repeatable = False),
IndexField('Module', required = True, repeatable = False),
IndexField('ID', required = True, repeatable = False),
IndexField('Input URL', required = True, repeatable = False),
IndexField('Filename', required = True, repeatable = False),
fields: tuple[MetadataField] = (
MetadataField('codearchiver version', required = True, repeatable = False),
MetadataField('Module', required = True, repeatable = False),
MetadataField('ID', required = True, repeatable = False),
MetadataField('Input URL', required = True, repeatable = False),
MetadataField('Filename', required = True, repeatable = False),
) )
'''The fields for this index'''
'''The fields for this metadata collection'''


_allFieldsCache: typing.Optional[tuple[IndexField]] = None
_allFieldsCache: typing.Optional[tuple[MetadataField]] = None


def append(self, *args): def append(self, *args):
if len(args) == 1: if len(args) == 1:
@@ -104,7 +104,7 @@ class Index(list[tuple[str, str]]):
# This should be a @classmethod, too, but that's deprecated since Python 3.11. # This should be a @classmethod, too, but that's deprecated since Python 3.11.
@property @property
def _allFields(self): def _allFields(self):
'''All fields known by this index, own ones and all from superclasses'''
'''All fields known by this metadata collection, own ones and all from superclasses'''


if type(self)._allFieldsCache is None: if type(self)._allFieldsCache is None:
fields = [] fields = []
@@ -114,7 +114,7 @@ class Index(list[tuple[str, str]]):
return type(self)._allFieldsCache return type(self)._allFieldsCache


def validate(self): def validate(self):
'''Check that all keys and values in the index conform to the specification'''
'''Check that all keys and values conform to the specification'''


keyCounts = collections.Counter(key for key, _ in self) keyCounts = collections.Counter(key for key, _ in self)
keys = set(keyCounts) keys = set(keyCounts)
@@ -122,29 +122,29 @@ class Index(list[tuple[str, str]]):
permittedKeys = set(field.key for field in self._allFields) permittedKeys = set(field.key for field in self._allFields)
unrecognisedKeys = keys - permittedKeys unrecognisedKeys = keys - permittedKeys
if unrecognisedKeys: if unrecognisedKeys:
raise IndexValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}')
raise MetadataValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}')


requiredKeys = set(field.key for field in self._allFields if field.required) requiredKeys = set(field.key for field in self._allFields if field.required)
missingRequiredKeys = requiredKeys - keys missingRequiredKeys = requiredKeys - keys
if missingRequiredKeys: if missingRequiredKeys:
raise IndexValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}')
raise MetadataValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}')


repeatableKeys = set(field.key for field in self._allFields if field.repeatable) repeatableKeys = set(field.key for field in self._allFields if field.repeatable)
repeatedKeys = set(key for key, count in keyCounts.items() if count > 1) repeatedKeys = set(key for key, count in keyCounts.items() if count > 1)
repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys
if repeatedUnrepeatableKeys: if repeatedUnrepeatableKeys:
raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')
raise MetadataValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')


def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool: def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool:
''' '''
Check whether the criteria match this index
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the index.
Check whether the criteria match this metadata collection
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the metadata.
Multiple criteria may use the same key to perform an AND search. Multiple criteria may use the same key to perform an AND search.
The index is a match if all criteria match.
The metadata is a match if all criteria match.
''' '''


criteria = criteria.copy() criteria = criteria.copy()
_logger.debug(f'Searching index for {criteria!r}')
_logger.debug(f'Searching metadata for {criteria!r}')
keysOfInterest = set(key for key, _ in criteria) keysOfInterest = set(key for key, _ in criteria)
for key, value in self: for key, value in self:
if key not in keysOfInterest: if key not in keysOfInterest:
@@ -169,14 +169,14 @@ class Index(list[tuple[str, str]]):
return not bool(criteria) return not bool(criteria)


def serialise(self) -> str: def serialise(self) -> str:
'''Convert the index to a string suitable for e.g. a simple text file storage'''
'''Convert the metadata to a string suitable for e.g. a simple text file storage'''


self.validate() self.validate()
return ''.join(f'{key}: {value}\n' for key, value in self) return ''.join(f'{key}: {value}\n' for key, value in self)


@classmethod @classmethod
def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True): def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True):
'''Import a serialised index from a filename or file-like object'''
'''Import a serialised metadata from a filename or file-like object'''


if isinstance(f, (str, bytes, os.PathLike)): if isinstance(f, (str, bytes, os.PathLike)):
cm = open(f, 'r') cm = open(f, 'r')
@@ -355,8 +355,8 @@ class Module(metaclass = ModuleMeta):
name: typing.Optional[str] = None name: typing.Optional[str] = None
'''The name of the module. Modules without a name are ignored. Names must be unique and may only contain a-z and hyphens.''' '''The name of the module. Modules without a name are ignored. Names must be unique and may only contain a-z and hyphens.'''


IndexClass: typing.Optional[typing.Type[Index]] = None
'''The Index class corresponding to this module, if any.'''
MetadataClass: typing.Optional[typing.Type[Metadata]] = None
'''The Metadata class corresponding to this module, if any.'''


@staticmethod @staticmethod
def matches(inputUrl: InputURL) -> bool: def matches(inputUrl: InputURL) -> bool:
@@ -376,12 +376,12 @@ class Module(metaclass = ModuleMeta):
def process(self) -> Result: def process(self) -> Result:
'''Perform the relevant retrieval(s)''' '''Perform the relevant retrieval(s)'''


def create_index(self, filename: str) -> Index:
'''Create a basic Index instance appropriate for this module'''
def create_metadata(self, filename: str) -> Metadata:
'''Create a basic Metadata instance appropriate for this module'''


if type(self).IndexClass is None or type(self).name is None:
raise RuntimeError('Module lacks an IndexClass or a name; cannot create index')
idx = type(self).IndexClass()
if type(self).MetadataClass is None or type(self).name is None:
raise RuntimeError('Module lacks an MetadataClass or a name; cannot create metadata')
idx = type(self).MetadataClass()
idx.append('codearchiver version', codearchiver.version.__version__) idx.append('codearchiver version', codearchiver.version.__version__)
idx.append('Module', type(self).name) idx.append('Module', type(self).name)
idx.append('ID', self._id) idx.append('ID', self._id)


+ 18
- 18
codearchiver/modules/git.py Bestand weergeven

@@ -11,19 +11,19 @@ import subprocess
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)




class GitIndex(codearchiver.core.Index):
class GitMetadata(codearchiver.core.Metadata):
fields = ( fields = (
codearchiver.core.IndexField(key = 'Git version', required = True, repeatable = False),
codearchiver.core.IndexField(key = 'Based on bundle', required = False, repeatable = True),
codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True),
codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True),
codearchiver.core.IndexField(key = 'Commit', required = False, repeatable = True),
codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
codearchiver.core.MetadataField(key = 'Commit', required = False, repeatable = True),
) )




class Git(codearchiver.core.Module): class Git(codearchiver.core.Module):
name = 'git' name = 'git'
IndexClass = GitIndex
MetadataClass = GitMetadata


@staticmethod @staticmethod
def matches(inputUrl): def matches(inputUrl):
@@ -63,7 +63,7 @@ class Git(codearchiver.core.Module):
_logger.error(f'Failed to fetch {commit}') _logger.error(f'Failed to fetch {commit}')
# This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored. # This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.


_logger.info(f'Collecting repository metadata for index')
_logger.info('Collecting repository metadata')
_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
_, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory) _, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory)
commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines())) commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines()))
@@ -75,12 +75,12 @@ class Git(codearchiver.core.Module):
oldCommits = {} # dict to keep the order reasonable oldCommits = {} # dict to keep the order reasonable
basedOnBundles = {} # ditto basedOnBundles = {} # ditto
if self._storage: if self._storage:
for oldBundle in self._storage.search_indices([('Root commit', c) for c in rootCommits]):
for oldBundle in self._storage.search_metadata([('Root commit', c) for c in rootCommits]):
if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach? if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach?
continue continue
_logger.info(f'Previous bundle: {oldBundle!r}') _logger.info(f'Previous bundle: {oldBundle!r}')
with self._storage.open_index(oldBundle) as fp:
idx = GitIndex.deserialise(fp)
with self._storage.open_metadata(oldBundle) as fp:
idx = GitMetadata.deserialise(fp)
for key, value in idx: for key, value in idx:
if key == 'Commit' and value in commitSet: if key == 'Commit' and value in commitSet:
oldCommits[value] = True oldCommits[value] = True
@@ -106,19 +106,19 @@ class Git(codearchiver.core.Module):
_logger.info(f'Removing clone') _logger.info(f'Removing clone')
shutil.rmtree(directory) shutil.rmtree(directory)


index = self.create_index(bundle)
index.append('Git version', gitVersion)
metadata = self.create_metadata(bundle)
metadata.append('Git version', gitVersion)
for oldBundle in basedOnBundles: for oldBundle in basedOnBundles:
index.append('Based on bundle', oldBundle)
metadata.append('Based on bundle', oldBundle)
for line in refs.splitlines(): for line in refs.splitlines():
index.append('Ref', line)
metadata.append('Ref', line)
for commitHash, *parents in commits: for commitHash, *parents in commits:
if commitHash not in oldCommits: if commitHash not in oldCommits:
index.append('Commit', commitHash)
metadata.append('Commit', commitHash)
if not parents: if not parents:
index.append('Root commit', commitHash)
metadata.append('Root commit', commitHash)


return codearchiver.core.Result(id = self._id, files = [(bundle, index)])
return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])


def __repr__(self): def __repr__(self):
return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})' return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'

+ 27
- 27
codearchiver/storage.py Bestand weergeven

@@ -14,28 +14,28 @@ _logger = logging.getLogger(__name__)


class Storage(abc.ABC): class Storage(abc.ABC):
@abc.abstractmethod @abc.abstractmethod
def put(self, filename: str, index: typing.Optional['codearchiver.core.Index'] = None):
'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
def put(self, filename: str, metadata: typing.Optional['codearchiver.core.Metadata'] = None):
'''Put a local file and (if provided) its metadata into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''


def put_result(self, result: 'codearchiver.core.Result'): def put_result(self, result: 'codearchiver.core.Result'):
'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.''' '''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.'''
for fn, index in result.files:
self.put(fn, index)
for fn, metadata in result.files:
self.put(fn, metadata)
for _, subresult in result.submoduleResults: for _, subresult in result.submoduleResults:
self.put_result(subresult) self.put_result(subresult)


@abc.abstractmethod @abc.abstractmethod
def search_indices(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]:
def search_metadata(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]:
''' '''
Search all indices in storage by criteria.
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in a file's index.
Search all metadata in storage by criteria.
Refer to `codearchiver.core.Metadata.matches` for the semantics of `criteria`.
Yields all filenames where all criteria match. Yields all filenames where all criteria match.
''' '''


@abc.abstractmethod @abc.abstractmethod
@contextlib.contextmanager @contextlib.contextmanager
def open_index(self, filename: str) -> typing.TextIO:
'''Open the index for a file in serialised form.'''
def open_metadata(self, filename: str) -> typing.TextIO:
'''Open the metadata for a file in serialised form.'''


@abc.abstractmethod @abc.abstractmethod
@contextlib.contextmanager @contextlib.contextmanager
@@ -58,35 +58,35 @@ class DirectoryStorage(Storage):
if not self._check_directory(): if not self._check_directory():
os.makedirs(self._directory) os.makedirs(self._directory)


def put(self, filename, index = None):
def put(self, filename, metadata = None):
self._ensure_directory() self._ensure_directory()
#FIXME: Race condition #FIXME: Race condition
if os.path.exists((targetFilename := os.path.join(self._directory, os.path.basename(filename)))): if os.path.exists((targetFilename := os.path.join(self._directory, os.path.basename(filename)))):
raise FileExistsError(f'{targetFilename} already exists') raise FileExistsError(f'{targetFilename} already exists')
_logger.info(f'Moving {filename} to {self._directory}') _logger.info(f'Moving {filename} to {self._directory}')
shutil.move(filename, self._directory) shutil.move(filename, self._directory)
if not index:
if not metadata:
return return
indexFilename = os.path.join(self._directory, f'{filename}.codearchiver-index')
metadataFilename = os.path.join(self._directory, f'{filename}.codearchiver-metadata')
# No need to check for existence here thanks to the 'x' mode # No need to check for existence here thanks to the 'x' mode
_logger.info(f'Writing index for {filename} to {indexFilename}')
with open(indexFilename, 'x') as fp:
fp.write(index.serialise())
def search_indices(self, criteria):
_logger.info(f'Searching indices by criteria: {criteria!r}')
for indexFilename in glob.glob('*.codearchiver-index', root_dir = self._directory):
_logger.info(f'Searching index {indexFilename}')
with self.open(indexFilename, 'r') as fp:
idx = codearchiver.core.Index.deserialise(fp, validate = False)
_logger.info(f'Writing metadata for {filename} to {metadataFilename}')
with open(metadataFilename, 'x') as fp:
fp.write(metadata.serialise())
def search_metadata(self, criteria):
_logger.info(f'Searching metadata by criteria: {criteria!r}')
for metadataFilename in glob.glob('*.codearchiver-metadata', root_dir = self._directory):
_logger.info(f'Searching metadata {metadataFilename}')
with self.open(metadataFilename, 'r') as fp:
idx = codearchiver.core.Metadata.deserialise(fp, validate = False)
if idx.matches(criteria): if idx.matches(criteria):
_logger.info(f'Found index match {indexFilename}')
yield indexFilename.rsplit('.', 1)[0]
_logger.info('Done searching indices')
_logger.info(f'Found metadata match {metadataFilename}')
yield metadataFilename.rsplit('.', 1)[0]
_logger.info('Done searching metadata')


@contextlib.contextmanager @contextlib.contextmanager
def open_index(self, filename):
with self.open(f'{filename}.codearchiver-index', 'r') as fp:
def open_metadata(self, filename):
with self.open(f'{filename}.codearchiver-metadata', 'r') as fp:
yield fp yield fp


@contextlib.contextmanager @contextlib.contextmanager


Laden…
Annuleren
Opslaan