From 0f1f5abc648671afe9a31a4172ac696f49e2f1c1 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Thu, 9 Mar 2023 07:55:40 +0000
Subject: [PATCH] Add indices for files

---
Review amendments (this section is ignored by `git am`):
- Index.deserialise splits each line on the first ': ' only and strips just the
  line terminator, so values containing ': ' and an unterminated last line survive.
- DirectoryStorage.put names the index after the base name of the moved file and
  only skips writing when no index was passed at all.
- Git.process lists commits with --all in addition to --reflog, matching the
  refs that go into the bundle.
- Docstrings added for the new classes; the blob IDs on the `index` lines
  predate these amendments.

 codearchiver/core.py        | 81 ++++++++++++++++++++++++++++++++++++-
 codearchiver/modules/git.py | 26 +++++++++++-
 codearchiver/storage.py     | 17 +++++---
 3 files changed, 116 insertions(+), 8 deletions(-)

diff --git a/codearchiver/core.py b/codearchiver/core.py
index f5e20cf..dac2077 100644
--- a/codearchiver/core.py
+++ b/codearchiver/core.py
@@ -2,8 +2,11 @@ import abc
 import collections
 #import codearchiver.modules # In get_module_class
 import codearchiver.version
+import contextlib
 import dataclasses
+import functools
 import logging
+import os
 import queue
 import requests
 import time
@@ -59,13 +62,87 @@ class Result:
 	id: str
 	'''A unique ID for this result'''
 
-	files: list[str] = dataclasses.field(default_factory = list)
-	'''List of filenames produced by the run'''
+	files: list[tuple[str, typing.Optional['Index']]] = dataclasses.field(default_factory = list)
+	'''List of filenames produced by the run, optionally with an index'''
 
 	submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
 	'''List of related submodules and their results'''
 
 
+class IndexValidationError(ValueError):
+	'''The keys of an index do not conform to the fields declared by its class.'''
+
+
+@dataclasses.dataclass
+class IndexField:
+	'''Specification of one key that an index may contain'''
+
+	key: str
+	'''The key of the entries this field describes'''
+
+	required: bool
+	'''Whether at least one entry with this key must be present'''
+
+	repeatable: bool
+	'''Whether more than one entry with this key may be present'''
+
+
+class Index(list[tuple[str, str]]):
+	'''An index (key-value mapping, possibly with repeated keys) of a file produced by a module'''
+
+	fields: list[IndexField] = []
+	'''The fields for this index'''
+
+	def append(self, *args):
+		'''Append an entry, given either as one (key, value) tuple or as separate key and value arguments'''
+
+		if len(args) == 1:
+			args = args[0]
+		return super().append(args)
+
+	def validate(self):
+		'''Check that the keys in the index conform to the `fields` specification; raises `IndexValidationError` if they do not'''
+
+		keyCounts = collections.Counter(key for key, _ in self)
+		keys = set(keyCounts)
+
+		permittedKeys = set(field.key for field in type(self).fields)
+		unrecognisedKeys = keys - permittedKeys
+		if unrecognisedKeys:
+			raise IndexValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}')
+
+		requiredKeys = set(field.key for field in type(self).fields if field.required)
+		missingRequiredKeys = requiredKeys - keys
+		if missingRequiredKeys:
+			raise IndexValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}')
+
+		repeatableKeys = set(field.key for field in type(self).fields if field.repeatable)
+		repeatedKeys = set(key for key, count in keyCounts.items() if count > 1)
+		repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys
+		if repeatedUnrepeatableKeys:
+			raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')
+
+	def serialise(self) -> str:
+		'''Convert the index to a string suitable for e.g. a simple text file storage'''
+
+		self.validate()
+		return ''.join(f'{key}: {value}\n' for key, value in self)
+
+	@classmethod
+	def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO]):
+		'''Import a serialised index from a filename or file-like object'''
+
+		if isinstance(f, (str, bytes, os.PathLike)):
+			cm = open(f, 'r')
+		else:
+			cm = contextlib.nullcontext(f)
+		with cm as fp:
+			# Split on the first separator only since values may contain ': ' themselves, and strip nothing but the line terminator
+			o = cls((key, value.removesuffix('\n')) for key, value in map(functools.partial(str.split, sep = ': ', maxsplit = 1), fp))
+		o.validate()
+		return o
+
+
 class HttpError(Exception):
 	'''An HTTP request failed too many times.'''
 
diff --git a/codearchiver/modules/git.py b/codearchiver/modules/git.py
index 7223955..ca12f9c 100644
--- a/codearchiver/modules/git.py
+++ b/codearchiver/modules/git.py
@@ -1,6 +1,7 @@
 import codearchiver.core
 import codearchiver.subprocess
 import datetime
+import functools
 import logging
 import os.path
 import shutil
@@ -10,6 +11,16 @@ import subprocess
 logger = logging.getLogger(__name__)
 
 
+class GitIndex(codearchiver.core.Index):
+	'''Index of a Git bundle, listing the refs, commits, and root commits of the bundled repository'''
+
+	fields = [
+		codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True),
+		codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True),
+		codearchiver.core.IndexField(key = 'Commit', required = True, repeatable = True),
+	]
+
+
 class Git(codearchiver.core.Module):
 	name = 'git'
 
@@ -51,10 +62,23 @@ class Git(codearchiver.core.Module):
 		logger.info(f'Bundling into {bundle}')
 		codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--all'], cwd = directory)
 
+		logger.info('Collecting repository metadata for index')
+		_, refs = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
+		# --all as well as --reflog so that commits reachable only from refs lacking a reflog are listed too; '% P' adds the space-separated parent hashes if there are any
+		_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory)
+
 		logger.info(f'Removing clone')
 		shutil.rmtree(directory)
 
-		return codearchiver.core.Result(id = self._id, files = [bundle])
+		index = GitIndex()
+		for line in refs.splitlines():
+			index.append('Ref', line)
+		for commitHash, *parents in map(functools.partial(str.split, sep = ' '), commits.splitlines()):
+			index.append('Commit', commitHash)
+			if not parents:
+				index.append('Root commit', commitHash)
+
+		return codearchiver.core.Result(id = self._id, files = [(bundle, index)])
 
 	def __repr__(self):
 		return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'
diff --git a/codearchiver/storage.py b/codearchiver/storage.py
index 7b42ab2..ee56afd 100644
--- a/codearchiver/storage.py
+++ b/codearchiver/storage.py
@@ -12,13 +12,13 @@ _logger = logging.getLogger(__name__)
 
 class Storage(abc.ABC):
 	@abc.abstractmethod
-	def put(self, filename: str):
-		'''Put a local file into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
+	def put(self, filename: str, index: typing.Optional[codearchiver.core.Index] = None):
+		'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
 
 	def put_result(self, result: codearchiver.core.Result):
 		'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.'''
-		for fn in result.files:
-			self.put(fn)
+		for fn, index in result.files:
+			self.put(fn, index)
 		for _, subresult in result.submoduleResults:
 			self.put_result(subresult)
 
@@ -43,10 +43,17 @@ class DirectoryStorage(Storage):
 		if not self._check_directory():
 			os.makedirs(self._directory)
 
-	def put(self, filename):
+	def put(self, filename, index = None):
 		self._ensure_directory()
 		_logger.info(f'Moving {filename} to {self._directory}')
 		shutil.move(filename, self._directory)
+		if index is None:
+			return
+		# shutil.move placed the file directly inside the directory under its base name, so the index goes right next to it
+		indexFilename = os.path.join(self._directory, f'{os.path.basename(filename)}.codearchiver-index')
+		_logger.info(f'Writing index for {filename} to {indexFilename}')
+		with open(indexFilename, 'x') as fp:
+			fp.write(index.serialise())
 
 	@contextlib.contextmanager
 	def open(self, filename):