From 8e83c9b7b49b9a498482b48619d0b358bb8e1420 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Thu, 9 Mar 2023 10:53:02 +0000
Subject: [PATCH] Support incremental Git bundles

Also fix a small discrepancy between the commit list and bundle due to --reflog vs --all
---
 codearchiver/cli.py         |  2 +-
 codearchiver/core.py        | 42 +++++++++++++++++++++++++++++++++---
 codearchiver/modules/git.py | 35 ++++++++++++++++++++++++------
 codearchiver/storage.py     | 43 +++++++++++++++++++++++++++++++------
 4 files changed, 106 insertions(+), 16 deletions(-)

diff --git a/codearchiver/cli.py b/codearchiver/cli.py
index 88d3cb4..ef471e6 100644
--- a/codearchiver/cli.py
+++ b/codearchiver/cli.py
@@ -190,8 +190,8 @@ def main():
 	import codearchiver.storage
 	with _dump_locals_on_exception():
 		inputUrl = codearchiver.core.InputURL(args.url)
-		module = codearchiver.core.get_module_instance(inputUrl)
 		storage = codearchiver.storage.DirectoryStorage(os.getcwd())
+		module = codearchiver.core.get_module_instance(inputUrl, storage = storage)
 		with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.', dir = os.getcwd()) as td:
 			_logger.debug(f'Running in {td}')
 			os.chdir(td)
diff --git a/codearchiver/core.py b/codearchiver/core.py
index 11fa67e..945b991 100644
--- a/codearchiver/core.py
+++ b/codearchiver/core.py
@@ -1,5 +1,6 @@
 import abc
 #import codearchiver.modules # In get_module_class
+import codearchiver.storage
 import codearchiver.version
 import collections
 import contextlib
@@ -113,6 +114,39 @@ class Index(list[tuple[str, str]]):
 		if repeatedUnrepeatableKeys:
 			raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')
 
+	def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool:
+		'''
+		Check whether the criteria match this index
+		Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the index.
+		Multiple criteria may use the same key to perform an AND search.
+		The index is a match if all criteria match.
+		'''
+
+		criteria = criteria.copy()
+		_logger.debug(f'Searching index for {criteria!r}')
+		keysOfInterest = set(key for key, _ in criteria)
+		for key, value in self:
+			if key not in keysOfInterest:
+				continue
+			_logger.debug(f'Potentially interesting entry: {key!r} = {value!r}')
+			matched = [] # Indices to remove from remaining criteria
+			for i, (keyCriterion, valueCriterion) in enumerate(criteria):
+				if keyCriterion != key:
+					continue
+				if isinstance(valueCriterion, str) and valueCriterion == value:
+					_logger.debug('Str match')
+					matched.append(i)
+				elif isinstance(valueCriterion, tuple) and value in valueCriterion:
+					_logger.debug('Tuple match')
+					matched.append(i)
+			for i in reversed(matched):
+				_logger.debug(f'Matched remaining criterion {i}: {criteria[i]}')
+				del criteria[i]
+			if not criteria:
+				break
+		_logger.debug(f'Remaining unmatched criteria: {criteria!r}')
+		return not bool(criteria)
+
 	def serialise(self) -> str:
 		'''Convert the index to a string suitable for e.g. a simple text file storage'''
 
@@ -120,7 +154,7 @@ class Index(list[tuple[str, str]]):
 		return ''.join(f'{key}: {value}\n' for key, value in self)
 
 	@classmethod
-	def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO]):
+	def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True):
 		'''Import a serialised index from a filename or file-like object'''
 
 		if isinstance(f, (str, bytes, os.PathLike)):
@@ -129,7 +163,8 @@ class Index(list[tuple[str, str]]):
 			cm = contextlib.nullcontext(f)
 		with cm as fp:
 			o = cls((key, value[:-1]) for key, value in map(functools.partial(str.split, sep = ': '), fp))
-		o.validate()
+		if validate:
+			o.validate()
 		return o
 
 
@@ -304,9 +339,10 @@ class Module(metaclass = ModuleMeta):
 		'''Whether or not this module is for handling `inputUrl`.'''
 		return False
 
-	def __init__(self, inputUrl: InputURL, id_: typing.Optional[str] = None):
+	def __init__(self, inputUrl: InputURL, storage: typing.Optional[codearchiver.storage.Storage] = None, id_: typing.Optional[str] = None):
 		self._inputUrl = inputUrl
 		self._url = inputUrl.url
+		self._storage = storage
 		self._id = id_
 		self._httpClient = HttpClient()
 
diff --git a/codearchiver/modules/git.py b/codearchiver/modules/git.py
index 5e8ad84..9714809 100644
--- a/codearchiver/modules/git.py
+++ b/codearchiver/modules/git.py
@@ -13,6 +13,7 @@ _logger = logging.getLogger(__name__)
 
 class GitIndex(codearchiver.core.Index):
 	fields = [
+		codearchiver.core.IndexField(key = 'Based on bundle', required = False, repeatable = True),
 		codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True),
 		codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True),
 		codearchiver.core.IndexField(key = 'Commit', required = True, repeatable = True),
@@ -57,21 +58,43 @@ class Git(codearchiver.core.Module):
 			else:
 				_logger.error(f'Failed to fetch {commit}')
 
-		_logger.info(f'Bundling into {bundle}')
-		codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--all'], cwd = directory)
-
 		_logger.info(f'Collecting repository metadata for index')
 		_, refs = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
-		_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--format=format:%H% P'], cwd = directory)
+		_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory)
+		commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines()))
+		rootCommits = [c[0] for c in commits if len(c) == 1]
+
+		# Check whether there are relevant prior bundles to create an incremental one
+		# Collect their commits shared with this clone (else `git bundle` complains about 'bad object')
+		commitSet = set(c[0] for c in commits) # For fast lookup
+		oldCommits = {} # dict to keep the order reasonable
+		basedOnBundles = {} # ditto
+		if self._storage:
+			for oldBundle in self._storage.search_indices([('Root commit', c) for c in rootCommits]):
+				if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach?
+					continue
+				_logger.info(f'Previous bundle: {oldBundle!r}')
+				with self._storage.open_index(oldBundle) as fp:
+					idx = GitIndex.deserialise(fp)
+				for key, value in idx:
+					if key == 'Commit' and value in commitSet:
+						oldCommits[value] = True
+						basedOnBundles[oldBundle] = True
+
+		_logger.info(f'Bundling into {bundle}')
+		codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{commit}\n' for commit in oldCommits).encode('ascii'))
 
 		_logger.info(f'Removing clone')
 		shutil.rmtree(directory)
 
 		index = GitIndex()
+		for oldBundle in basedOnBundles:
+			index.append('Based on bundle', oldBundle)
 		for line in refs.splitlines():
 			index.append('Ref', line)
-		for commitHash, *parents in map(functools.partial(str.split, sep = ' '), commits.splitlines()):
-			index.append('Commit', commitHash)
+		for commitHash, *parents in commits:
+			if commitHash not in oldCommits:
+				index.append('Commit', commitHash)
 			if not parents:
 				index.append('Root commit', commitHash)
 
diff --git a/codearchiver/storage.py b/codearchiver/storage.py
index bd7641c..8a3bcbd 100644
--- a/codearchiver/storage.py
+++ b/codearchiver/storage.py
@@ -1,6 +1,8 @@
 import abc
 import codearchiver.core
+import collections.abc
 import contextlib
+import glob
 import logging
 import os.path
 import shutil
@@ -12,20 +14,33 @@ _logger = logging.getLogger(__name__)
 
 class Storage(abc.ABC):
 	@abc.abstractmethod
-	def put(self, filename: str, index: typing.Optional[codearchiver.core.Index] = None):
+	def put(self, filename: str, index: typing.Optional['codearchiver.core.Index'] = None):
 		'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
 
-	def put_result(self, result: codearchiver.core.Result):
+	def put_result(self, result: 'codearchiver.core.Result'):
 		'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.'''
 		for fn, index in result.files:
 			self.put(fn, index)
 		for _, subresult in result.submoduleResults:
 			self.put_result(subresult)
 
+	@abc.abstractmethod
+	def search_indices(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]:
+		'''
+		Search all indices in storage by criteria.
+		Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in a file's index.
+		Yields all filenames where all criteria match.
+		'''
+
+	@abc.abstractmethod
+	@contextlib.contextmanager
+	def open_index(self, filename: str) -> typing.TextIO:
+		'''Open the index for a file in serialised form.'''
+
 	@abc.abstractmethod
 	@contextlib.contextmanager
-	def open(self, filename: str) -> typing.Iterator[typing.BinaryIO]:
-		'''Open a file from storage.'''
+	def open(self, filename: str, mode: typing.Optional[str] = 'rb') -> typing.Iterator[typing.Union[typing.BinaryIO, typing.TextIO]]:
+		'''Open a file from storage. The mode must be r or rb.'''
 
 
 class DirectoryStorage(Storage):
@@ -58,7 +73,23 @@ class DirectoryStorage(Storage):
 		with open(indexFilename, 'x') as fp:
 			fp.write(index.serialise())
 
+	def search_indices(self, criteria):
+		_logger.info(f'Searching indices by criteria: {criteria!r}')
+		for indexFilename in glob.glob('*.codearchiver-index', root_dir = self._directory):
+			_logger.info(f'Searching index {indexFilename}')
+			with self.open(indexFilename, 'r') as fp:
+				idx = codearchiver.core.Index.deserialise(fp, validate = False)
+			if idx.matches(criteria):
+				_logger.info(f'Found index match {indexFilename}')
+				yield indexFilename.rsplit('.', 1)[0]
+		_logger.info('Done searching indices')
+
+	@contextlib.contextmanager
+	def open_index(self, filename):
+		with self.open(f'{filename}.codearchiver-index', 'r') as fp:
+			yield fp
+
 	@contextlib.contextmanager
-	def open(self, filename):
-		with open(filename, 'rb') as fp:
+	def open(self, filename, mode = 'rb'):
+		with open(os.path.join(self._directory, filename), mode) as fp:
 			yield fp
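
Usage note (illustrative sketch, not part of the patch): the new `Index.matches` takes a list of criteria, where each criterion is a `(key, value)` or `(key, (value, ...))` pair. Values inside a tuple are alternatives (OR), while separate criteria must all match (AND), even when they repeat a key. The index entries below are made-up values:

	import codearchiver.core

	idx = codearchiver.core.Index([
		('Root commit', 'd34db33f'),
		('Commit', 'd34db33f'),
		('Commit', 'c0ffee12'),
	])
	# Single value: the key must carry exactly this value somewhere in the index.
	assert idx.matches([('Root commit', 'd34db33f')])
	# Tuple of values: any one of them suffices (OR).
	assert idx.matches([('Commit', ('c0ffee12', 'aaaa0000'))])
	# Repeated key across criteria: every criterion must still match (AND).
	assert not idx.matches([('Commit', 'd34db33f'), ('Commit', 'ffff0000')])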
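
On the storage side, `Storage.search_indices` yields the names of stored files whose index satisfies the criteria (without the `.codearchiver-index` suffix), and `open_index` exposes the serialised index itself; this is what the Git module uses to locate prior bundles. A rough sketch of driving the `DirectoryStorage` implementation directly (the directory and hash are placeholders):

	import codearchiver.core
	import codearchiver.storage

	storage = codearchiver.storage.DirectoryStorage('/path/to/archive')
	for filename in storage.search_indices([('Root commit', 'd34db33f')]):
		with storage.open_index(filename) as fp:
			# validate = False mirrors DirectoryStorage.search_indices: the generic Index
			# class does not know module-specific field requirements.
			index = codearchiver.core.Index.deserialise(fp, validate = False)
		print(filename, index)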
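
The incremental bundling itself relies on standard `git bundle` behaviour: revision arguments fed via `--stdin` with a leading `^` become prerequisites, so objects already covered by earlier bundles are left out, and the new `Based on bundle` index entries record which earlier bundles are needed to complete the history. A stand-alone sketch of that mechanism outside codearchiver (repository path, bundle path, and commit hash are placeholders):

	import subprocess

	def create_incremental_bundle(repoDir, bundlePath, excludedCommits):
		'''Bundle all refs of repoDir, excluding history reachable from excludedCommits.'''
		revs = ''.join(f'^{commit}\n' for commit in excludedCommits)
		subprocess.run(
			['git', 'bundle', 'create', bundlePath, '--stdin', '--reflog', '--all'],
			cwd = repoDir, input = revs.encode('ascii'), check = True,
		)

	# Excluding the commits of a previous bundle leaves only the newer objects in the new bundle;
	# `git bundle verify` on the result then reports the excluded commits as prerequisites.
	create_incremental_bundle('/path/to/clone', 'repo_incremental.bundle', ['d34db33f'])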