Also fix a small discrepancy between the commit list and the bundle due to --reflog vs --all
@@ -190,8 +190,8 @@ def main(): | |||
import codearchiver.storage | |||
with _dump_locals_on_exception(): | |||
inputUrl = codearchiver.core.InputURL(args.url) | |||
module = codearchiver.core.get_module_instance(inputUrl) | |||
storage = codearchiver.storage.DirectoryStorage(os.getcwd()) | |||
module = codearchiver.core.get_module_instance(inputUrl, storage = storage) | |||
with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.', dir = os.getcwd()) as td: | |||
_logger.debug(f'Running in {td}') | |||
os.chdir(td) | |||
@@ -1,5 +1,6 @@ | |||
import abc | |||
#import codearchiver.modules # In get_module_class | |||
import codearchiver.storage | |||
import codearchiver.version | |||
import collections | |||
import contextlib | |||
@@ -113,6 +114,39 @@ class Index(list[tuple[str, str]]): | |||
if repeatedUnrepeatableKeys: | |||
raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}') | |||
def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> bool:
	'''
	Check whether the criteria match this index

	Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in the index.
	Multiple criteria may use the same key to perform an AND search.
	The index is a match if all criteria match.
	'''

	# Work on a copy so the caller's list is never mutated; satisfied criteria get removed as they match.
	remaining = criteria.copy()
	_logger.debug(f'Searching index for {remaining!r}')
	# Only index entries whose key appears in some criterion can affect the result.
	relevantKeys = {key for key, _ in remaining}
	for key, value in self:
		if key not in relevantKeys:
			continue
		_logger.debug(f'Potentially interesting entry: {key!r} = {value!r}')
		# Collect the positions of criteria satisfied by this entry, then delete them
		# back to front so earlier positions stay valid during removal.
		hits = []
		for pos, (critKey, critValue) in enumerate(remaining):
			if critKey != key:
				continue
			if isinstance(critValue, str) and critValue == value:
				_logger.debug('Str match')
				hits.append(pos)
			elif isinstance(critValue, tuple) and value in critValue:
				_logger.debug('Tuple match')
				hits.append(pos)
		for pos in reversed(hits):
			_logger.debug(f'Matched remaining criterion {pos}: {remaining[pos]}')
			del remaining[pos]
		if not remaining:
			# All criteria satisfied; no need to scan the rest of the index.
			break
	_logger.debug(f'Remaining unmatched criteria: {remaining!r}')
	return not bool(remaining)
def serialise(self) -> str: | |||
'''Convert the index to a string suitable for e.g. a simple text file storage''' | |||
@@ -120,7 +154,7 @@ class Index(list[tuple[str, str]]): | |||
return ''.join(f'{key}: {value}\n' for key, value in self) | |||
@classmethod | |||
def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO]): | |||
def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True): | |||
'''Import a serialised index from a filename or file-like object''' | |||
if isinstance(f, (str, bytes, os.PathLike)): | |||
@@ -129,7 +163,8 @@ class Index(list[tuple[str, str]]): | |||
cm = contextlib.nullcontext(f) | |||
with cm as fp: | |||
o = cls((key, value[:-1]) for key, value in map(functools.partial(str.split, sep = ': '), fp)) | |||
o.validate() | |||
if validate: | |||
o.validate() | |||
return o | |||
@@ -304,9 +339,10 @@ class Module(metaclass = ModuleMeta): | |||
'''Whether or not this module is for handling `inputUrl`.''' | |||
return False | |||
def __init__(self, inputUrl: InputURL, id_: typing.Optional[str] = None): | |||
def __init__(self, inputUrl: InputURL, storage: typing.Optional[codearchiver.storage.Storage] = None, id_: typing.Optional[str] = None): | |||
self._inputUrl = inputUrl | |||
self._url = inputUrl.url | |||
self._storage = storage | |||
self._id = id_ | |||
self._httpClient = HttpClient() | |||
@@ -13,6 +13,7 @@ _logger = logging.getLogger(__name__) | |||
class GitIndex(codearchiver.core.Index): | |||
fields = [ | |||
codearchiver.core.IndexField(key = 'Based on bundle', required = False, repeatable = True), | |||
codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True), | |||
codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True), | |||
codearchiver.core.IndexField(key = 'Commit', required = True, repeatable = True), | |||
@@ -57,21 +58,43 @@ class Git(codearchiver.core.Module): | |||
else: | |||
_logger.error(f'Failed to fetch {commit}') | |||
_logger.info(f'Bundling into {bundle}') | |||
codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--all'], cwd = directory) | |||
_logger.info(f'Collecting repository metadata for index') | |||
_, refs = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) | |||
_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--format=format:%H% P'], cwd = directory) | |||
_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory) | |||
commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines())) | |||
rootCommits = [c[0] for c in commits if len(c) == 1] | |||
# Check whether there are relevant prior bundles to create an incremental one | |||
# Collect their commits shared with this clone (else `git bundle` complains about 'bad object') | |||
commitSet = set(c[0] for c in commits) # For fast lookup | |||
oldCommits = {} # dict to keep the order reasonable | |||
basedOnBundles = {} # ditto | |||
if self._storage: | |||
for oldBundle in self._storage.search_indices([('Root commit', c) for c in rootCommits]): | |||
if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach? | |||
continue | |||
_logger.info(f'Previous bundle: {oldBundle!r}') | |||
with self._storage.open_index(oldBundle) as fp: | |||
idx = GitIndex.deserialise(fp) | |||
for key, value in idx: | |||
if key == 'Commit' and value in commitSet: | |||
oldCommits[value] = True | |||
basedOnBundles[oldBundle] = True | |||
_logger.info(f'Bundling into {bundle}') | |||
codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{commit}\n' for commit in oldCommits).encode('ascii')) | |||
_logger.info(f'Removing clone') | |||
shutil.rmtree(directory) | |||
index = GitIndex() | |||
for oldBundle in basedOnBundles: | |||
index.append('Based on bundle', oldBundle) | |||
for line in refs.splitlines(): | |||
index.append('Ref', line) | |||
for commitHash, *parents in map(functools.partial(str.split, sep = ' '), commits.splitlines()): | |||
index.append('Commit', commitHash) | |||
for commitHash, *parents in commits: | |||
if commitHash not in oldCommits: | |||
index.append('Commit', commitHash) | |||
if not parents: | |||
index.append('Root commit', commitHash) | |||
@@ -1,6 +1,8 @@ | |||
import abc | |||
import codearchiver.core | |||
import collections.abc | |||
import contextlib | |||
import glob | |||
import logging | |||
import os.path | |||
import shutil | |||
@@ -12,20 +14,33 @@ _logger = logging.getLogger(__name__) | |||
class Storage(abc.ABC): | |||
@abc.abstractmethod | |||
def put(self, filename: str, index: typing.Optional[codearchiver.core.Index] = None): | |||
def put(self, filename: str, index: typing.Optional['codearchiver.core.Index'] = None): | |||
'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.''' | |||
def put_result(self, result: codearchiver.core.Result): | |||
def put_result(self, result: 'codearchiver.core.Result'): | |||
'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.''' | |||
for fn, index in result.files: | |||
self.put(fn, index) | |||
for _, subresult in result.submoduleResults: | |||
self.put_result(subresult) | |||
@abc.abstractmethod | |||
def search_indices(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]: | |||
''' | |||
Search all indices in storage by criteria. | |||
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in a file's index. | |||
Yields all filenames where all criteria match. | |||
''' | |||
@abc.abstractmethod | |||
@contextlib.contextmanager | |||
def open_index(self, filename: str) -> typing.TextIO: | |||
'''Open the index for a file in serialised form.''' | |||
@abc.abstractmethod | |||
@contextlib.contextmanager | |||
def open(self, filename: str) -> typing.Iterator[typing.BinaryIO]: | |||
'''Open a file from storage.''' | |||
def open(self, filename: str, mode: typing.Optional[str] = 'rb') -> typing.Iterator[typing.Union[typing.BinaryIO, typing.TextIO]]: | |||
'''Open a file from storage. The mode must be r or rb.''' | |||
class DirectoryStorage(Storage): | |||
@@ -58,7 +73,23 @@ class DirectoryStorage(Storage): | |||
with open(indexFilename, 'x') as fp: | |||
fp.write(index.serialise()) | |||
def search_indices(self, criteria):
	'''Yield the names of all stored files whose index matches all of `criteria` (see `Index.matches`).'''

	_logger.info(f'Searching indices by criteria: {criteria!r}')
	indexFiles = glob.glob('*.codearchiver-index', root_dir = self._directory)
	for name in indexFiles:
		_logger.info(f'Searching index {name}')
		with self.open(name, 'r') as fp:
			# No validation here: stored indices may predate the current field rules.
			index = codearchiver.core.Index.deserialise(fp, validate = False)
		if not index.matches(criteria):
			continue
		_logger.info(f'Found index match {name}')
		# Strip the '.codearchiver-index' extension to recover the stored file's name.
		yield name.rsplit('.', 1)[0]
	_logger.info('Done searching indices')
@contextlib.contextmanager
def open_index(self, filename):
	'''Open the serialised index of `filename` (stored as `<filename>.codearchiver-index`) for reading as text.'''

	# Bug fix: the f-string previously contained a literal placeholder instead of interpolating
	# `filename`, so the parameter was unused and every call opened the same nonexistent file.
	# The `<filename>.codearchiver-index` naming matches `put` and `search_indices`.
	with self.open(f'{filename}.codearchiver-index', 'r') as fp:
		yield fp
@contextlib.contextmanager | |||
def open(self, filename): | |||
with open(filename, 'rb') as fp: | |||
def open(self, filename, mode = 'rb'): | |||
with open(os.path.join(self._directory, filename), mode) as fp: | |||
yield fp |