Also fix a small discrepancy between the commit list and the bundle due to --reflog vs --all
tags/v1.0
@@ -190,8 +190,8 @@ def main(): | |||||
import codearchiver.storage | import codearchiver.storage | ||||
with _dump_locals_on_exception(): | with _dump_locals_on_exception(): | ||||
inputUrl = codearchiver.core.InputURL(args.url) | inputUrl = codearchiver.core.InputURL(args.url) | ||||
module = codearchiver.core.get_module_instance(inputUrl) | |||||
storage = codearchiver.storage.DirectoryStorage(os.getcwd()) | storage = codearchiver.storage.DirectoryStorage(os.getcwd()) | ||||
module = codearchiver.core.get_module_instance(inputUrl, storage = storage) | |||||
with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.', dir = os.getcwd()) as td: | with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.', dir = os.getcwd()) as td: | ||||
_logger.debug(f'Running in {td}') | _logger.debug(f'Running in {td}') | ||||
os.chdir(td) | os.chdir(td) | ||||
@@ -1,5 +1,6 @@ | |||||
import abc | import abc | ||||
#import codearchiver.modules # In get_module_class | #import codearchiver.modules # In get_module_class | ||||
import codearchiver.storage | |||||
import codearchiver.version | import codearchiver.version | ||||
import collections | import collections | ||||
import contextlib | import contextlib | ||||
@@ -113,6 +114,39 @@ class Index(list[tuple[str, str]]): | |||||
if repeatedUnrepeatableKeys: | if repeatedUnrepeatableKeys: | ||||
raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}') | raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}') | ||||
def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str, ...]]]]) -> bool:
    '''
    Check whether the criteria match this index

    Each criterion is a `(key, values)` pair, where `values` is either a single string or a tuple of acceptable strings. A criterion matches if at least one of its values is present in the index under that key.
    Multiple criteria may use the same key to perform an AND search.
    The index is a match if all criteria match; an empty list of criteria therefore always matches.

    The passed criteria are not modified.
    '''

    remaining = list(criteria)  # Private copy; satisfied criteria get removed from it below
    _logger.debug(f'Searching index for {remaining!r}')
    keysOfInterest = {key for key, _ in remaining}
    for key, value in self:
        if not remaining:
            # Everything is satisfied already (or there was nothing to check), so stop scanning
            break
        if key not in keysOfInterest:
            continue
        _logger.debug(f'Potentially interesting entry: {key!r} = {value!r}')
        matched = []  # Positions in `remaining` satisfied by this entry
        for i, (keyCriterion, valueCriterion) in enumerate(remaining):
            if keyCriterion != key:
                continue
            if isinstance(valueCriterion, str) and valueCriterion == value:
                _logger.debug('Str match')
                matched.append(i)
            elif isinstance(valueCriterion, tuple) and value in valueCriterion:
                _logger.debug('Tuple match')
                matched.append(i)
        # Delete back to front so that the positions still to be deleted stay valid
        for i in reversed(matched):
            _logger.debug(f'Matched remaining criterion {i}: {remaining[i]}')
            del remaining[i]
    _logger.debug(f'Remaining unmatched criteria: {remaining!r}')
    return not remaining
def serialise(self) -> str: | def serialise(self) -> str: | ||||
'''Convert the index to a string suitable for e.g. a simple text file storage''' | '''Convert the index to a string suitable for e.g. a simple text file storage''' | ||||
@@ -120,7 +154,7 @@ class Index(list[tuple[str, str]]): | |||||
return ''.join(f'{key}: {value}\n' for key, value in self) | return ''.join(f'{key}: {value}\n' for key, value in self) | ||||
@classmethod | @classmethod | ||||
def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO]): | |||||
def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True): | |||||
'''Import a serialised index from a filename or file-like object''' | '''Import a serialised index from a filename or file-like object''' | ||||
if isinstance(f, (str, bytes, os.PathLike)): | if isinstance(f, (str, bytes, os.PathLike)): | ||||
@@ -129,7 +163,8 @@ class Index(list[tuple[str, str]]): | |||||
cm = contextlib.nullcontext(f) | cm = contextlib.nullcontext(f) | ||||
with cm as fp: | with cm as fp: | ||||
o = cls((key, value[:-1]) for key, value in map(functools.partial(str.split, sep = ': '), fp)) | o = cls((key, value[:-1]) for key, value in map(functools.partial(str.split, sep = ': '), fp)) | ||||
o.validate() | |||||
if validate: | |||||
o.validate() | |||||
return o | return o | ||||
@@ -304,9 +339,10 @@ class Module(metaclass = ModuleMeta): | |||||
'''Whether or not this module is for handling `inputUrl`.''' | '''Whether or not this module is for handling `inputUrl`.''' | ||||
return False | return False | ||||
def __init__(self, inputUrl: InputURL, id_: typing.Optional[str] = None): | |||||
def __init__(self, inputUrl: InputURL, storage: typing.Optional[codearchiver.storage.Storage] = None, id_: typing.Optional[str] = None): | |||||
self._inputUrl = inputUrl | self._inputUrl = inputUrl | ||||
self._url = inputUrl.url | self._url = inputUrl.url | ||||
self._storage = storage | |||||
self._id = id_ | self._id = id_ | ||||
self._httpClient = HttpClient() | self._httpClient = HttpClient() | ||||
@@ -13,6 +13,7 @@ _logger = logging.getLogger(__name__) | |||||
class GitIndex(codearchiver.core.Index): | class GitIndex(codearchiver.core.Index): | ||||
fields = [ | fields = [ | ||||
codearchiver.core.IndexField(key = 'Based on bundle', required = False, repeatable = True), | |||||
codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True), | codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True), | ||||
codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True), | codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True), | ||||
codearchiver.core.IndexField(key = 'Commit', required = True, repeatable = True), | codearchiver.core.IndexField(key = 'Commit', required = True, repeatable = True), | ||||
@@ -57,21 +58,43 @@ class Git(codearchiver.core.Module): | |||||
else: | else: | ||||
_logger.error(f'Failed to fetch {commit}') | _logger.error(f'Failed to fetch {commit}') | ||||
_logger.info(f'Bundling into {bundle}') | |||||
codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--all'], cwd = directory) | |||||
_logger.info(f'Collecting repository metadata for index') | _logger.info(f'Collecting repository metadata for index') | ||||
_, refs = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) | _, refs = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) | ||||
_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--format=format:%H% P'], cwd = directory) | |||||
_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory) | |||||
commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines())) | |||||
rootCommits = [c[0] for c in commits if len(c) == 1] | |||||
# Check whether there are relevant prior bundles to create an incremental one | |||||
# Collect their commits shared with this clone (else `git bundle` complains about 'bad object') | |||||
commitSet = set(c[0] for c in commits) # For fast lookup | |||||
oldCommits = {} # dict to keep the order reasonable | |||||
basedOnBundles = {} # ditto | |||||
if self._storage: | |||||
for oldBundle in self._storage.search_indices([('Root commit', c) for c in rootCommits]): | |||||
if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach? | |||||
continue | |||||
_logger.info(f'Previous bundle: {oldBundle!r}') | |||||
with self._storage.open_index(oldBundle) as fp: | |||||
idx = GitIndex.deserialise(fp) | |||||
for key, value in idx: | |||||
if key == 'Commit' and value in commitSet: | |||||
oldCommits[value] = True | |||||
basedOnBundles[oldBundle] = True | |||||
_logger.info(f'Bundling into {bundle}') | |||||
codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{commit}\n' for commit in oldCommits).encode('ascii')) | |||||
_logger.info(f'Removing clone') | _logger.info(f'Removing clone') | ||||
shutil.rmtree(directory) | shutil.rmtree(directory) | ||||
index = GitIndex() | index = GitIndex() | ||||
for oldBundle in basedOnBundles: | |||||
index.append('Based on bundle', oldBundle) | |||||
for line in refs.splitlines(): | for line in refs.splitlines(): | ||||
index.append('Ref', line) | index.append('Ref', line) | ||||
for commitHash, *parents in map(functools.partial(str.split, sep = ' '), commits.splitlines()): | |||||
index.append('Commit', commitHash) | |||||
for commitHash, *parents in commits: | |||||
if commitHash not in oldCommits: | |||||
index.append('Commit', commitHash) | |||||
if not parents: | if not parents: | ||||
index.append('Root commit', commitHash) | index.append('Root commit', commitHash) | ||||
@@ -1,6 +1,8 @@ | |||||
import abc | import abc | ||||
import codearchiver.core | import codearchiver.core | ||||
import collections.abc | |||||
import contextlib | import contextlib | ||||
import glob | |||||
import logging | import logging | ||||
import os.path | import os.path | ||||
import shutil | import shutil | ||||
@@ -12,20 +14,33 @@ _logger = logging.getLogger(__name__) | |||||
class Storage(abc.ABC): | class Storage(abc.ABC): | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def put(self, filename: str, index: typing.Optional[codearchiver.core.Index] = None): | |||||
def put(self, filename: str, index: typing.Optional['codearchiver.core.Index'] = None): | |||||
'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.''' | '''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.''' | ||||
def put_result(self, result: codearchiver.core.Result): | |||||
def put_result(self, result: 'codearchiver.core.Result'): | |||||
'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.''' | '''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.''' | ||||
for fn, index in result.files: | for fn, index in result.files: | ||||
self.put(fn, index) | self.put(fn, index) | ||||
for _, subresult in result.submoduleResults: | for _, subresult in result.submoduleResults: | ||||
self.put_result(subresult) | self.put_result(subresult) | ||||
@abc.abstractmethod
def search_indices(self, criteria: list[tuple[str, typing.Union[str, tuple[str, ...]]]]) -> collections.abc.Iterator[str]:
    '''
    Search all indices in storage by criteria.

    Each criterion is a `(key, values)` pair, where `values` is either a single string or a tuple of acceptable strings. A criterion matches if at least one of the specified values is present in a file's index.
    Yields the filename of every stored file whose index matches all criteria.
    '''
@abc.abstractmethod
@contextlib.contextmanager
def open_index(self, filename: str) -> typing.Iterator[typing.TextIO]:
    '''
    Open the index for a file in serialised form.

    `filename` is the name of the stored file, not of its index. This is a context manager; the `with` target is a text-mode file-like object.
    '''
@abc.abstractmethod | @abc.abstractmethod | ||||
@contextlib.contextmanager | @contextlib.contextmanager | ||||
def open(self, filename: str) -> typing.Iterator[typing.BinaryIO]: | |||||
'''Open a file from storage.''' | |||||
def open(self, filename: str, mode: typing.Optional[str] = 'rb') -> typing.Iterator[typing.Union[typing.BinaryIO, typing.TextIO]]: | |||||
'''Open a file from storage. The mode must be r or rb.''' | |||||
class DirectoryStorage(Storage): | class DirectoryStorage(Storage): | ||||
@@ -58,7 +73,23 @@ class DirectoryStorage(Storage): | |||||
with open(indexFilename, 'x') as fp: | with open(indexFilename, 'x') as fp: | ||||
fp.write(index.serialise()) | fp.write(index.serialise()) | ||||
def search_indices(self, criteria):
    '''Yield the name of every stored file whose index satisfies all `criteria`.'''
    _logger.info(f'Searching indices by criteria: {criteria!r}')
    candidates = glob.glob('*.codearchiver-index', root_dir = self._directory)
    for indexFilename in candidates:
        _logger.info(f'Searching index {indexFilename}')
        # Indices are read without validation since they may belong to any module's Index subclass
        with self.open(indexFilename, 'r') as fp:
            idx = codearchiver.core.Index.deserialise(fp, validate = False)
        if not idx.matches(criteria):
            continue
        _logger.info(f'Found index match {indexFilename}')
        # Strip the index suffix to get back the name of the file the index describes
        storedFilename, _, _ = indexFilename.rpartition('.')
        yield storedFilename
    _logger.info('Done searching indices')
@contextlib.contextmanager
def open_index(self, filename):
    '''Open the serialised index belonging to the stored file `filename` in text mode.'''
    indexFilename = f'{filename}.codearchiver-index'
    with self.open(indexFilename, 'r') as indexFp:
        yield indexFp
@contextlib.contextmanager | @contextlib.contextmanager | ||||
def open(self, filename): | |||||
with open(filename, 'rb') as fp: | |||||
def open(self, filename, mode = 'rb'): | |||||
with open(os.path.join(self._directory, filename), mode) as fp: | |||||
yield fp | yield fp |