Browse Source

Support incremental Git bundles

Also fix a small discrepancy between the commit list and bundle due to --reflog vs --all
tags/v1.0
JustAnotherArchivist 1 year ago
parent
commit
8e83c9b7b4
4 changed files with 106 additions and 16 deletions
  1. +1
    -1
      codearchiver/cli.py
  2. +39
    -3
      codearchiver/core.py
  3. +29
    -6
      codearchiver/modules/git.py
  4. +37
    -6
      codearchiver/storage.py

+ 1
- 1
codearchiver/cli.py View File

@@ -190,8 +190,8 @@ def main():
import codearchiver.storage
with _dump_locals_on_exception():
inputUrl = codearchiver.core.InputURL(args.url)
module = codearchiver.core.get_module_instance(inputUrl)
storage = codearchiver.storage.DirectoryStorage(os.getcwd())
module = codearchiver.core.get_module_instance(inputUrl, storage = storage)
with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.', dir = os.getcwd()) as td:
_logger.debug(f'Running in {td}')
os.chdir(td)


+ 39
- 3
codearchiver/core.py View File

@@ -1,5 +1,6 @@
import abc
#import codearchiver.modules # In get_module_class
import codearchiver.storage
import codearchiver.version
import collections
import contextlib
@@ -113,6 +114,39 @@ class Index(list[tuple[str, str]]):
if repeatedUnrepeatableKeys:
raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')

def matches(self, criteria: list[tuple[str, typing.Union[str, tuple[str, ...]]]]) -> bool:
    '''
    Check whether the criteria match this index

    Each criterion is a `(key, value)` pair where `value` is either a single string or a tuple of one or more acceptable strings.
    A criterion matches if the index contains an entry with that key whose value equals the string or is contained in the tuple.
    A value of any other type never matches.
    Multiple criteria may use the same key to perform an AND search.
    The index is a match if all criteria match; an empty list of criteria therefore always matches.
    The `criteria` argument itself is not modified.
    '''

    # Work on a private copy: satisfied criteria are removed from it while scanning the index.
    remaining = list(criteria)
    _logger.debug(f'Searching index for {remaining!r}')
    keysOfInterest = {key for key, _ in remaining}
    for key, value in self:
        if key not in keysOfInterest:
            continue
        _logger.debug(f'Potentially interesting entry: {key!r} = {value!r}')
        matched = []  # Positions in `remaining` that this entry satisfies
        for i, (keyCriterion, valueCriterion) in enumerate(remaining):
            if keyCriterion != key:
                continue
            if isinstance(valueCriterion, str):
                if valueCriterion == value:
                    _logger.debug('Str match')
                    matched.append(i)
            elif isinstance(valueCriterion, tuple) and value in valueCriterion:
                _logger.debug('Tuple match')
                matched.append(i)
        # Delete from the back so that the earlier positions stay valid.
        for i in reversed(matched):
            _logger.debug(f'Matched remaining criterion {i}: {remaining[i]}')
            del remaining[i]
        if not remaining:
            # Everything is satisfied; no need to look at the rest of the index.
            break
    _logger.debug(f'Remaining unmatched criteria: {remaining!r}')
    return not remaining

def serialise(self) -> str:
'''Convert the index to a string suitable for e.g. a simple text file storage'''

@@ -120,7 +154,7 @@ class Index(list[tuple[str, str]]):
return ''.join(f'{key}: {value}\n' for key, value in self)

@classmethod
def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO]):
def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO], *, validate = True):
'''Import a serialised index from a filename or file-like object'''

if isinstance(f, (str, bytes, os.PathLike)):
@@ -129,7 +163,8 @@ class Index(list[tuple[str, str]]):
cm = contextlib.nullcontext(f)
with cm as fp:
o = cls((key, value[:-1]) for key, value in map(functools.partial(str.split, sep = ': '), fp))
o.validate()
if validate:
o.validate()
return o


@@ -304,9 +339,10 @@ class Module(metaclass = ModuleMeta):
'''Whether or not this module is for handling `inputUrl`.'''
return False

def __init__(self, inputUrl: InputURL, id_: typing.Optional[str] = None):
def __init__(self, inputUrl: InputURL, storage: typing.Optional[codearchiver.storage.Storage] = None, id_: typing.Optional[str] = None):
self._inputUrl = inputUrl
self._url = inputUrl.url
self._storage = storage
self._id = id_
self._httpClient = HttpClient()



+ 29
- 6
codearchiver/modules/git.py View File

@@ -13,6 +13,7 @@ _logger = logging.getLogger(__name__)

class GitIndex(codearchiver.core.Index):
fields = [
codearchiver.core.IndexField(key = 'Based on bundle', required = False, repeatable = True),
codearchiver.core.IndexField(key = 'Ref', required = True, repeatable = True),
codearchiver.core.IndexField(key = 'Root commit', required = True, repeatable = True),
codearchiver.core.IndexField(key = 'Commit', required = True, repeatable = True),
@@ -57,21 +58,43 @@ class Git(codearchiver.core.Module):
else:
_logger.error(f'Failed to fetch {commit}')

_logger.info(f'Bundling into {bundle}')
codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--all'], cwd = directory)

_logger.info(f'Collecting repository metadata for index')
_, refs = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--format=format:%H% P'], cwd = directory)
_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory)
commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines()))
rootCommits = [c[0] for c in commits if len(c) == 1]

# Check whether there are relevant prior bundles to create an incremental one
# Collect their commits shared with this clone (else `git bundle` complains about 'bad object')
commitSet = set(c[0] for c in commits) # For fast lookup
oldCommits = {} # dict to keep the order reasonable
basedOnBundles = {} # ditto
if self._storage:
for oldBundle in self._storage.search_indices([('Root commit', c) for c in rootCommits]):
if not oldBundle.startswith('git_'): #TODO Is there a more generic and elegant approach?
continue
_logger.info(f'Previous bundle: {oldBundle!r}')
with self._storage.open_index(oldBundle) as fp:
idx = GitIndex.deserialise(fp)
for key, value in idx:
if key == 'Commit' and value in commitSet:
oldCommits[value] = True
basedOnBundles[oldBundle] = True

_logger.info(f'Bundling into {bundle}')
codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{commit}\n' for commit in oldCommits).encode('ascii'))

_logger.info(f'Removing clone')
shutil.rmtree(directory)

index = GitIndex()
for oldBundle in basedOnBundles:
index.append('Based on bundle', oldBundle)
for line in refs.splitlines():
index.append('Ref', line)
for commitHash, *parents in map(functools.partial(str.split, sep = ' '), commits.splitlines()):
index.append('Commit', commitHash)
for commitHash, *parents in commits:
if commitHash not in oldCommits:
index.append('Commit', commitHash)
if not parents:
index.append('Root commit', commitHash)



+ 37
- 6
codearchiver/storage.py View File

@@ -1,6 +1,8 @@
import abc
import codearchiver.core
import collections.abc
import contextlib
import glob
import logging
import os.path
import shutil
@@ -12,20 +14,33 @@ _logger = logging.getLogger(__name__)

class Storage(abc.ABC):
@abc.abstractmethod
def put(self, filename: str, index: typing.Optional[codearchiver.core.Index] = None):
def put(self, filename: str, index: typing.Optional['codearchiver.core.Index'] = None):
'''Put a local file and (if provided) its index into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''

def put_result(self, result: codearchiver.core.Result):
def put_result(self, result: 'codearchiver.core.Result'):
'''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.'''
for fn, index in result.files:
self.put(fn, index)
for _, subresult in result.submoduleResults:
self.put_result(subresult)

@abc.abstractmethod
def search_indices(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]:
'''
Search all indices in storage by criteria.
Each criterion consists of a key and one or more possible values. A criterion matches if at least one of the specified values is present in a file's index.
Yields all filenames where all criteria match.
'''

@abc.abstractmethod
@contextlib.contextmanager
def open_index(self, filename: str) -> typing.TextIO:
'''Open the index for a file in serialised form.'''

@abc.abstractmethod
@contextlib.contextmanager
def open(self, filename: str) -> typing.Iterator[typing.BinaryIO]:
'''Open a file from storage.'''
def open(self, filename: str, mode: typing.Optional[str] = 'rb') -> typing.Iterator[typing.Union[typing.BinaryIO, typing.TextIO]]:
'''Open a file from storage. The mode must be r or rb.'''


class DirectoryStorage(Storage):
@@ -58,7 +73,23 @@ class DirectoryStorage(Storage):
with open(indexFilename, 'x') as fp:
fp.write(index.serialise())

def search_indices(self, criteria):
    '''
    Search all indices in the storage directory by criteria.

    Every `*.codearchiver-index` file in the directory is deserialised (without validation) and checked against `criteria`.
    Yields, in sorted index filename order, the name of each match with the index suffix stripped.
    '''

    _logger.info(f'Searching indices by criteria: {criteria!r}')
    # glob returns its matches in arbitrary order; sort them so the results are reproducible between runs.
    for indexFilename in sorted(glob.glob('*.codearchiver-index', root_dir = self._directory)):
        _logger.info(f'Searching index {indexFilename}')
        # Read the index completely and close the file before yielding, so no handle stays open while the consumer runs.
        with self.open(indexFilename, 'r') as fp:
            idx = codearchiver.core.Index.deserialise(fp, validate = False)
        if idx.matches(criteria):
            _logger.info(f'Found index match {indexFilename}')
            # Strip the trailing '.codearchiver-index' to get the name of the stored file.
            yield indexFilename.rsplit('.', 1)[0]
    _logger.info('Done searching indices')

@contextlib.contextmanager
def open_index(self, filename):
    '''Context manager yielding the serialised index of `filename`, opened via `self.open` in mode `'r'`.'''

    # The index is stored next to the file under the same name plus a fixed suffix.
    indexFilename = f'{filename}.codearchiver-index'
    with self.open(indexFilename, 'r') as indexFp:
        yield indexFp

@contextlib.contextmanager
def open(self, filename):
with open(filename, 'rb') as fp:
def open(self, filename, mode = 'rb'):
with open(os.path.join(self._directory, filename), mode) as fp:
yield fp

Loading…
Cancel
Save