Browse Source

Refactor Git bundling to allow for verification of the bundle contents

This verifies that all objects from the current clone are in either the dependency bundles or the current bundle. This guarantees that the repo as it was cloned at the time of retrieval can be reconstructed exactly from the bundles.

As a side-effect, if a non-standard Git server were to include objects in a clone pack that are not discoverable from refs, this will fail any attempt to archive such a clone. This could in the future be resolved by adding custom refs for those extra objects.

This also fixes a bug where prior bundles could be included as a dependency even though they contain no relevant data due to their refs (as refs are always listed in the bundle metadata). Instead, dependency detection now operates directly on commit and tag objects, which can only be present in one bundle.
tags/v1.0
JustAnotherArchivist 1 year ago
parent
commit
9a61800758
1 changed file with 65 additions and 23 deletions
  1. +65
    -23
      codearchiver/modules/git.py

+ 65
- 23
codearchiver/modules/git.py View File

@@ -19,7 +19,7 @@ class GitMetadata(codearchiver.core.Metadata):
codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
codearchiver.core.MetadataField(key = 'Commit', required = False, repeatable = True),
codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True),
)
version = 0

@@ -71,36 +71,53 @@ class Git(codearchiver.core.Module):
_logger.info('Collecting repository metadata')
_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
refs = list(map(str.strip, refs.splitlines()))
_, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory)
commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines()))
rootCommits = [c[0] for c in commits if len(c) == 1]
_, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory)
rootCommits = list(filter(None, rootCommits.splitlines()))
_, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory)
objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}

# Check whether there are relevant prior bundles to create an incremental one
# Collect their commits and ref IDs shared with this clone (else `git bundle` complains about 'bad object')
objectsSet = set(itertools.chain((c[0] for c in commits), (r.split(' ', 1)[0] for r in refs))) # For fast lookup
knownObjects = {} # dict to keep the order reasonable
basedOnBundles = {} # ditto
commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
basedOnBundles = {} # dict to keep the order
baseBundleObjects = set()
if self._storage:
for oldBundle in self._storage.search_metadata([('Module', type(self).name)] + [('Root commit', c) for c in rootCommits]):
_logger.info('Checking for previous bundles')

# A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
# In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn) as the universe and Bi ∩ I as the subsets.
# Fortunately, solving the actual set cover problem is not necessary.
# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.

for oldBundle in self._storage.search_metadata([('Module', type(self).name), ('Root commit', tuple(rootCommits))]):
_logger.info(f'Previous bundle: {oldBundle!r}')
with self._storage.open_metadata(oldBundle) as fp:
idx = GitMetadata.deserialise(fp)
isMatch = False
oldObjects = set() # commit and tag IDs in this bundle
for key, value in idx:
_logger.debug(f'Key/value in previous bundle: {key} → {value!r}')
if key == 'Ref':
value = value.split(' ', 1)[0]
if key in ('Ref', 'Commit') and value in objectsSet and value not in knownObjects:
_logger.debug(f'Filtering out {value}')
knownObjects[value] = True
basedOnBundles[oldBundle] = True
if key != 'Object':
continue
oid, otype = value.split(' ', 1)
oldObjects.add(oid)
if otype not in ('commit', 'tag'):
continue
if not isMatch and oid in commitsAndTags:
isMatch = True
if isMatch:
basedOnBundles[oldBundle] = True
baseBundleObjects |= oldObjects

_logger.info(f'Bundling into {bundle}')
status , _, stderr = codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{o}\n' for o in knownObjects).encode('ascii'), check = False)
cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all']
objectsToExclude = baseBundleObjects & commitsAndTags
input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii')
status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False)
if status == 128 and stderr == 'fatal: Refusing to create empty bundle.\n':
# Manually write an empty bundle instead
# Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
_logger.info('Writing empty bundle directly instead')
with open(bundle, 'wb') as fp:
with open(bundle, 'xb') as fp:
fp.write(b'# v2 git bundle\n') # bundle signature
fp.write(b'\n') # bundle end of prerequisites and refs
packdata = b'PACK' # pack signature
@@ -111,6 +128,32 @@ class Git(codearchiver.core.Module):
elif status != 0:
raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')

_logger.info('Indexing bundle')
# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
with open(bundle, 'rb') as fpin:
# Skip over header
for line in fpin:
if line == b'\n':
break
# Copy remainder (= packfile) to tmp.pack
with open('tmp.pack', 'xb') as fpout:
shutil.copyfileobj(fpin, fpout)
codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
with open('tmp.idx', 'rb') as fp:
_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
try:
indexObjects = {oid: objects[oid] for oid in indexObjectIds}
except KeyError as e:
# This should never happen since the bundle is created from the clone with exclusions...
raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e
if objects.keys() - (baseBundleObjects | indexObjectIds) != set():
# If there is at least one object in the clone that is not in the base bundles or the bundle index...
raise RuntimeError('Object mismatch between clone and bundles')
os.remove('tmp.pack')
os.remove('tmp.idx')

_logger.info(f'Removing clone')
shutil.rmtree(directory)

@@ -120,11 +163,10 @@ class Git(codearchiver.core.Module):
metadata.append('Based on bundle', oldBundle)
for line in refs:
metadata.append('Ref', line)
for commitHash, *parents in commits:
if commitHash not in knownObjects:
metadata.append('Commit', commitHash)
if not parents:
metadata.append('Root commit', commitHash)
for commitId in rootCommits:
metadata.append('Root commit', commitId)
for oid, otype in indexObjects.items():
metadata.append('Object', f'{oid} {otype}')

return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])



Loading…
Cancel
Save