Refactor Git bundling to allow for verification of the bundle contents
This verifies that all objects from the current clone are in either the dependency bundles or the current bundle. This guarantees that the repo as it has been cloned at the time of retrieval can be reconstructed exactly from the bundles.
As a side-effect, if a non-standard Git server were to include objects in a clone pack that are not discoverable from refs, this will fail any attempt to archive such a clone. This could in the future be resolved by adding custom refs for those extra objects.
This also fixes a bug where prior bundles could be included as a dependency even though they contain no relevant data due to their refs (as refs are always listed in the bundle metadata). Instead, dependency detection now operates directly on commit and tag objects, which can only be present in one bundle.
# Parse the object listing (lines of 'oid type size') into an oid → type map.
objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}

# Check whether there are relevant prior bundles to create an incremental one.
# Dependency detection operates directly on commit and tag objects: those can only
# ever be present in one bundle, so any prior bundle sharing at least one such
# object with the current clone must be a dependency.
commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
basedOnBundles = {} # dict to keep the order
baseBundleObjects = set() # all object IDs contained in the dependency bundles
if self._storage:
	_logger.info('Checking for previous bundles')
	# A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
	# In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn) as the universe and Bi ∩ I as the subsets.
	# Fortunately, solving the actual set cover problem is not necessary.
	# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
	# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.
	for oldBundle in self._storage.search_metadata([('Module', type(self).name), ('Root commit', tuple(rootCommits))]):
		_logger.info(f'Previous bundle: {oldBundle!r}')
		with self._storage.open_metadata(oldBundle) as fp:
			idx = GitMetadata.deserialise(fp)
		isMatch = False
		oldObjects = set() # object IDs listed in this bundle's metadata
		for key, value in idx:
			if key != 'Object':
				continue
			oid, otype = value.split(' ', 1)
			oldObjects.add(oid)
			if otype not in ('commit', 'tag'):
				continue
			if not isMatch and oid in commitsAndTags:
				isMatch = True
		if isMatch:
			basedOnBundles[oldBundle] = True
			baseBundleObjects |= oldObjects

_logger.info(f'Bundling into {bundle}')
# Exclude the dependency bundles' objects, but only those also present in this clone:
# passing IDs that git does not know about makes `git bundle` complain about 'bad object'.
status, _, stderr = codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{o}\n' for o in baseBundleObjects if o in objects).encode('ascii'), check = False)