diff --git a/codearchiver/modules/git.py b/codearchiver/modules/git.py index 3d0830c..bb74df1 100644 --- a/codearchiver/modules/git.py +++ b/codearchiver/modules/git.py @@ -19,7 +19,7 @@ class GitMetadata(codearchiver.core.Metadata): codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True), codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True), codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True), - codearchiver.core.MetadataField(key = 'Commit', required = False, repeatable = True), + codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True), ) version = 0 @@ -71,36 +71,53 @@ class Git(codearchiver.core.Module): _logger.info('Collecting repository metadata') _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) refs = list(map(str.strip, refs.splitlines())) - _, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory) - commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines())) - rootCommits = [c[0] for c in commits if len(c) == 1] + _, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory) + rootCommits = list(filter(None, rootCommits.splitlines())) + _, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory) + objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())} # Check whether there are relevant prior bundles to create an incremental one - # Collect their commits and ref IDs shared with this clone (else `git bundle` complains about 'bad object') - objectsSet = set(itertools.chain((c[0] for c in commits), (r.split(' ', 1)[0] for r in refs))) # For fast lookup - knownObjects = {} # dict to 
keep the order reasonable - basedOnBundles = {} # ditto + commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')} + basedOnBundles = {} # dict to keep the order + baseBundleObjects = set() if self._storage: - for oldBundle in self._storage.search_metadata([('Module', type(self).name)] + [('Root commit', c) for c in rootCommits]): + _logger.info('Checking for previous bundles') + + # A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S. + # In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn) as the universe and Bi ∩ I as the subsets. + # Fortunately, solving the actual set cover problem is not necessary. + # This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.) + # Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency. 
+ + for oldBundle in self._storage.search_metadata([('Module', type(self).name), ('Root commit', tuple(rootCommits))]): _logger.info(f'Previous bundle: {oldBundle!r}') with self._storage.open_metadata(oldBundle) as fp: idx = GitMetadata.deserialise(fp) + isMatch = False + oldObjects = set() # commit and tag IDs in this bundle for key, value in idx: - _logger.debug(f'Key/value in previous bundle: {key} → {value!r}') - if key == 'Ref': - value = value.split(' ', 1)[0] - if key in ('Ref', 'Commit') and value in objectsSet and value not in knownObjects: - _logger.debug(f'Filtering out {value}') - knownObjects[value] = True - basedOnBundles[oldBundle] = True + if key != 'Object': + continue + oid, otype = value.split(' ', 1) + oldObjects.add(oid) + if otype not in ('commit', 'tag'): + continue + if not isMatch and oid in commitsAndTags: + isMatch = True + if isMatch: + basedOnBundles[oldBundle] = True + baseBundleObjects |= oldObjects _logger.info(f'Bundling into {bundle}') - status , _, stderr = codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{o}\n' for o in knownObjects).encode('ascii'), check = False) + cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'] + objectsToExclude = baseBundleObjects & commitsAndTags + input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii') + status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False) if status == 128 and stderr == 'fatal: Refusing to create empty bundle.\n': # Manually write an empty bundle instead # Cf. 
Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats _logger.info('Writing empty bundle directly instead') - with open(bundle, 'wb') as fp: + with open(bundle, 'xb') as fp: fp.write(b'# v2 git bundle\n') # bundle signature fp.write(b'\n') # bundle end of prerequisites and refs packdata = b'PACK' # pack signature @@ -111,6 +128,32 @@ class Git(codearchiver.core.Module): elif status != 0: raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.') + _logger.info('Indexing bundle') + # Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway. + # So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it. + with open(bundle, 'rb') as fpin: + # Skip over header + for line in fpin: + if line == b'\n': + break + # Copy remainder (= packfile) to tmp.pack + with open('tmp.pack', 'xb') as fpout: + shutil.copyfileobj(fpin, fpout) + codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack']) + with open('tmp.idx', 'rb') as fp: + _, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp) + indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()} + try: + indexObjects = {oid: objects[oid] for oid in indexObjectIds} + except KeyError as e: + # This should never happen since the bundle is created from the clone with exclusions... + raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e + if objects.keys() - (baseBundleObjects | indexObjectIds) != set(): + # If there is at least one object in the clone that is not in the base bundles or the bundle index... 
+ raise RuntimeError('Object mismatch between clone and bundles') + os.remove('tmp.pack') + os.remove('tmp.idx') + _logger.info(f'Removing clone') shutil.rmtree(directory) @@ -120,11 +163,10 @@ class Git(codearchiver.core.Module): metadata.append('Based on bundle', oldBundle) for line in refs: metadata.append('Ref', line) - for commitHash, *parents in commits: - if commitHash not in knownObjects: - metadata.append('Commit', commitHash) - if not parents: - metadata.append('Root commit', commitHash) + for commitId in rootCommits: + metadata.append('Root commit', commitId) + for oid, otype in indexObjects.items(): + metadata.append('Object', f'{oid} {otype}') return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])