From cc7bdbb3f496d75284f777c4a5bc44dd48f80038 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 14 Mar 2023 23:37:20 +0000 Subject: [PATCH] Fix tag objects not getting deduplicated --- codearchiver/modules/git.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/codearchiver/modules/git.py b/codearchiver/modules/git.py index b6bfc50..3d0830c 100644 --- a/codearchiver/modules/git.py +++ b/codearchiver/modules/git.py @@ -3,6 +3,7 @@ import codearchiver.subprocess import datetime import functools import hashlib +import itertools import logging import os.path import shutil @@ -69,14 +70,15 @@ class Git(codearchiver.core.Module): _logger.info('Collecting repository metadata') _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) + refs = list(map(str.strip, refs.splitlines())) _, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory) commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines())) rootCommits = [c[0] for c in commits if len(c) == 1] # Check whether there are relevant prior bundles to create an incremental one - # Collect their commits shared with this clone (else `git bundle` complains about 'bad object') - commitSet = set(c[0] for c in commits) # For fast lookup - oldCommits = {} # dict to keep the order reasonable + # Collect their commits and ref IDs shared with this clone (else `git bundle` complains about 'bad object') + objectsSet = set(itertools.chain((c[0] for c in commits), (r.split(' ', 1)[0] for r in refs))) # For fast lookup + knownObjects = {} # dict to keep the order reasonable basedOnBundles = {} # ditto if self._storage: for oldBundle in self._storage.search_metadata([('Module', type(self).name)] + [('Root commit', c) for c in rootCommits]): @@ -84,12 +86,16 @@ class Git(codearchiver.core.Module): with self._storage.open_metadata(oldBundle) as fp: idx = GitMetadata.deserialise(fp) for key, value in idx: - if key == 'Commit' and value in commitSet: - oldCommits[value] = True + _logger.debug(f'Key/value in previous bundle: {key} → {value!r}') + if key == 'Ref': + value = value.split(' ', 1)[0] + if key in ('Ref', 'Commit') and value in objectsSet and value not in knownObjects: + _logger.debug(f'Filtering out {value}') + knownObjects[value] = True basedOnBundles[oldBundle] = True _logger.info(f'Bundling into {bundle}') - status , _, stderr = codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{commit}\n' for commit in oldCommits).encode('ascii'), check = False) + status , _, stderr = codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{o}\n' for o in knownObjects).encode('ascii'), check = False) if status == 128 and stderr == 'fatal: Refusing to create empty bundle.\n': # Manually write an empty bundle instead # Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats @@ -112,10 +118,10 @@ class Git(codearchiver.core.Module): metadata.append('Git version', gitVersion) for oldBundle in basedOnBundles: metadata.append('Based on bundle', oldBundle) - for line in refs.splitlines(): + for line in refs: metadata.append('Ref', line) for commitHash, *parents in commits: - if commitHash not in oldCommits: + if commitHash not in knownObjects: metadata.append('Commit', commitHash) if not parents: metadata.append('Root commit', commitHash)