Browse Source

Fix tag objects not getting deduplicated

tags/v1.0
JustAnotherArchivist 1 year ago
parent
commit
cc7bdbb3f4
1 changed file with 14 additions and 8 deletions
  1. +14
    -8
      codearchiver/modules/git.py

+ 14
- 8
codearchiver/modules/git.py View File

@@ -3,6 +3,7 @@ import codearchiver.subprocess
import datetime
import functools
import hashlib
import itertools
import logging
import os.path
import shutil
@@ -69,14 +70,15 @@ class Git(codearchiver.core.Module):

_logger.info('Collecting repository metadata')
_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
refs = list(map(str.strip, refs.splitlines()))
_, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory)
commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines()))
rootCommits = [c[0] for c in commits if len(c) == 1]

# Check whether there are relevant prior bundles to create an incremental one
# Collect their commits shared with this clone (else `git bundle` complains about 'bad object')
commitSet = set(c[0] for c in commits) # For fast lookup
oldCommits = {} # dict to keep the order reasonable
# Collect their commits and ref IDs shared with this clone (else `git bundle` complains about 'bad object')
objectsSet = set(itertools.chain((c[0] for c in commits), (r.split(' ', 1)[0] for r in refs))) # For fast lookup
knownObjects = {} # dict to keep the order reasonable
basedOnBundles = {} # ditto
if self._storage:
for oldBundle in self._storage.search_metadata([('Module', type(self).name)] + [('Root commit', c) for c in rootCommits]):
@@ -84,12 +86,16 @@ class Git(codearchiver.core.Module):
with self._storage.open_metadata(oldBundle) as fp:
idx = GitMetadata.deserialise(fp)
for key, value in idx:
if key == 'Commit' and value in commitSet:
oldCommits[value] = True
_logger.debug(f'Key/value in previous bundle: {key} → {value!r}')
if key == 'Ref':
value = value.split(' ', 1)[0]
if key in ('Ref', 'Commit') and value in objectsSet and value not in knownObjects:
_logger.debug(f'Filtering out {value}')
knownObjects[value] = True
basedOnBundles[oldBundle] = True

_logger.info(f'Bundling into {bundle}')
status , _, stderr = codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{commit}\n' for commit in oldCommits).encode('ascii'), check = False)
status , _, stderr = codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{o}\n' for o in knownObjects).encode('ascii'), check = False)
if status == 128 and stderr == 'fatal: Refusing to create empty bundle.\n':
# Manually write an empty bundle instead
# Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
@@ -112,10 +118,10 @@ class Git(codearchiver.core.Module):
metadata.append('Git version', gitVersion)
for oldBundle in basedOnBundles:
metadata.append('Based on bundle', oldBundle)
for line in refs.splitlines():
for line in refs:
metadata.append('Ref', line)
for commitHash, *parents in commits:
if commitHash not in oldCommits:
if commitHash not in knownObjects:
metadata.append('Commit', commitHash)
if not parents:
metadata.append('Root commit', commitHash)


Loading…
Cancel
Save