From 66666a153834d1cf1d469d76f2d40cf473a43000 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Tue, 28 Mar 2023 04:36:46 +0000
Subject: [PATCH] Workaround for incremental bundles with deltified objects

---
 codearchiver/modules/git.py | 76 +++++++++++++++++++++++++++++++++----
 1 file changed, 69 insertions(+), 7 deletions(-)

diff --git a/codearchiver/modules/git.py b/codearchiver/modules/git.py
index 70dd4da..a64d836 100644
--- a/codearchiver/modules/git.py
+++ b/codearchiver/modules/git.py
@@ -14,6 +14,42 @@ import tempfile
 
 _logger = logging.getLogger(__name__)
 
+class _HashingFileReader:
+	'''A tiny wrapper around a file-like object which calculates the hash of the file as it is being read.'''
+
+	def __init__(self, fp, hasher = hashlib.sha1, skipStart = 0, skipEnd = 0):
+		self._fp = fp
+		self._hasher = hasher()
+		self._skipStart = skipStart
+		self._skipEnd = skipEnd
+		self._buf = b''
+
+	def read(self, n):
+		data = self._fp.read(n)
+		if self._skipStart > 0:
+			# Requires that the first block is bigger than skipStart+skipEnd because it otherwise gets very complicated
+			# Yes, this fails if the file is smaller than that, but given what it's used for, that's not an issue.
+			if len(data) < self._skipStart + self._skipEnd:
+				raise ValueError(f'first read is required to return at least {self._skipStart + self._skipEnd} bytes')
+			start = self._skipStart
+			self._skipStart = 0
+		else:
+			start = 0
+		bufPlusData = self._buf + data
+		if self._skipEnd > 0:
+			self._buf = bufPlusData[-self._skipEnd:]
+			end = -self._skipEnd
+		else:
+			end = None
+		self._hasher.update(bufPlusData[start:end])
+		return data
+
+	def digest(self):
+		if self._skipStart > 0 or len(self._buf) != self._skipEnd:
+			raise ValueError('data skipping failed')
+		return self._hasher.digest()
+
+
 class GitMetadata(codearchiver.core.Metadata):
 	fields = (
 		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
@@ -170,20 +206,46 @@ class Git(codearchiver.core.Module):
 				raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')
 
 		_logger.info('Indexing bundle')
-		# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
-		# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
+
+		# The bundle's packfile might contain deltified objects.
+		# Those cannot be run through `index-pack` without using `--fix-thin` and running it in a repo containing the missing objects.
+		# However, `--fix-thin` has the side effect that it appends those missing objects to the output pack as well, so they also show up in the `show-index` output afterwards.
+		# The fact that this always appends is undocumented, so it can't simply be relied on.
+		# So this does the following:
+		# - Index the pack; as the data is being read, calculate a hash of the pack data without header and trailer checksum
+		# - Verify that the corresponding bytes from the index-pack output file have the same hash.
+		# - Read the index and filter out any entries that lie beyond the size of the bundle packfile minus 20 (trailer checksum)
+		# This gets an accurate index of exactly which objects are in this pack; some might depend on data from other bundles, but that's fine.
+		# The extra copy to disk is unfortunately unavoidable anyway, but this hash verification at least makes it somewhat valuable.
+
+		# Index with inline hash calculation
+		bundleSize = os.path.getsize(bundle)
 		with open(bundle, 'rb') as fpin:
 			# Skip over header
 			for line in fpin:
 				if line == b'\n':
 					break
-			# Copy remainder (= packfile) to tmp.pack
-			with open('tmp.pack', 'xb') as fpout:
-				shutil.copyfileobj(fpin, fpout)
-		codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
+			packOffset = fpin.tell()
+			hashWrapper = _HashingFileReader(fpin, skipStart = 12, skipEnd = 20)
+			codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', '--fix-thin', '--stdin', '../tmp.pack'], input = hashWrapper, cwd = directory)
+		bundlePackSize = bundleSize - packOffset - 12 - 20
+		bundlePackHash = hashWrapper.digest()
+		# Verify hash of first part of the index-pack output pack
+		with open('tmp.pack', 'rb') as fp:
+			fp.seek(12)  # Header
+			indexPackRead = 0
+			hasher = hashlib.sha1()
+			while indexPackRead < bundlePackSize:
+				data = fp.read(min(bundlePackSize - indexPackRead, 1048576))
+				indexPackRead += len(data)
+				hasher.update(data)
+			indexPackHash = hasher.digest()
+		if bundlePackHash != indexPackHash or indexPackRead != bundlePackSize:
+			raise RuntimeError(f'index pack hash comparison failed: expected {bundlePackHash!r} (size: {bundlePackSize}), got {indexPackHash!r} (size: {indexPackRead})')
+		# Parse index
 		with open('tmp.idx', 'rb') as fp:
 			_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
-		indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
+		indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.rstrip('\n').split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize}
 		try:
 			indexObjects = {oid: objects[oid] for oid in indexObjectIds}
 		except KeyError as e:
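
For illustration, a minimal usage sketch of the hashing wrapper (not part of the patch; it assumes the _HashingFileReader class from the diff above is in scope, and the sample data is made up). It feeds a fake packfile through the wrapper in chunks, the way `git index-pack --stdin` would read it, and checks that the digest equals a direct SHA-1 of the pack data with the 12-byte header and 20-byte trailer checksum stripped:

	import hashlib
	import io

	payload = b'example pack object data' * 100
	# Fake packfile layout: 12-byte header, payload, 20-byte trailer checksum.
	fakePack = b'PACK' + bytes(8) + payload + bytes(20)

	reader = _HashingFileReader(io.BytesIO(fakePack), skipStart = 12, skipEnd = 20)
	# The first read must return at least skipStart + skipEnd = 32 bytes (see read()).
	while reader.read(1024):
		pass

	assert reader.digest() == hashlib.sha1(payload).digest()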
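And a sketch of the index filtering step (again not part of the patch; the offsets, object IDs, and CRC values below are invented, though real `git show-index` output has the same 'offset sha1 (crc32)' shape). Offsets are absolute positions in the packfile, so entries at or beyond the original bundle's pack data are the objects `--fix-thin` appended and get dropped, mirroring the set comprehension in the diff:

	bundlePackSize = 300  # pack data size without the 12-byte header and 20-byte trailer

	index = (
		'12 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa (089a364d)\n'
		'150 bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb (1d8cd98f)\n'
		'350 cccccccccccccccccccccccccccccccccccccccc (d202ef8d)\n'
	)

	indexObjectIds = {
		oid
		for offset, oid, _ in (l.split(' ', 2) for l in index.splitlines())
		if int(offset) < bundlePackSize
	}
	# The entry at offset 350 lies past the original pack data and is excluded.
	assert indexObjectIds == {40 * 'a', 40 * 'b'}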