Workaround for incremental bundles with deltified objects

tags/v1.1
JustAnotherArchivist 1 year ago
commit 66666a1538
1 changed file with 69 additions and 7 deletions
codearchiver/modules/git.py

@@ -14,6 +14,42 @@ import tempfile
 _logger = logging.getLogger(__name__)
 
 
+class _HashingFileReader:
+	'''A tiny wrapper around a file-like object which calculates the hash of the file as it is being read.'''
+
+	def __init__(self, fp, hasher = hashlib.sha1, skipStart = 0, skipEnd = 0):
+		self._fp = fp
+		self._hasher = hasher()
+		self._skipStart = skipStart
+		self._skipEnd = skipEnd
+		self._buf = b''
+
+	def read(self, n):
+		data = self._fp.read(n)
+		if self._skipStart > 0:
+			# Requires that the first block is bigger than skipStart+skipEnd because it otherwise gets very complicated
+			# Yes, this fails if the file is smaller than that, but given what it's used for, that's not an issue.
+			if len(data) < self._skipStart + self._skipEnd:
+				raise ValueError(f'first read is required to return at least {self._skipStart + self._skipEnd} bytes')
+			start = self._skipStart
+			self._skipStart = 0
+		else:
+			start = 0
+		bufPlusData = self._buf + data
+		if self._skipEnd > 0:
+			self._buf = bufPlusData[-self._skipEnd:]
+			end = -self._skipEnd
+		else:
+			end = None
+		self._hasher.update(bufPlusData[start:end])
+		return data
+
+	def digest(self):
+		if self._skipStart > 0 or len(self._buf) != self._skipEnd:
+			raise ValueError('data skipping failed')
+		return self._hasher.digest()
+
+
 class GitMetadata(codearchiver.core.Metadata):
 	fields = (
 		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
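
A minimal usage sketch of the new reader (illustrative only, not part of the commit), assuming a pack-like stream whose first 12 bytes are a header and whose last 20 bytes are a trailer checksum; the payload and chunk size below are made up:

import hashlib
import io

# Fake pack-like stream: 12-byte header, payload, 20-byte trailer (all invented).
payload = b'PACKvvvvnnnn' + b'object data' + b'C' * 20
reader = _HashingFileReader(io.BytesIO(payload), skipStart = 12, skipEnd = 20)
# The first read() must return at least skipStart + skipEnd bytes (see above).
while reader.read(1048576):
	pass  # a consumer like `git index-pack --stdin` would process these chunks
# digest() covers only the bytes between the skipped header and trailer.
assert reader.digest() == hashlib.sha1(b'object data').digest()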
@@ -170,20 +206,46 @@ class Git(codearchiver.core.Module):
 			raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')
 
 		_logger.info('Indexing bundle')
-		# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
-		# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
+		# The bundle's packfile might contain deltified objects.
+		# Those cannot be run through `index-pack` without using `--fix-thin` and running it in a repo containing the missing objects.
+		# However, `--fix-thin` has the side effect that it appends those missing objects to the output pack as well, so they also show up in the `show-index` output afterwards.
+		# The fact that this always appends is undocumented, so it can't simply be relied on.
+		# So this does the following:
+		#  - Index the pack; as the data is being read, calculate a hash of the pack data without header and trailer checksum
+		#  - Verify that the corresponding bytes from the index-pack output file have the same hash.
+		#  - Read the index and filter out any entries that lie beyond the size of the bundle packfile minus 20 (trailer checksum)
+		# This gets an accurate index of exactly which objects are in this pack; some might depend on data from other bundles, but that's fine.
+		# The extra copy to disk is unfortunately unavoidable anyway, but this hash verification at least makes it somewhat valuable.
+
+		# Index with inline hash calculation
+		bundleSize = os.path.getsize(bundle)
 		with open(bundle, 'rb') as fpin:
 			# Skip over header
 			for line in fpin:
 				if line == b'\n':
 					break
-			# Copy remainder (= packfile) to tmp.pack
-			with open('tmp.pack', 'xb') as fpout:
-				shutil.copyfileobj(fpin, fpout)
-		codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
+			packOffset = fpin.tell()
+			hashWrapper = _HashingFileReader(fpin, skipStart = 12, skipEnd = 20)
+			codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', '--fix-thin', '--stdin', '../tmp.pack'], input = hashWrapper, cwd = directory)
+		bundlePackSize = bundleSize - packOffset - 12 - 20
+		bundlePackHash = hashWrapper.digest()
+		# Verify hash of first part of the index-pack output pack
+		with open('tmp.pack', 'rb') as fp:
+			fp.seek(12) # Header
+			indexPackRead = 0
+			hasher = hashlib.sha1()
+			while indexPackRead < bundlePackSize:
+				data = fp.read(min(bundlePackSize - indexPackRead, 1048576))
+				indexPackRead += len(data)
+				hasher.update(data)
+		indexPackHash = hasher.digest()
+		if bundlePackHash != indexPackHash or indexPackRead != bundlePackSize:
+			raise RuntimeError(f'index pack hash comparison failed: expected {bundlePackHash!r} (size: {bundlePackSize}), got {indexPackHash!r} (size: {indexPackRead})')
+		# Parse index
 		with open('tmp.idx', 'rb') as fp:
 			_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
-		indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
+		indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.rstrip('\n').split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize}
 		try:
 			indexObjects = {oid: objects[oid] for oid in indexObjectIds}
 		except KeyError as e:
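
For reference on the arithmetic above: a bundle file is a text header terminated by a blank line, followed by a packfile; the packfile itself starts with a 12-byte header ('PACK', version, object count) and ends with a 20-byte SHA-1 trailer over everything before it, hence `bundlePackSize = bundleSize - packOffset - 12 - 20`. A hypothetical standalone helper (not in this commit) computing the same extent:

import os

def embedded_pack_extent(bundlePath):
	'''Return (packOffset, bundlePackSize) for a bundle file; illustrative sketch only.'''
	with open(bundlePath, 'rb') as fp:
		# The bundle header ends at the first blank line; the packfile follows.
		for line in fp:
			if line == b'\n':
				break
		packOffset = fp.tell()
	# Exclude the 12-byte pack header and the 20-byte trailer checksum.
	bundlePackSize = os.path.getsize(bundlePath) - packOffset - 12 - 20
	return packOffset, bundlePackSize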

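The filtering in the new `indexObjectIds` comprehension can likewise be tried in isolation: `git show-index` prints one 'offset oid (crc32)' line per object, so objects that `--fix-thin` appended beyond the original pack payload are dropped by offset. The sample offsets and object IDs below are made up:

index = (
	'12 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 (dd9c49f8)\n'
	'150 8ab686eafeb1f44702738c8b0f24f2567c36da6d (2e4eb05c)\n'
)
bundlePackSize = 100  # pretend the second object was appended by --fix-thin
indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize}
assert indexObjectIds == {'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'}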
