Workaround for incremental bundles with deltified objects

tags/v1.1
JustAnotherArchivist 1 year ago
commit 66666a1538
1 changed file with 69 additions and 7 deletions
codearchiver/modules/git.py

@@ -14,6 +14,42 @@ import tempfile
 _logger = logging.getLogger(__name__)
 
 
+class _HashingFileReader:
+	'''A tiny wrapper around a file-like object which calculates the hash of the file as it is being read.'''
+
+	def __init__(self, fp, hasher = hashlib.sha1, skipStart = 0, skipEnd = 0):
+		self._fp = fp
+		self._hasher = hasher()
+		self._skipStart = skipStart
+		self._skipEnd = skipEnd
+		self._buf = b''
+
+	def read(self, n):
+		data = self._fp.read(n)
+		if self._skipStart > 0:
+			# Requires that the first block is bigger than skipStart+skipEnd because it otherwise gets very complicated
+			# Yes, this fails if the file is smaller than that, but given what it's used for, that's not an issue.
+			if len(data) < self._skipStart + self._skipEnd:
+				raise ValueError(f'first read is required to return at least {self._skipStart + self._skipEnd} bytes')
+			start = self._skipStart
+			self._skipStart = 0
+		else:
+			start = 0
+		bufPlusData = self._buf + data
+		if self._skipEnd > 0:
+			self._buf = bufPlusData[-self._skipEnd:]
+			end = -self._skipEnd
+		else:
+			end = None
+		self._hasher.update(bufPlusData[start:end])
+		return data
+
+	def digest(self):
+		if self._skipStart > 0 or len(self._buf) != self._skipEnd:
+			raise ValueError('data skipping failed')
+		return self._hasher.digest()
+
+
 class GitMetadata(codearchiver.core.Metadata):
 	fields = (
 		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
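
A minimal usage sketch of the new reader (illustrative only, not part of the commit), assuming a pack-like stream whose first 12 bytes are a header and whose last 20 bytes are a trailer checksum; the payload and chunk size below are made up:

import hashlib
import io

# Fake pack-like stream: 12-byte header, payload, 20-byte trailer (all invented).
payload = b'PACKvvvvnnnn' + b'object data' + b'C' * 20
reader = _HashingFileReader(io.BytesIO(payload), skipStart = 12, skipEnd = 20)
# The first read() must return at least skipStart + skipEnd bytes (see above).
while reader.read(1048576):
	pass  # a consumer like `git index-pack --stdin` would process these chunks
# digest() covers only the bytes between the skipped header and trailer.
assert reader.digest() == hashlib.sha1(b'object data').digest()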
@@ -170,20 +206,46 @@ class Git(codearchiver.core.Module):
 			raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')
 
 		_logger.info('Indexing bundle')
-		# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
-		# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
+		# The bundle's packfile might contain deltified objects.
+		# Those cannot be run through `index-pack` without using `--fix-thin` and running it in a repo containing the missing objects.
+		# However, `--fix-thin` has the side effect that it appends those missing objects to the output pack as well, so they also show up in the `show-index` output afterwards.
+		# The fact that this always appends is undocumented, so it can't simply be relied on.
+		# So this does the following:
+		#  - Index the pack; as the data is being read, calculate a hash of the pack data without header and trailer checksum
+		#  - Verify that the corresponding bytes from the index-pack output file have the same hash.
+		#  - Read the index and filter out any entries that lie beyond the size of the bundle packfile minus 20 (trailer checksum)
+		# This gets an accurate index of exactly which objects are in this pack; some might depend on data from other bundles, but that's fine.
+		# The extra copy to disk is unfortunately unavoidable anyway, but this hash verification at least makes it somewhat valuable.
+
+		# Index with inline hash calculation
+		bundleSize = os.path.getsize(bundle)
 		with open(bundle, 'rb') as fpin:
 			# Skip over header
 			for line in fpin:
 				if line == b'\n':
 					break
-			# Copy remainder (= packfile) to tmp.pack
-			with open('tmp.pack', 'xb') as fpout:
-				shutil.copyfileobj(fpin, fpout)
-		codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
+			packOffset = fpin.tell()
+			hashWrapper = _HashingFileReader(fpin, skipStart = 12, skipEnd = 20)
+			codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', '--fix-thin', '--stdin', '../tmp.pack'], input = hashWrapper, cwd = directory)
+		bundlePackSize = bundleSize - packOffset - 12 - 20
+		bundlePackHash = hashWrapper.digest()
+		# Verify hash of first part of the index-pack output pack
+		with open('tmp.pack', 'rb') as fp:
+			fp.seek(12) # Header
+			indexPackRead = 0
+			hasher = hashlib.sha1()
+			while indexPackRead < bundlePackSize:
+				data = fp.read(min(bundlePackSize - indexPackRead, 1048576))
+				indexPackRead += len(data)
+				hasher.update(data)
+		indexPackHash = hasher.digest()
+		if bundlePackHash != indexPackHash or indexPackRead != bundlePackSize:
+			raise RuntimeError(f'index pack hash comparison failed: expected {bundlePackHash!r} (size: {bundlePackSize}), got {indexPackHash!r} (size: {indexPackRead})')
+		# Parse index
 		with open('tmp.idx', 'rb') as fp:
 			_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
-		indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
+		indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.rstrip('\n').split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize}
 		try:
 			indexObjects = {oid: objects[oid] for oid in indexObjectIds}
 		except KeyError as e:
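
For reference on the arithmetic above: a bundle file is a text header terminated by a blank line, followed by a packfile; the packfile itself starts with a 12-byte header ('PACK', version, object count) and ends with a 20-byte SHA-1 trailer over everything before it, hence `bundlePackSize = bundleSize - packOffset - 12 - 20`. A hypothetical standalone helper (not in this commit) computing the same extent:

import os

def embedded_pack_extent(bundlePath):
	'''Return (packOffset, bundlePackSize) for a bundle file; illustrative sketch only.'''
	with open(bundlePath, 'rb') as fp:
		# The bundle header ends at the first blank line; the packfile follows.
		for line in fp:
			if line == b'\n':
				break
		packOffset = fp.tell()
	# Exclude the 12-byte pack header and the 20-byte trailer checksum.
	bundlePackSize = os.path.getsize(bundlePath) - packOffset - 12 - 20
	return packOffset, bundlePackSize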

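The filtering in the new `indexObjectIds` comprehension can likewise be tried in isolation: `git show-index` prints one 'offset oid (crc32)' line per object, so objects that `--fix-thin` appended beyond the original pack payload are dropped by offset. The sample offsets and object IDs below are made up:

index = (
	'12 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 (dd9c49f8)\n'
	'150 8ab686eafeb1f44702738c8b0f24f2567c36da6d (2e4eb05c)\n'
)
bundlePackSize = 100  # pretend the second object was appended by --fix-thin
indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize}
assert indexObjectIds == {'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'}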
