From 66666a153834d1cf1d469d76f2d40cf473a43000 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Tue, 28 Mar 2023 04:36:46 +0000
Subject: [PATCH] Workaround for incremental bundles with deltified objects

---
 codearchiver/modules/git.py | 76 +++++++++++++++++++++++++++++++++----
 1 file changed, 69 insertions(+), 7 deletions(-)

diff --git a/codearchiver/modules/git.py b/codearchiver/modules/git.py
index 70dd4da..a64d836 100644
--- a/codearchiver/modules/git.py
+++ b/codearchiver/modules/git.py
@@ -14,6 +14,42 @@ import tempfile
 
 _logger = logging.getLogger(__name__)
 
+class _HashingFileReader:
+	'''A tiny wrapper around a file-like object which calculates the hash of the file as it is being read.'''
+
+	def __init__(self, fp, hasher = hashlib.sha1, skipStart = 0, skipEnd = 0):
+		self._fp = fp
+		self._hasher = hasher()
+		self._skipStart = skipStart
+		self._skipEnd = skipEnd
+		self._buf = b''
+
+	def read(self, n):
+		data = self._fp.read(n)
+		if self._skipStart > 0:
+			# Requires that the first block is bigger than skipStart+skipEnd because it otherwise gets very complicated
+			# Yes, this fails if the file is smaller than that, but given what it's used for, that's not an issue.
+			if len(data) < self._skipStart + self._skipEnd:
+				raise ValueError(f'first read is required to return at least {self._skipStart + self._skipEnd} bytes')
+			start = self._skipStart
+			self._skipStart = 0
+		else:
+			start = 0
+		bufPlusData = self._buf + data
+		if self._skipEnd > 0:
+			self._buf = bufPlusData[-self._skipEnd:]
+			end = -self._skipEnd
+		else:
+			end = None
+		self._hasher.update(bufPlusData[start:end])
+		return data
+
+	def digest(self):
+		if self._skipStart > 0 or len(self._buf) != self._skipEnd:
+			raise ValueError('data skipping failed')
+		return self._hasher.digest()
+
+
 class GitMetadata(codearchiver.core.Metadata):
 	fields = (
 		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
@@ -170,20 +206,46 @@ class Git(codearchiver.core.Module):
 				raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')
 
 		_logger.info('Indexing bundle')
-		# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
-		# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
+
+		# The bundle's packfile might contain deltified objects.
+		# Those cannot be run through `index-pack` without using `--fix-thin` and running it in a repo containing the missing objects.
+		# However, `--fix-thin` has the side effect that it appends those missing objects to the output pack as well, so they also show up in the `show-index` output afterwards.
+		# The fact that this always appends is undocumented, so it can't simply be relied on.
+		# So this does the following:
+		# - Index the pack; as the data is being read, calculate a hash of the pack data without header and trailer checksum
+		# - Verify that the corresponding bytes from the index-pack output file have the same hash.
+		# - Read the index and filter out any entries that lie beyond the size of the bundle packfile minus 20 (trailer checksum)
+		# This gets an accurate index of exactly which objects are in this pack; some might depend on data from other bundles, but that's fine.
+		# The extra copy to disk is unfortunately unavoidable anyway, but this hash verification at least makes it somewhat valuable.
+
+		# Index with inline hash calculation
+		bundleSize = os.path.getsize(bundle)
 		with open(bundle, 'rb') as fpin:
 			# Skip over header
 			for line in fpin:
 				if line == b'\n':
 					break
-			# Copy remainder (= packfile) to tmp.pack
-			with open('tmp.pack', 'xb') as fpout:
-				shutil.copyfileobj(fpin, fpout)
-		codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
+			packOffset = fpin.tell()
+			hashWrapper = _HashingFileReader(fpin, skipStart = 12, skipEnd = 20)
+			codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', '--fix-thin', '--stdin', '../tmp.pack'], input = hashWrapper, cwd = directory)
+		bundlePackSize = bundleSize - packOffset - 12 - 20
+		bundlePackHash = hashWrapper.digest()
+		# Verify hash of first part of the index-pack output pack
+		with open('tmp.pack', 'rb') as fp:
+			fp.seek(12)  # Header
+			indexPackRead = 0
+			hasher = hashlib.sha1()
+			while indexPackRead < bundlePackSize:
+				data = fp.read(min(bundlePackSize - indexPackRead, 1048576))
+				indexPackRead += len(data)
+				hasher.update(data)
+			indexPackHash = hasher.digest()
+		if bundlePackHash != indexPackHash or indexPackRead != bundlePackSize:
+			raise RuntimeError(f'index pack hash comparison failed: expected {bundlePackHash!r} (size: {bundlePackSize}), got {indexPackHash!r} (size: {indexPackRead})')
+		# Parse index
 		with open('tmp.idx', 'rb') as fp:
 			_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
-		indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
+		indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.rstrip('\n').split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize}
 		try:
 			indexObjects = {oid: objects[oid] for oid in indexObjectIds}
 		except KeyError as e:
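
For illustration, a minimal usage sketch of the hashing wrapper (not part of the patch; it assumes the _HashingFileReader class from the diff above is in scope, and the sample data is made up). It feeds a fake packfile through the wrapper in chunks, the way `git index-pack --stdin` would read it, and checks that the digest equals a direct SHA-1 of the pack data with the 12-byte header and 20-byte trailer checksum stripped:

	import hashlib
	import io

	payload = b'example pack object data' * 100
	# Fake packfile layout: 12-byte header, payload, 20-byte trailer checksum.
	fakePack = b'PACK' + bytes(8) + payload + bytes(20)

	reader = _HashingFileReader(io.BytesIO(fakePack), skipStart = 12, skipEnd = 20)
	# The first read must return at least skipStart + skipEnd = 32 bytes (see read()).
	while reader.read(1024):
		pass

	assert reader.digest() == hashlib.sha1(payload).digest()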
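And a sketch of the index filtering step (again not part of the patch; the offsets, object IDs, and CRC values below are invented, though real `git show-index` output has the same 'offset sha1 (crc32)' shape). Offsets are absolute positions in the packfile, so entries at or beyond the original bundle's pack data are the objects `--fix-thin` appended and get dropped, mirroring the set comprehension in the diff:

	bundlePackSize = 300  # pack data size without the 12-byte header and 20-byte trailer

	index = (
		'12 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa (089a364d)\n'
		'150 bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb (1d8cd98f)\n'
		'350 cccccccccccccccccccccccccccccccccccccccc (d202ef8d)\n'
	)

	indexObjectIds = {
		oid
		for offset, oid, _ in (l.split(' ', 2) for l in index.splitlines())
		if int(offset) < bundlePackSize
	}
	# The entry at offset 350 lies past the original pack data and is excluded.
	assert indexObjectIds == {40 * 'a', 40 * 'b'}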