|
|
@@ -14,6 +14,42 @@ import tempfile |
|
|
|
_logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
|
|
|
class _HashingFileReader: |
|
|
|
'''A tiny wrapper around a file-like object which calculates the hash of the file as it is being read.''' |
|
|
|
|
|
|
|
def __init__(self, fp, hasher = hashlib.sha1, skipStart = 0, skipEnd = 0): |
|
|
|
self._fp = fp |
|
|
|
self._hasher = hasher() |
|
|
|
self._skipStart = skipStart |
|
|
|
self._skipEnd = skipEnd |
|
|
|
self._buf = b'' |
|
|
|
|
|
|
|
def read(self, n): |
|
|
|
data = self._fp.read(n) |
|
|
|
if self._skipStart > 0: |
|
|
|
# Requires that the first block is bigger than skipStart+skipEnd because it otherwise gets very complicated |
|
|
|
# Yes, this fails if the file is smaller than that, but given what it's used for, that's not an issue. |
|
|
|
if len(data) < self._skipStart + self._skipEnd: |
|
|
|
raise ValueError(f'first read is required to return at least {self._skipStart + self._skipEnd} bytes') |
|
|
|
start = self._skipStart |
|
|
|
self._skipStart = 0 |
|
|
|
else: |
|
|
|
start = 0 |
|
|
|
bufPlusData = self._buf + data |
|
|
|
if self._skipEnd > 0: |
|
|
|
self._buf = bufPlusData[-self._skipEnd:] |
|
|
|
end = -self._skipEnd |
|
|
|
else: |
|
|
|
end = None |
|
|
|
self._hasher.update(bufPlusData[start:end]) |
|
|
|
return data |
|
|
|
|
|
|
|
def digest(self): |
|
|
|
if self._skipStart > 0 or len(self._buf) != self._skipEnd: |
|
|
|
raise ValueError('data skipping failed') |
|
|
|
return self._hasher.digest() |
|
|
|
|
|
|
|
|
|
|
|
class GitMetadata(codearchiver.core.Metadata): |
|
|
|
fields = ( |
|
|
|
codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False), |
|
|
@@ -170,20 +206,46 @@ class Git(codearchiver.core.Module): |
|
|
|
raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.') |
|
|
|
|
|
|
|
_logger.info('Indexing bundle') |
|
|
|
# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway. |
|
|
|
# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it. |
|
|
|
|
|
|
|
# The bundle's packfile might contain deltified objects. |
|
|
|
# Those cannot be run through `index-pack` without using `--fix-thin` and running it in a repo containing the missing objects. |
|
|
|
# However, `--fix-thin` has the side effect that it appends those missing objects to the output pack as well, so they also show up in the `show-index` output afterwards. |
|
|
|
# The fact that this always appends is undocumented, so it can't simply be relied on. |
|
|
|
# So this does the following: |
|
|
|
# - Index the pack; as the data is being read, calculate a hash of the pack data without header and trailer checksum |
|
|
|
# - Verify that the corresponding bytes from the index-pack output file have the same hash. |
|
|
|
# - Read the index and filter out any entries that lie beyond the size of the bundle packfile minus 20 (trailer checksum) |
|
|
|
# This gets an accurate index of exactly which objects are in this pack; some might depend on data from other bundles, but that's fine. |
|
|
|
# The extra copy to disk is unfortunately unavoidable anyway, but this hash verification at least makes it somewhat valuable. |
|
|
|
|
|
|
|
# Index with inline hash calculation |
|
|
|
bundleSize = os.path.getsize(bundle) |
|
|
|
with open(bundle, 'rb') as fpin: |
|
|
|
# Skip over header |
|
|
|
for line in fpin: |
|
|
|
if line == b'\n': |
|
|
|
break |
|
|
|
# Copy remainder (= packfile) to tmp.pack |
|
|
|
with open('tmp.pack', 'xb') as fpout: |
|
|
|
shutil.copyfileobj(fpin, fpout) |
|
|
|
codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack']) |
|
|
|
packOffset = fpin.tell() |
|
|
|
hashWrapper = _HashingFileReader(fpin, skipStart = 12, skipEnd = 20) |
|
|
|
codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', '--fix-thin', '--stdin', '../tmp.pack'], input = hashWrapper, cwd = directory) |
|
|
|
bundlePackSize = bundleSize - packOffset - 12 - 20 |
|
|
|
bundlePackHash = hashWrapper.digest() |
|
|
|
# Verify hash of first part of the index-pack output pack |
|
|
|
with open('tmp.pack', 'rb') as fp: |
|
|
|
fp.seek(12) # Header |
|
|
|
indexPackRead = 0 |
|
|
|
hasher = hashlib.sha1() |
|
|
|
while indexPackRead < bundlePackSize: |
|
|
|
data = fp.read(min(bundlePackSize - indexPackRead, 1048576)) |
|
|
|
indexPackRead += len(data) |
|
|
|
hasher.update(data) |
|
|
|
indexPackHash = hasher.digest() |
|
|
|
if bundlePackHash != indexPackHash or indexPackRead != bundlePackSize: |
|
|
|
raise RuntimeError(f'index pack hash comparison failed: expected {bundlePackHash!r} (size: {bundlePackSize}), got {indexPackHash!r} (size: {indexPackRead})') |
|
|
|
# Parse index |
|
|
|
with open('tmp.idx', 'rb') as fp: |
|
|
|
_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp) |
|
|
|
indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()} |
|
|
|
indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.rstrip('\n').split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize} |
|
|
|
try: |
|
|
|
indexObjects = {oid: objects[oid] for oid in indexObjectIds} |
|
|
|
except KeyError as e: |
|
|
|