import codearchiver.core
import codearchiver.subprocess
import datetime
import functools
import hashlib
import itertools
import logging
import os
import shutil
import subprocess
import tempfile


_logger = logging.getLogger(__name__)


class _HashingFileReader:
	'''A tiny wrapper around a file-like object which calculates the hash of the file as it is being read.'''
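	# Illustrative use (hypothetical consumer, not part of the archival flow):
	#   reader = _HashingFileReader(fp, skipStart = 12, skipEnd = 20)
	#   while (chunk := reader.read(65536)):
	#       consume(chunk)  # receives the full stream, including the skipped bytes
	#   digest = reader.digest()  # hash of everything except the first 12 and last 20 bytes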

	def __init__(self, fp, hasher = hashlib.sha1, skipStart = 0, skipEnd = 0):
		self._fp = fp
		self._hasher = hasher()
		self._skipStart = skipStart
		self._skipEnd = skipEnd
		self._buf = b''

	def read(self, n):
		data = self._fp.read(n)
		if self._skipStart > 0:
			# Requires that the first read returns at least skipStart + skipEnd bytes because it otherwise gets very complicated.
			# Yes, this fails if the file is smaller than that, but given what it's used for, that's not an issue.
			if len(data) < self._skipStart + self._skipEnd:
				raise ValueError(f'first read is required to return at least {self._skipStart + self._skipEnd} bytes')
			start = self._skipStart
			self._skipStart = 0
		else:
			start = 0
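		# Carry the last skipEnd bytes seen so far in self._buf; a byte is hashed
		# only once a later read shows that it is not part of the trailing skipped region.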
		bufPlusData = self._buf + data
		if self._skipEnd > 0:
			self._buf = bufPlusData[-self._skipEnd:]
			end = -self._skipEnd
		else:
			end = None
		self._hasher.update(bufPlusData[start:end])
		return data

	def digest(self):
		if self._skipStart > 0 or len(self._buf) != self._skipEnd:
			raise ValueError('data skipping failed')
		return self._hasher.digest()


class GitMetadata(codearchiver.core.Metadata):
	fields = (
		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
		codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
		codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
		codearchiver.core.MetadataField(key = 'Head', required = True, repeatable = False),
		codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
		codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True),
	)
	version = 0
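	# 'Object' values are '<oid> <otype>' lines as written by Git.process below;
	# they record exactly which objects a bundle contains.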


class Git(codearchiver.core.Module):
	name = 'git'
	MetadataClass = GitMetadata

	@staticmethod
	def matches(inputUrl):
		return inputUrl.url.endswith('.git')

	def __init__(self, *args, extraBranches = {}, **kwargs):
		super().__init__(*args, **kwargs)
		self._extraBranches = extraBranches

	def _find_storage_bundles(self, criteria, checkOids, temporary = False):
		'''Search `self._storage` for bundles or temporary metadata matching `criteria` and containing at least one element of `checkOids`. Yields tuples `(name, objects, oids)`.'''
		searchMethod = self._storage.search_metadata if not temporary else self._storage.search_temporary_metadata
		openMethod = self._storage.open_metadata if not temporary else self._storage.open_temporary_metadata
		for oldBundle in searchMethod(criteria):
			_logger.info(f'Matching bundle: {oldBundle!r}')
			with openMethod(oldBundle) as fp:
				idx = GitMetadata.deserialise(fp)
			isMatch = False
			oldObjects = set()  # 'oid otype' lines of all objects in this bundle
			oldOids = set()  # IDs of all objects in this bundle
			for key, value in idx:
				if key != 'Object':
					continue
				oid, otype = value.split(' ', 1)
				oldObjects.add(value)
				oldOids.add(oid)
				if otype not in ('commit', 'tag'):
					continue
				if not isMatch and oid in checkOids:
					isMatch = True
			if isMatch:
				yield (oldBundle, oldObjects, oldOids)

	def process(self):
		with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.git.', dir = os.getcwd()) as directory:
			bundle = f'{self._id}_git.bundle'
			if os.path.exists(bundle):
				_logger.fatal(f'{bundle!r} already exists')
				raise FileExistsError(f'{bundle!r} already exists')

			_, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version'])
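			# Expected output is e.g. 'git version 2.39.2\n'; validate the format strictly before extracting the bare version number.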
			if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '':
				raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}')
			gitVersion = gitVersion[12:-1]

			_logger.info(f'Cloning {self._url} into {directory}')
			startTime = datetime.datetime.utcnow()
			codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'})

			if self._extraBranches:
				for branch, commit in self._extraBranches.items():
					_logger.info(f'Fetching commit {commit} as {branch}')
					r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False)
					if r == 0:
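						# The trailing empty <old-oid> argument makes update-ref fail if the ref already exists (cf. git-update-ref(1)).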
						r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False)
						if r2 != 0:
							_logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}')
					else:
						_logger.error(f'Failed to fetch {commit}')
				# This leaves behind a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
			endTime = datetime.datetime.utcnow()

			_logger.info('Collecting repository metadata')
			_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
			refs = list(map(str.strip, refs.splitlines()))
			_, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory)
			rootCommits = list(filter(None, rootCommits.splitlines()))
			_, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory)
			objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}
			with open(os.path.join(directory, 'HEAD'), 'r') as fp:
				head = fp.read()
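			# HEAD of a fresh mirror clone is expected to be a symbolic ref, e.g. 'ref: refs/heads/main\n'.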
			if not head.startswith('ref: refs/heads/') or not head.endswith('\n'):
				raise RuntimeError(f'Unexpected HEAD content: {head!r}')
			head = head[:-1]  # Remove trailing \n

			metadata = self.create_metadata(bundle, startTime, endTime)
			metadata.append('Git version', gitVersion)
			for line in refs:
				metadata.append('Ref', line)
			metadata.append('Head', head)
			for commitId in rootCommits:
				metadata.append('Root commit', commitId)

			# Check whether there are relevant prior bundles to create an incremental one
			commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
			tmpMetadataDependencies = []  # temporary metadata names this depends on, to be resolved later
			baseOids = set()  # all oids this depends on (including temporary metadata, but only commits and tags from there)
			baseInProgressObjects = set()  # 'oid otype' lines for finding the bundles at the end
			newCommitsAndTags = set()  # oids of commits and tags not covered in previous bundles or existing temporary metadata
			temporaryMetadataName = None
			if self._storage:
				_logger.info('Checking for previous bundles')

				# A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
				# In other words, in the general case, this is a set cover problem with I = S ∩ (B0 ∪ … ∪ Bn) as the universe and the Bi ∩ I as the subsets.
				# Fortunately, solving the actual set cover problem is not necessary.
				# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
				# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.

				# To support parallel archival of related repositories, this reads other processes' temporary metadata from storage and writes its own temporary metadata there.
				# First, obtain all relevant prior bundles.
				# Second, obtain all relevant temporary metadata. Make a note of these and also exclude their commits from this bundle. Write own temporary metadata.
				# Third, upon completion (below), wait for the depended-on temporary metadata to disappear, search for the corresponding bundles, and finalise own metadata.

				with self._storage.lock():
					for oldBundleName, oldObjects, oldOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags):
						metadata.append('Based on bundle', oldBundleName)
						baseOids |= oldOids
					for tmpMetadataName, tmpObjects, tmpOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags, temporary = True):
						tmpMetadataDependencies.append(tmpMetadataName)
						baseOids |= tmpOids
						baseInProgressObjects |= tmpObjects

					newCommitsAndTags = commitsAndTags - baseOids
					for oid in newCommitsAndTags:
						metadata.append('Object', f'{oid} {objects[oid]}')
					temporaryMetadataName = self._storage.add_temporary_metadata(metadata)

			try:
				_logger.info(f'Bundling into {bundle}')
				cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all']
				objectsToExclude = baseOids & commitsAndTags
				del commitsAndTags
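				# With --stdin, git bundle reads additional rev-list arguments from standard input; '^<oid>' entries exclude already-archived history and become the bundle's prerequisites.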
				input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii')
				del objectsToExclude
				status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False)
				del input
				if status == 128 and (stderr == 'fatal: Refusing to create empty bundle.\n' or stderr.endswith('\nfatal: Refusing to create empty bundle.\n')):
					# Manually write an empty bundle instead
					# Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
					_logger.info('Writing empty bundle directly instead')
					with open(bundle, 'xb') as fp:
						fp.write(b'# v2 git bundle\n')  # bundle signature
						fp.write(b'\n')  # bundle end of prerequisites and refs
						packdata = b'PACK'  # pack signature
						packdata += b'\0\0\0\x02'  # pack version
						packdata += b'\0\0\0\0'  # pack number of objects
						fp.write(packdata)
						fp.write(hashlib.sha1(packdata).digest())  # pack checksum trailer
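					# The result is a fixed 49-byte file: 16-byte signature line, 1-byte blank line, 12-byte pack header, 20-byte SHA-1 trailer.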
				elif status != 0:
					raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')

				_logger.info('Indexing bundle')

				# The bundle's packfile might contain deltified objects.
				# Those cannot be run through `index-pack` without using `--fix-thin` and running it in a repo containing the missing objects.
				# However, `--fix-thin` has the side effect that it appends those missing objects to the output pack as well, so they also show up in the `show-index` output afterwards.
				# The fact that this always appends is undocumented, so it can't simply be relied on.
				# So this does the following:
				# - Index the pack; as the data is being read, calculate a hash of the pack data without header and trailer checksum.
				# - Verify that the corresponding bytes from the index-pack output file have the same hash.
				# - Read the index and filter out any entries that lie beyond the size of the bundle packfile minus 20 (trailer checksum).
				# This gets an accurate index of exactly which objects are in this pack; some might depend on data from other bundles, but that's fine.
				# The extra copy to disk is unfortunately unavoidable anyway, but this hash verification at least makes it somewhat valuable.

				# Index with inline hash calculation
				bundleSize = os.path.getsize(bundle)
				with open(bundle, 'rb') as fpin:
					# Skip over header
					for line in fpin:
						if line == b'\n':
							break
					packOffset = fpin.tell()
					hashWrapper = _HashingFileReader(fpin, skipStart = 12, skipEnd = 20)
					codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', '--fix-thin', '--stdin', '../tmp.pack'], input = hashWrapper, cwd = directory)
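					# Size of the pack data between the 12-byte pack header and the 20-byte trailer checksum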
					bundlePackSize = bundleSize - packOffset - 12 - 20
					bundlePackHash = hashWrapper.digest()
				# Verify hash of first part of the index-pack output pack
				with open('tmp.pack', 'rb') as fp:
					fp.seek(12)  # Header
					indexPackRead = 0
					hasher = hashlib.sha1()
					while indexPackRead < bundlePackSize:
						data = fp.read(min(bundlePackSize - indexPackRead, 1048576))
						if not data:
							# Unexpected EOF; bail out and let the size check below report the mismatch.
							break
						indexPackRead += len(data)
						hasher.update(data)
					indexPackHash = hasher.digest()
				if bundlePackHash != indexPackHash or indexPackRead != bundlePackSize:
					raise RuntimeError(f'index pack hash comparison failed: expected {bundlePackHash!r} (size: {bundlePackSize}), got {indexPackHash!r} (size: {indexPackRead})')
				# Parse index
				with open('tmp.idx', 'rb') as fp:
					_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
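				# show-index lines look like '<offset> <oid> (<crc32>)'; offsets are relative to the pack start, so entries appended by --fix-thin lie beyond the original pack data.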
				indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.rstrip('\n').split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize}
				del index
				try:
					indexObjects = {oid: objects[oid] for oid in indexObjectIds}
				except KeyError as e:
					# This should never happen since the bundle is created from the clone with exclusions...
					raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e
				os.remove('tmp.pack')
				os.remove('tmp.idx')

				_logger.info('Checking for submodules')
				_, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory)
				if commitsWithSubmodules:
					_logger.warning('Submodules found but extraction not supported')
				del commitsWithSubmodules

				# Ensure that all commits and tags included in the temporary metadata made it into the pack, else data may be lost!
				indexCommitsAndTags = {oid for oid, otype in indexObjects.items() if otype in ('commit', 'tag')}
				if newCommitsAndTags - indexCommitsAndTags != set():
					raise RuntimeError('Bundle does not contain all commits/tags that were written to temporary metadata, aborting due to data loss risk')
				for oid, otype in indexObjects.items():
					if oid in newCommitsAndTags:
						# Already added to metadata earlier
						continue
					metadata.append('Object', f'{oid} {otype}')
				del indexObjects, indexCommitsAndTags

				# Bundling completed without issues; wait for depended-on bundles, add them to the metadata, then replace own temporary metadata
				if self._storage:
					if tmpMetadataDependencies:
						self._storage.wait_temporary_metadata(tmpMetadataDependencies)
					with self._storage.lock():
						if tmpMetadataDependencies:
							criteria = [('Module', type(self).name), ('Root commit', tuple(rootCommits)), ('Object', tuple(baseInProgressObjects))]
							missingObjects = baseInProgressObjects.copy()
							for oldBundleName, oldObjects, oldOids in self._find_storage_bundles(criteria, {value.split(' ', 1)[0] for value in baseInProgressObjects}):
								metadata.append('Based on bundle', oldBundleName)
								baseOids |= oldOids
								missingObjects -= oldObjects

							# Verification: all commit/tag objects collected from temporary metadata must be covered
							if missingObjects:
								raise RuntimeError('Resolved temporary metadata bundles do not cover all expected objects')

						# Verification: all objects in the clone are either in a base bundle or in the index
						# This can only be done here because all oids are needed, not just the commit/tag objects
						if objects.keys() - (baseOids | indexObjectIds) != set():
							raise RuntimeError('Object mismatch between clone and bundles')

						self._storage.replace_temporary_metadata(temporaryMetadataName, bundle, metadata)
			except:
				# Attempt to remove the temporary metadata, then reraise
				if self._storage:
					with self._storage.lock():
						self._storage.remove_temporary_metadata(temporaryMetadataName)
				raise

		return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])

	def __repr__(self):
		return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'