|
- import codearchiver.core
- import codearchiver.subprocess
- import datetime
- import functools
- import hashlib
- import itertools
- import logging
- import os
- import shutil
- import subprocess
- import tempfile
-
-
# Module-level logger for this module's archival progress and error messages.
_logger = logging.getLogger(__name__)
-
-
class GitMetadata(codearchiver.core.Metadata):
	'''Metadata index schema for git bundles produced by the `Git` module.'''

	# Declarative field schema consumed by codearchiver.core.Metadata:
	# each entry declares whether the key must be present and whether it may repeat.
	fields = (
		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
		# Names of earlier bundles whose objects this (incremental) bundle depends on
		codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
		codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
		codearchiver.core.MetadataField(key = 'Head', required = True, repeatable = False),
		# Parentless commits; used to match related bundles of the same repository
		codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
		# 'oid otype' lines for objects contained in the bundle
		codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True),
	)
	# Schema version of this metadata format
	version = 0
-
-
class Git(codearchiver.core.Module):
	'''
	Archival module for git repositories.

	The repository is mirror-cloned into a temporary directory and then packed into a
	git bundle. When a storage backend is configured, previously stored bundles that
	share a root commit with this clone serve as bases: their commits and tags are
	excluded from the new bundle, producing an incremental bundle. Temporary metadata
	in storage is used to coordinate with concurrent archival processes of related
	repositories.
	'''

	name = 'git'
	MetadataClass = GitMetadata

	@staticmethod
	def matches(inputUrl):
		'''Return whether this module can handle `inputUrl`; it claims any URL ending in `.git`.'''
		return inputUrl.url.endswith('.git')

	def __init__(self, *args, extraBranches = None, **kwargs):
		'''
		:param extraBranches: optional mapping of branch name to commit ID; each commit is
		  fetched explicitly after the clone and recorded as `refs/codearchiver/<branch>`.
		'''
		super().__init__(*args, **kwargs)
		# `None` sentinel instead of a mutable `{}` default argument (which would be a
		# single dict shared across all calls); behaviour for callers is unchanged.
		self._extraBranches = extraBranches if extraBranches is not None else {}

	def _find_storage_bundles(self, criteria, checkOids, temporary = False):
		'''Search `self._storage` for bundles or temporary metadata matching `criteria` and containing at least one element of `checkOids`. Yields tuples `(name, objects, oids)`.'''
		# Temporary metadata (written by still-running processes) lives behind separate
		# storage APIs, hence the method selection here.
		searchMethod = self._storage.search_metadata if not temporary else self._storage.search_temporary_metadata
		openMethod = self._storage.open_metadata if not temporary else self._storage.open_temporary_metadata
		for oldBundle in searchMethod(criteria):
			_logger.info(f'Matching bundle: {oldBundle!r}')
			with openMethod(oldBundle) as fp:
				idx = GitMetadata.deserialise(fp)
			isMatch = False
			oldObjects = set() # all 'oid otype' lines in this bundle (not just commits/tags)
			oldOids = set() # all object IDs in this bundle
			for key, value in idx:
				if key != 'Object':
					continue
				oid, otype = value.split(' ', 1)
				oldObjects.add(value)
				oldOids.add(oid)
				# Only commits and tags decide whether the bundle is a dependency;
				# trees and blobs may legitimately be duplicated across bundles.
				if otype not in ('commit', 'tag'):
					continue
				if not isMatch and oid in checkOids:
					isMatch = True
			if isMatch:
				yield (oldBundle, oldObjects, oldOids)

	def process(self):
		'''
		Clone the repository, determine base bundles from storage, and produce the bundle.

		Returns a `codearchiver.core.Result` containing the bundle file and its metadata.
		Raises `FileExistsError` if the target bundle filename already exists and
		`RuntimeError` on unexpected git output or verification failures.
		'''
		with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.git.', dir = os.getcwd()) as directory:
			bundle = f'{self._id}_git.bundle'
			if os.path.exists(bundle):
				_logger.fatal(f'{bundle!r} already exists')
				raise FileExistsError(f'{bundle!r} already exists')

			# Record the exact git version in the metadata; validate the output shape strictly.
			_, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version'])
			if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '':
				raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}')
			gitVersion = gitVersion[12:-1]

			_logger.info(f'Cloning {self._url} into {directory}')
			# NOTE(review): datetime.utcnow() is naive and deprecated since Python 3.12;
			# kept as-is because downstream metadata may depend on the naive representation.
			startTime = datetime.datetime.utcnow()
			# GIT_TERMINAL_PROMPT=0 prevents git from hanging on interactive credential prompts.
			codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'})

			if self._extraBranches:
				for branch, commit in self._extraBranches.items():
					_logger.info(f'Fetching commit {commit} as {branch}')
					# check = False: a missing extra commit is logged but not fatal.
					r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False)
					if r == 0:
						r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False)
						if r2 != 0:
							_logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}')
					else:
						_logger.error(f'Failed to fetch {commit}')
				# This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
			endTime = datetime.datetime.utcnow()

			_logger.info('Collecting repository metadata')
			_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
			refs = list(map(str.strip, refs.splitlines()))
			# Root commits (no parents) identify the repository across renames/forks.
			_, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory)
			rootCommits = list(filter(None, rootCommits.splitlines()))
			# Map every object ID in the clone to its type; each output line is 'oid otype osize'.
			_, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory)
			objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}
			with open(os.path.join(directory, 'HEAD'), 'r') as fp:
				head = fp.read()
			if not head.startswith('ref: refs/heads/') or not head.endswith('\n'):
				raise RuntimeError(f'Unexpected HEAD content: {head!r}')
			head = head[:-1] # Remove trailing \n

			metadata = self.create_metadata(bundle, startTime, endTime)
			metadata.append('Git version', gitVersion)
			for line in refs:
				metadata.append('Ref', line)
			metadata.append('Head', head)
			for commitId in rootCommits:
				metadata.append('Root commit', commitId)

			# Check whether there are relevant prior bundles to create an incremental one
			commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
			tmpMetadataDependencies = [] # temporary metadata names this depends on, to be resolved later
			baseOids = set() # all oids this depends on (including temporary metadata, but only commits and tags from there)
			baseInProgressObjects = set() # 'oid otype' lines for finding the bundles at the end
			newCommitsAndTags = set() # oids of commits and tags not covered in previous bundles or existing temporary metadata
			temporaryMetadataName = None
			if self._storage:
				_logger.info('Checking for previous bundles')

				# A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
				# In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn} as the universe and Bi ∩ I as the subsets.
				# Fortunately, solving the actual set cover problem is not necessary.
				# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
				# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.

				# To support parallel archival of related repositories, this uses other processes' temporary metadata from and writes its own to storage.
				# First, obtain all relevant prior bundles.
				# Second, obtain all relevant temporary metadata. Make a note of these and also exclude their commits from this bundle. Write own temporary metadata.
				# Third, upon completion (below), wait for the depended-on temporary metadata to disappear, search for the corresponding bundles, and finalise own metadata.

				with self._storage.lock():
					for oldBundleName, oldObjects, oldOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags):
						metadata.append('Based on bundle', oldBundleName)
						baseOids |= oldOids
					for tmpMetadataName, tmpObjects, tmpOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags, temporary = True):
						tmpMetadataDependencies.append(tmpMetadataName)
						baseOids |= tmpOids
						baseInProgressObjects |= tmpObjects

					newCommitsAndTags = commitsAndTags - baseOids
					for oid in newCommitsAndTags:
						metadata.append('Object', f'{oid} {objects[oid]}')
					temporaryMetadataName = self._storage.add_temporary_metadata(metadata)

			try:
				_logger.info(f'Bundling into {bundle}')
				cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all']
				# '^oid' lines on stdin mark already-archived commits/tags as prerequisites,
				# so the bundle only contains what is new relative to the base bundles.
				objectsToExclude = baseOids & commitsAndTags
				input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii')
				status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False)
				if status == 128 and (stderr == 'fatal: Refusing to create empty bundle.\n' or stderr.endswith('\nfatal: Refusing to create empty bundle.\n')):
					# Manually write an empty bundle instead
					# Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
					_logger.info('Writing empty bundle directly instead')
					with open(bundle, 'xb') as fp:
						fp.write(b'# v2 git bundle\n') # bundle signature
						fp.write(b'\n') # bundle end of prerequisites and refs
						packdata = b'PACK' # pack signature
						packdata += b'\0\0\0\x02' # pack version
						packdata += b'\0\0\0\0' # pack number of objects
						fp.write(packdata)
						fp.write(hashlib.sha1(packdata).digest()) # pack checksum trailer
				elif status != 0:
					raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')

				_logger.info('Indexing bundle')
				# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
				# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
				with open(bundle, 'rb') as fpin:
					# Skip over header
					for line in fpin:
						if line == b'\n':
							break
					# Copy remainder (= packfile) to tmp.pack
					with open('tmp.pack', 'xb') as fpout:
						shutil.copyfileobj(fpin, fpout)
				codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
				with open('tmp.idx', 'rb') as fp:
					_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
				indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
				try:
					indexObjects = {oid: objects[oid] for oid in indexObjectIds}
				except KeyError as e:
					# This should never happen since the bundle is created from the clone with exclusions...
					raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e
				os.remove('tmp.pack')
				os.remove('tmp.idx')

				_logger.info('Checking for submodules')
				_, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory)
				if commitsWithSubmodules:
					_logger.warning('Submodules found but extraction not supported')

				# Ensure that all commits and tags included in the temporary metadata made it into the pack, else data may be lost!
				indexCommitsAndTags = {oid for oid, otype in indexObjects.items() if otype in ('commit', 'tag')}
				if newCommitsAndTags - indexCommitsAndTags != set():
					raise RuntimeError('Bundle does not contain all commits/tags that were written to temporary metadata, aborting due to data loss risk')
				for oid, otype in indexObjects.items():
					if oid in newCommitsAndTags:
						# Already added to metadata earlier
						continue
					metadata.append('Object', f'{oid} {otype}')

				# Bundling completed without issues; wait for depended-on bundles, add them to the metadata, then replace own temporary metadata
				if self._storage:
					self._storage.wait_temporary_metadata(tmpMetadataDependencies)
					with self._storage.lock():
						criteria = [('Module', type(self).name), ('Root commit', tuple(rootCommits)), ('Object', tuple(baseInProgressObjects))]
						missingObjects = baseInProgressObjects.copy()
						for oldBundleName, oldObjects, oldOids in self._find_storage_bundles(criteria, {value.split(' ', 1)[0] for value in baseInProgressObjects}):
							metadata.append('Based on bundle', oldBundleName)
							baseOids |= oldOids
							missingObjects -= oldObjects

						# Verification: all commit/tag objects collected from temporary metadata must be covered
						if missingObjects:
							raise RuntimeError('Resolved temporary metadata bundles do not cover all expected objects')

						# Verification: all objects in the clone are either in a base bundle or in the index
						# This can only be done here because all oids are needed, not just the commit/tag objects
						if objects.keys() - (baseOids | indexObjectIds) != set():
							raise RuntimeError('Object mismatch between clone and bundles')

						self._storage.replace_temporary_metadata(temporaryMetadataName, bundle, metadata)
			except BaseException:
				# Attempt to remove the temporary metadata, then reraise.
				# BaseException (equivalent to a bare `except:`) so that e.g. KeyboardInterrupt also cleans up.
				if self._storage:
					with self._storage.lock():
						self._storage.remove_temporary_metadata(temporaryMetadataName)
				raise

		return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])

	def __repr__(self):
		return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'
|