|
- import codearchiver.core
- import codearchiver.subprocess
- import datetime
- import functools
- import hashlib
- import itertools
- import logging
- import os
- import shutil
- import subprocess
- import tempfile
-
-
# Module-level logger for this module's archival progress and error messages.
_logger = logging.getLogger(__name__)
-
-
class GitMetadata(codearchiver.core.Metadata):
	'''Metadata index schema for git bundles produced by the `Git` module.'''

	# Declarative field schema consumed by codearchiver.core.Metadata:
	# each entry declares whether the key must be present and whether it may repeat.
	fields = (
		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
		# Names of earlier bundles whose objects this (incremental) bundle depends on
		codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
		codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
		codearchiver.core.MetadataField(key = 'Head', required = True, repeatable = False),
		# Parentless commits; used to match related bundles of the same repository
		codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
		# 'oid otype' lines for objects contained in the bundle
		codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True),
	)
	# Schema version of this metadata format
	version = 0
-
-
class Git(codearchiver.core.Module):
	'''
	Archival module for git repositories.

	The repository is mirror-cloned into a temporary directory and then packed into a
	git bundle. When a storage backend is configured, previously stored bundles that
	share a root commit with this clone serve as bases: their commits and tags are
	excluded from the new bundle, producing an incremental bundle. Temporary metadata
	in storage is used to coordinate with concurrent archival processes of related
	repositories.
	'''

	name = 'git'
	MetadataClass = GitMetadata

	@staticmethod
	def matches(inputUrl):
		'''Return whether this module can handle `inputUrl`; it claims any URL ending in `.git`.'''
		return inputUrl.url.endswith('.git')

	def __init__(self, *args, extraBranches = None, **kwargs):
		'''
		:param extraBranches: optional mapping of branch name to commit ID; each commit is
		  fetched explicitly after the clone and recorded as `refs/codearchiver/<branch>`.
		'''
		super().__init__(*args, **kwargs)
		# `None` sentinel instead of a mutable `{}` default argument (which would be a
		# single dict shared across all calls); behaviour for callers is unchanged.
		self._extraBranches = extraBranches if extraBranches is not None else {}

	def _find_storage_bundles(self, criteria, checkOids, temporary = False):
		'''Search `self._storage` for bundles or temporary metadata matching `criteria` and containing at least one element of `checkOids`. Yields tuples `(name, objects, oids)`.'''
		# Temporary metadata (written by still-running processes) lives behind separate
		# storage APIs, hence the method selection here.
		searchMethod = self._storage.search_metadata if not temporary else self._storage.search_temporary_metadata
		openMethod = self._storage.open_metadata if not temporary else self._storage.open_temporary_metadata
		for oldBundle in searchMethod(criteria):
			_logger.info(f'Matching bundle: {oldBundle!r}')
			with openMethod(oldBundle) as fp:
				idx = GitMetadata.deserialise(fp)
			isMatch = False
			oldObjects = set() # all 'oid otype' lines in this bundle (not just commits/tags)
			oldOids = set() # all object IDs in this bundle
			for key, value in idx:
				if key != 'Object':
					continue
				oid, otype = value.split(' ', 1)
				oldObjects.add(value)
				oldOids.add(oid)
				# Only commits and tags decide whether the bundle is a dependency;
				# trees and blobs may legitimately be duplicated across bundles.
				if otype not in ('commit', 'tag'):
					continue
				if not isMatch and oid in checkOids:
					isMatch = True
			if isMatch:
				yield (oldBundle, oldObjects, oldOids)

	def process(self):
		'''
		Clone the repository, determine base bundles from storage, and produce the bundle.

		Returns a `codearchiver.core.Result` containing the bundle file and its metadata.
		Raises `FileExistsError` if the target bundle filename already exists and
		`RuntimeError` on unexpected git output or verification failures.
		'''
		with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.git.', dir = os.getcwd()) as directory:
			bundle = f'{self._id}_git.bundle'
			if os.path.exists(bundle):
				_logger.fatal(f'{bundle!r} already exists')
				raise FileExistsError(f'{bundle!r} already exists')

			# Record the exact git version in the metadata; validate the output shape strictly.
			_, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version'])
			if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '':
				raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}')
			gitVersion = gitVersion[12:-1]

			_logger.info(f'Cloning {self._url} into {directory}')
			# NOTE(review): datetime.utcnow() is naive and deprecated since Python 3.12;
			# kept as-is because downstream metadata may depend on the naive representation.
			startTime = datetime.datetime.utcnow()
			# GIT_TERMINAL_PROMPT=0 prevents git from hanging on interactive credential prompts.
			codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'})

			if self._extraBranches:
				for branch, commit in self._extraBranches.items():
					_logger.info(f'Fetching commit {commit} as {branch}')
					# check = False: a missing extra commit is logged but not fatal.
					r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False)
					if r == 0:
						r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False)
						if r2 != 0:
							_logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}')
					else:
						_logger.error(f'Failed to fetch {commit}')
				# This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
			endTime = datetime.datetime.utcnow()

			_logger.info('Collecting repository metadata')
			_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
			refs = list(map(str.strip, refs.splitlines()))
			# Root commits (no parents) identify the repository across renames/forks.
			_, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory)
			rootCommits = list(filter(None, rootCommits.splitlines()))
			# Map every object ID in the clone to its type; each output line is 'oid otype osize'.
			_, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory)
			objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}
			with open(os.path.join(directory, 'HEAD'), 'r') as fp:
				head = fp.read()
			if not head.startswith('ref: refs/heads/') or not head.endswith('\n'):
				raise RuntimeError(f'Unexpected HEAD content: {head!r}')
			head = head[:-1] # Remove trailing \n

			metadata = self.create_metadata(bundle, startTime, endTime)
			metadata.append('Git version', gitVersion)
			for line in refs:
				metadata.append('Ref', line)
			metadata.append('Head', head)
			for commitId in rootCommits:
				metadata.append('Root commit', commitId)

			# Check whether there are relevant prior bundles to create an incremental one
			commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
			tmpMetadataDependencies = [] # temporary metadata names this depends on, to be resolved later
			baseOids = set() # all oids this depends on (including temporary metadata, but only commits and tags from there)
			baseInProgressObjects = set() # 'oid otype' lines for finding the bundles at the end
			newCommitsAndTags = set() # oids of commits and tags not covered in previous bundles or existing temporary metadata
			temporaryMetadataName = None
			if self._storage:
				_logger.info('Checking for previous bundles')

				# A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
				# In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn} as the universe and Bi ∩ I as the subsets.
				# Fortunately, solving the actual set cover problem is not necessary.
				# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
				# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.

				# To support parallel archival of related repositories, this uses other processes' temporary metadata from and writes its own to storage.
				# First, obtain all relevant prior bundles.
				# Second, obtain all relevant temporary metadata. Make a note of these and also exclude their commits from this bundle. Write own temporary metadata.
				# Third, upon completion (below), wait for the depended-on temporary metadata to disappear, search for the corresponding bundles, and finalise own metadata.

				with self._storage.lock():
					for oldBundleName, oldObjects, oldOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags):
						metadata.append('Based on bundle', oldBundleName)
						baseOids |= oldOids
					for tmpMetadataName, tmpObjects, tmpOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags, temporary = True):
						tmpMetadataDependencies.append(tmpMetadataName)
						baseOids |= tmpOids
						baseInProgressObjects |= tmpObjects

					newCommitsAndTags = commitsAndTags - baseOids
					for oid in newCommitsAndTags:
						metadata.append('Object', f'{oid} {objects[oid]}')
					temporaryMetadataName = self._storage.add_temporary_metadata(metadata)

			try:
				_logger.info(f'Bundling into {bundle}')
				cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all']
				# '^oid' lines on stdin mark already-archived commits/tags as prerequisites,
				# so the bundle only contains what is new relative to the base bundles.
				objectsToExclude = baseOids & commitsAndTags
				input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii')
				status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False)
				if status == 128 and (stderr == 'fatal: Refusing to create empty bundle.\n' or stderr.endswith('\nfatal: Refusing to create empty bundle.\n')):
					# Manually write an empty bundle instead
					# Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
					_logger.info('Writing empty bundle directly instead')
					with open(bundle, 'xb') as fp:
						fp.write(b'# v2 git bundle\n') # bundle signature
						fp.write(b'\n') # bundle end of prerequisites and refs
						packdata = b'PACK' # pack signature
						packdata += b'\0\0\0\x02' # pack version
						packdata += b'\0\0\0\0' # pack number of objects
						fp.write(packdata)
						fp.write(hashlib.sha1(packdata).digest()) # pack checksum trailer
				elif status != 0:
					raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')

				_logger.info('Indexing bundle')
				# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
				# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
				with open(bundle, 'rb') as fpin:
					# Skip over header
					for line in fpin:
						if line == b'\n':
							break
					# Copy remainder (= packfile) to tmp.pack
					with open('tmp.pack', 'xb') as fpout:
						shutil.copyfileobj(fpin, fpout)
				codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
				with open('tmp.idx', 'rb') as fp:
					_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
				indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
				try:
					indexObjects = {oid: objects[oid] for oid in indexObjectIds}
				except KeyError as e:
					# This should never happen since the bundle is created from the clone with exclusions...
					raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e
				os.remove('tmp.pack')
				os.remove('tmp.idx')

				_logger.info('Checking for submodules')
				_, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory)
				if commitsWithSubmodules:
					_logger.warning('Submodules found but extraction not supported')

				# Ensure that all commits and tags included in the temporary metadata made it into the pack, else data may be lost!
				indexCommitsAndTags = {oid for oid, otype in indexObjects.items() if otype in ('commit', 'tag')}
				if newCommitsAndTags - indexCommitsAndTags != set():
					raise RuntimeError('Bundle does not contain all commits/tags that were written to temporary metadata, aborting due to data loss risk')
				for oid, otype in indexObjects.items():
					if oid in newCommitsAndTags:
						# Already added to metadata earlier
						continue
					metadata.append('Object', f'{oid} {otype}')

				# Bundling completed without issues; wait for depended-on bundles, add them to the metadata, then replace own temporary metadata
				if self._storage:
					self._storage.wait_temporary_metadata(tmpMetadataDependencies)
					with self._storage.lock():
						criteria = [('Module', type(self).name), ('Root commit', tuple(rootCommits)), ('Object', tuple(baseInProgressObjects))]
						missingObjects = baseInProgressObjects.copy()
						for oldBundleName, oldObjects, oldOids in self._find_storage_bundles(criteria, {value.split(' ', 1)[0] for value in baseInProgressObjects}):
							metadata.append('Based on bundle', oldBundleName)
							baseOids |= oldOids
							missingObjects -= oldObjects

						# Verification: all commit/tag objects collected from temporary metadata must be covered
						if missingObjects:
							raise RuntimeError('Resolved temporary metadata bundles do not cover all expected objects')

						# Verification: all objects in the clone are either in a base bundle or in the index
						# This can only be done here because all oids are needed, not just the commit/tag objects
						if objects.keys() - (baseOids | indexObjectIds) != set():
							raise RuntimeError('Object mismatch between clone and bundles')

						self._storage.replace_temporary_metadata(temporaryMetadataName, bundle, metadata)
			except BaseException:
				# Attempt to remove the temporary metadata, then reraise.
				# BaseException (equivalent to a bare `except:`) so that e.g. KeyboardInterrupt also cleans up.
				if self._storage:
					with self._storage.lock():
						self._storage.remove_temporary_metadata(temporaryMetadataName)
				raise

		return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])

	def __repr__(self):
		return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'
|