import codearchiver.core
import codearchiver.subprocess
import datetime
import functools
import hashlib
import itertools
import logging
import os
import shutil
import subprocess
import tempfile


_logger = logging.getLogger(__name__)


class _HashingFileReader:
	'''A tiny wrapper around a file-like object which calculates the hash of the file as it is being read.'''
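	# Illustrative use (hypothetical consumer, not part of the archival flow):
	#   reader = _HashingFileReader(fp, skipStart = 12, skipEnd = 20)
	#   while (chunk := reader.read(65536)):
	#       consume(chunk)  # receives the full stream, including the skipped bytes
	#   digest = reader.digest()  # hash of everything except the first 12 and last 20 bytes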

	def __init__(self, fp, hasher = hashlib.sha1, skipStart = 0, skipEnd = 0):
		self._fp = fp
		self._hasher = hasher()
		self._skipStart = skipStart
		self._skipEnd = skipEnd
		self._buf = b''

	def read(self, n):
		data = self._fp.read(n)
		if self._skipStart > 0:
			# Requires that the first read returns at least skipStart + skipEnd bytes because it otherwise gets very complicated.
			# Yes, this fails if the file is smaller than that, but given what it's used for, that's not an issue.
			if len(data) < self._skipStart + self._skipEnd:
				raise ValueError(f'first read is required to return at least {self._skipStart + self._skipEnd} bytes')
			start = self._skipStart
			self._skipStart = 0
		else:
			start = 0
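		# Carry the last skipEnd bytes seen so far in self._buf; a byte is hashed
		# only once a later read shows that it is not part of the trailing skipped region.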
		bufPlusData = self._buf + data
		if self._skipEnd > 0:
			self._buf = bufPlusData[-self._skipEnd:]
			end = -self._skipEnd
		else:
			end = None
		self._hasher.update(bufPlusData[start:end])
		return data

	def digest(self):
		if self._skipStart > 0 or len(self._buf) != self._skipEnd:
			raise ValueError('data skipping failed')
		return self._hasher.digest()


class GitMetadata(codearchiver.core.Metadata):
	fields = (
		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
		codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
		codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
		codearchiver.core.MetadataField(key = 'Head', required = True, repeatable = False),
		codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
		codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True),
	)
	version = 0
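	# 'Object' values are '<oid> <otype>' lines as written by Git.process below;
	# they record exactly which objects a bundle contains.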


class Git(codearchiver.core.Module):
	name = 'git'
	MetadataClass = GitMetadata

	@staticmethod
	def matches(inputUrl):
		return inputUrl.url.endswith('.git')

	def __init__(self, *args, extraBranches = {}, **kwargs):
		super().__init__(*args, **kwargs)
		self._extraBranches = extraBranches

	def _find_storage_bundles(self, criteria, checkOids, temporary = False):
		'''Search `self._storage` for bundles or temporary metadata matching `criteria` and containing at least one element of `checkOids`. Yields tuples `(name, objects, oids)`.'''
		searchMethod = self._storage.search_metadata if not temporary else self._storage.search_temporary_metadata
		openMethod = self._storage.open_metadata if not temporary else self._storage.open_temporary_metadata
		for oldBundle in searchMethod(criteria):
			_logger.info(f'Matching bundle: {oldBundle!r}')
			with openMethod(oldBundle) as fp:
				idx = GitMetadata.deserialise(fp)
			isMatch = False
			oldObjects = set()  # 'oid otype' lines of all objects in this bundle
			oldOids = set()  # IDs of all objects in this bundle
			for key, value in idx:
				if key != 'Object':
					continue
				oid, otype = value.split(' ', 1)
				oldObjects.add(value)
				oldOids.add(oid)
				if otype not in ('commit', 'tag'):
					continue
				if not isMatch and oid in checkOids:
					isMatch = True
			if isMatch:
				yield (oldBundle, oldObjects, oldOids)

	def process(self):
		with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.git.', dir = os.getcwd()) as directory:
			bundle = f'{self._id}_git.bundle'
			if os.path.exists(bundle):
				_logger.fatal(f'{bundle!r} already exists')
				raise FileExistsError(f'{bundle!r} already exists')

			_, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version'])
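			# Expected output is e.g. 'git version 2.39.2\n'; validate the format strictly before extracting the bare version number.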
			if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '':
				raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}')
			gitVersion = gitVersion[12:-1]

			_logger.info(f'Cloning {self._url} into {directory}')
			startTime = datetime.datetime.utcnow()
			codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'})

			if self._extraBranches:
				for branch, commit in self._extraBranches.items():
					_logger.info(f'Fetching commit {commit} as {branch}')
					r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False)
					if r == 0:
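						# The trailing empty <old-oid> argument makes update-ref fail if the ref already exists (cf. git-update-ref(1)).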
						r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False)
						if r2 != 0:
							_logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}')
					else:
						_logger.error(f'Failed to fetch {commit}')
				# This leaves behind a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
			endTime = datetime.datetime.utcnow()

			_logger.info('Collecting repository metadata')
			_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
			refs = list(map(str.strip, refs.splitlines()))
			_, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory)
			rootCommits = list(filter(None, rootCommits.splitlines()))
			_, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory)
			objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}
			with open(os.path.join(directory, 'HEAD'), 'r') as fp:
				head = fp.read()
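			# HEAD of a fresh mirror clone is expected to be a symbolic ref, e.g. 'ref: refs/heads/main\n'.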
			if not head.startswith('ref: refs/heads/') or not head.endswith('\n'):
				raise RuntimeError(f'Unexpected HEAD content: {head!r}')
			head = head[:-1]  # Remove trailing \n

			metadata = self.create_metadata(bundle, startTime, endTime)
			metadata.append('Git version', gitVersion)
			for line in refs:
				metadata.append('Ref', line)
			metadata.append('Head', head)
			for commitId in rootCommits:
				metadata.append('Root commit', commitId)

			# Check whether there are relevant prior bundles to create an incremental one
			commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
			tmpMetadataDependencies = []  # temporary metadata names this depends on, to be resolved later
			baseOids = set()  # all oids this depends on (including temporary metadata, but only commits and tags from there)
			baseInProgressObjects = set()  # 'oid otype' lines for finding the bundles at the end
			newCommitsAndTags = set()  # oids of commits and tags not covered in previous bundles or existing temporary metadata
			temporaryMetadataName = None
			if self._storage:
				_logger.info('Checking for previous bundles')

				# A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
				# In other words, in the general case, this is a set cover problem with I = S ∩ (B0 ∪ … ∪ Bn) as the universe and the Bi ∩ I as the subsets.
				# Fortunately, solving the actual set cover problem is not necessary.
				# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
				# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.

				# To support parallel archival of related repositories, this reads other processes' temporary metadata from storage and writes its own temporary metadata there.
				# First, obtain all relevant prior bundles.
				# Second, obtain all relevant temporary metadata. Make a note of these and also exclude their commits from this bundle. Write own temporary metadata.
				# Third, upon completion (below), wait for the depended-on temporary metadata to disappear, search for the corresponding bundles, and finalise own metadata.

				with self._storage.lock():
					for oldBundleName, oldObjects, oldOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags):
						metadata.append('Based on bundle', oldBundleName)
						baseOids |= oldOids
					for tmpMetadataName, tmpObjects, tmpOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags, temporary = True):
						tmpMetadataDependencies.append(tmpMetadataName)
						baseOids |= tmpOids
						baseInProgressObjects |= tmpObjects

					newCommitsAndTags = commitsAndTags - baseOids
					for oid in newCommitsAndTags:
						metadata.append('Object', f'{oid} {objects[oid]}')
					temporaryMetadataName = self._storage.add_temporary_metadata(metadata)

			try:
				_logger.info(f'Bundling into {bundle}')
				cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all']
				objectsToExclude = baseOids & commitsAndTags
				del commitsAndTags
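				# With --stdin, git bundle reads additional rev-list arguments from standard input; '^<oid>' entries exclude already-archived history and become the bundle's prerequisites.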
				input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii')
				del objectsToExclude
				status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False)
				del input
				if status == 128 and (stderr == 'fatal: Refusing to create empty bundle.\n' or stderr.endswith('\nfatal: Refusing to create empty bundle.\n')):
					# Manually write an empty bundle instead
					# Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
					_logger.info('Writing empty bundle directly instead')
					with open(bundle, 'xb') as fp:
						fp.write(b'# v2 git bundle\n')  # bundle signature
						fp.write(b'\n')  # bundle end of prerequisites and refs
						packdata = b'PACK'  # pack signature
						packdata += b'\0\0\0\x02'  # pack version
						packdata += b'\0\0\0\0'  # pack number of objects
						fp.write(packdata)
						fp.write(hashlib.sha1(packdata).digest())  # pack checksum trailer
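					# The result is a fixed 49-byte file: 16-byte signature line, 1-byte blank line, 12-byte pack header, 20-byte SHA-1 trailer.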
				elif status != 0:
					raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')

				_logger.info('Indexing bundle')

				# The bundle's packfile might contain deltified objects.
				# Those cannot be run through `index-pack` without using `--fix-thin` and running it in a repo containing the missing objects.
				# However, `--fix-thin` has the side effect that it appends those missing objects to the output pack as well, so they also show up in the `show-index` output afterwards.
				# The fact that this always appends is undocumented, so it can't simply be relied on.
				# So this does the following:
				# - Index the pack; as the data is being read, calculate a hash of the pack data without header and trailer checksum.
				# - Verify that the corresponding bytes from the index-pack output file have the same hash.
				# - Read the index and filter out any entries that lie beyond the size of the bundle packfile minus 20 (trailer checksum).
				# This gets an accurate index of exactly which objects are in this pack; some might depend on data from other bundles, but that's fine.
				# The extra copy to disk is unfortunately unavoidable anyway, but this hash verification at least makes it somewhat valuable.

				# Index with inline hash calculation
				bundleSize = os.path.getsize(bundle)
				with open(bundle, 'rb') as fpin:
					# Skip over header
					for line in fpin:
						if line == b'\n':
							break
					packOffset = fpin.tell()
					hashWrapper = _HashingFileReader(fpin, skipStart = 12, skipEnd = 20)
					codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', '--fix-thin', '--stdin', '../tmp.pack'], input = hashWrapper, cwd = directory)
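					# Size of the pack data between the 12-byte pack header and the 20-byte trailer checksum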
					bundlePackSize = bundleSize - packOffset - 12 - 20
					bundlePackHash = hashWrapper.digest()
				# Verify hash of first part of the index-pack output pack
				with open('tmp.pack', 'rb') as fp:
					fp.seek(12)  # Header
					indexPackRead = 0
					hasher = hashlib.sha1()
					while indexPackRead < bundlePackSize:
						data = fp.read(min(bundlePackSize - indexPackRead, 1048576))
						if not data:
							# Unexpected EOF; bail out and let the size check below report the mismatch.
							break
						indexPackRead += len(data)
						hasher.update(data)
					indexPackHash = hasher.digest()
				if bundlePackHash != indexPackHash or indexPackRead != bundlePackSize:
					raise RuntimeError(f'index pack hash comparison failed: expected {bundlePackHash!r} (size: {bundlePackSize}), got {indexPackHash!r} (size: {indexPackRead})')
				# Parse index
				with open('tmp.idx', 'rb') as fp:
					_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
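				# show-index lines look like '<offset> <oid> (<crc32>)'; offsets are relative to the pack start, so entries appended by --fix-thin lie beyond the original pack data.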
				indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.rstrip('\n').split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize}
				del index
				try:
					indexObjects = {oid: objects[oid] for oid in indexObjectIds}
				except KeyError as e:
					# This should never happen since the bundle is created from the clone with exclusions...
					raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e
				os.remove('tmp.pack')
				os.remove('tmp.idx')

				_logger.info('Checking for submodules')
				_, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory)
				if commitsWithSubmodules:
					_logger.warning('Submodules found but extraction not supported')
				del commitsWithSubmodules

				# Ensure that all commits and tags included in the temporary metadata made it into the pack, else data may be lost!
				indexCommitsAndTags = {oid for oid, otype in indexObjects.items() if otype in ('commit', 'tag')}
				if newCommitsAndTags - indexCommitsAndTags != set():
					raise RuntimeError('Bundle does not contain all commits/tags that were written to temporary metadata, aborting due to data loss risk')
				for oid, otype in indexObjects.items():
					if oid in newCommitsAndTags:
						# Already added to metadata earlier
						continue
					metadata.append('Object', f'{oid} {otype}')
				del indexObjects, indexCommitsAndTags

				# Bundling completed without issues; wait for depended-on bundles, add them to the metadata, then replace own temporary metadata
				if self._storage:
					if tmpMetadataDependencies:
						self._storage.wait_temporary_metadata(tmpMetadataDependencies)
					with self._storage.lock():
						if tmpMetadataDependencies:
							criteria = [('Module', type(self).name), ('Root commit', tuple(rootCommits)), ('Object', tuple(baseInProgressObjects))]
							missingObjects = baseInProgressObjects.copy()
							for oldBundleName, oldObjects, oldOids in self._find_storage_bundles(criteria, {value.split(' ', 1)[0] for value in baseInProgressObjects}):
								metadata.append('Based on bundle', oldBundleName)
								baseOids |= oldOids
								missingObjects -= oldObjects

							# Verification: all commit/tag objects collected from temporary metadata must be covered
							if missingObjects:
								raise RuntimeError('Resolved temporary metadata bundles do not cover all expected objects')

						# Verification: all objects in the clone are either in a base bundle or in the index
						# This can only be done here because all oids are needed, not just the commit/tag objects
						if objects.keys() - (baseOids | indexObjectIds) != set():
							raise RuntimeError('Object mismatch between clone and bundles')

						self._storage.replace_temporary_metadata(temporaryMetadataName, bundle, metadata)
			except:
				# Attempt to remove the temporary metadata, then reraise
				if self._storage:
					with self._storage.lock():
						self._storage.remove_temporary_metadata(temporaryMetadataName)
				raise

		return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])

	def __repr__(self):
		return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'