|
|
@@ -5,9 +5,10 @@ import functools |
|
|
|
import hashlib |
|
|
|
import itertools |
|
|
|
import logging |
|
|
|
import os.path |
|
|
|
import os |
|
|
|
import shutil |
|
|
|
import subprocess |
|
|
|
import tempfile |
|
|
|
|
|
|
|
|
|
|
|
_logger = logging.getLogger(__name__) |
|
|
@@ -38,135 +39,129 @@ class Git(codearchiver.core.Module): |
|
|
|
self._extraBranches = extraBranches |
|
|
|
|
|
|
|
def process(self): |
|
|
|
directory = self._url.rsplit('/', 1)[1] |
|
|
|
if os.path.exists(directory): |
|
|
|
_logger.fatal(f'{directory!r} already exists') |
|
|
|
raise FileExistsError(f'{directory!r} already exists') |
|
|
|
bundle = f'{self._id}_git.bundle' |
|
|
|
if os.path.exists(bundle): |
|
|
|
_logger.fatal(f'{bundle!r} already exists') |
|
|
|
raise FileExistsError(f'{bundle!r} already exists') |
|
|
|
|
|
|
|
_, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version']) |
|
|
|
if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '': |
|
|
|
raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}') |
|
|
|
gitVersion = gitVersion[12:-1] |
|
|
|
|
|
|
|
_logger.info(f'Cloning {self._url} into {directory}') |
|
|
|
startTime = datetime.datetime.utcnow() |
|
|
|
codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'}) |
|
|
|
|
|
|
|
if self._extraBranches: |
|
|
|
for branch, commit in self._extraBranches.items(): |
|
|
|
_logger.info(f'Fetching commit {commit} as {branch}') |
|
|
|
r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False) |
|
|
|
if r == 0: |
|
|
|
r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False) |
|
|
|
if r2 != 0: |
|
|
|
_logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}') |
|
|
|
else: |
|
|
|
_logger.error(f'Failed to fetch {commit}') |
|
|
|
# This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored. |
|
|
|
endTime = datetime.datetime.utcnow() |
|
|
|
|
|
|
|
_logger.info('Collecting repository metadata') |
|
|
|
_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) |
|
|
|
refs = list(map(str.strip, refs.splitlines())) |
|
|
|
_, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory) |
|
|
|
rootCommits = list(filter(None, rootCommits.splitlines())) |
|
|
|
_, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory) |
|
|
|
objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())} |
|
|
|
with open(os.path.join(directory, 'HEAD'), 'r') as fp: |
|
|
|
head = fp.read() |
|
|
|
if not head.startswith('ref: refs/heads/') or not head.endswith('\n'): |
|
|
|
raise RuntimeError(f'Unexpected HEAD content: {head!r}') |
|
|
|
head = head[:-1] # Remove trailing \n |
|
|
|
|
|
|
|
# Check whether there are relevant prior bundles to create an incremental one |
|
|
|
commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')} |
|
|
|
basedOnBundles = {} # dict to keep the order |
|
|
|
baseBundleObjects = set() |
|
|
|
if self._storage: |
|
|
|
_logger.info('Checking for previous bundles') |
|
|
|
|
|
|
|
# A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S. |
|
|
|
# In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn} as the universe and Bi ∩ I as the subsets. |
|
|
|
# Fortunately, solving the actual set cover problem is not necessary. |
|
|
|
# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.) |
|
|
|
# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency. |
|
|
|
|
|
|
|
for oldBundle in self._storage.search_metadata([('Module', type(self).name), ('Root commit', tuple(rootCommits))]): |
|
|
|
_logger.info(f'Previous bundle: {oldBundle!r}') |
|
|
|
with self._storage.open_metadata(oldBundle) as fp: |
|
|
|
idx = GitMetadata.deserialise(fp) |
|
|
|
isMatch = False |
|
|
|
oldObjects = set() # commit and tag IDs in this bundle |
|
|
|
for key, value in idx: |
|
|
|
if key != 'Object': |
|
|
|
continue |
|
|
|
oid, otype = value.split(' ', 1) |
|
|
|
oldObjects.add(oid) |
|
|
|
if otype not in ('commit', 'tag'): |
|
|
|
continue |
|
|
|
if not isMatch and oid in commitsAndTags: |
|
|
|
isMatch = True |
|
|
|
if isMatch: |
|
|
|
basedOnBundles[oldBundle] = True |
|
|
|
baseBundleObjects |= oldObjects |
|
|
|
|
|
|
|
_logger.info(f'Bundling into {bundle}') |
|
|
|
cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'] |
|
|
|
objectsToExclude = baseBundleObjects & commitsAndTags |
|
|
|
input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii') |
|
|
|
status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False) |
|
|
|
if status == 128 and (stderr == 'fatal: Refusing to create empty bundle.\n' or stderr.endswith('\nfatal: Refusing to create empty bundle.\n')): |
|
|
|
# Manually write an empty bundle instead |
|
|
|
# Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats |
|
|
|
_logger.info('Writing empty bundle directly instead') |
|
|
|
with open(bundle, 'xb') as fp: |
|
|
|
fp.write(b'# v2 git bundle\n') # bundle signature |
|
|
|
fp.write(b'\n') # bundle end of prerequisites and refs |
|
|
|
packdata = b'PACK' # pack signature |
|
|
|
packdata += b'\0\0\0\x02' # pack version |
|
|
|
packdata += b'\0\0\0\0' # pack number of objects |
|
|
|
fp.write(packdata) |
|
|
|
fp.write(hashlib.sha1(packdata).digest()) # pack checksum trailer |
|
|
|
elif status != 0: |
|
|
|
raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.') |
|
|
|
|
|
|
|
_logger.info('Indexing bundle') |
|
|
|
# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway. |
|
|
|
# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it. |
|
|
|
with open(bundle, 'rb') as fpin: |
|
|
|
# Skip over header |
|
|
|
for line in fpin: |
|
|
|
if line == b'\n': |
|
|
|
break |
|
|
|
# Copy remainder (= packfile) to tmp.pack |
|
|
|
with open('tmp.pack', 'xb') as fpout: |
|
|
|
shutil.copyfileobj(fpin, fpout) |
|
|
|
codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack']) |
|
|
|
with open('tmp.idx', 'rb') as fp: |
|
|
|
_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp) |
|
|
|
indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()} |
|
|
|
try: |
|
|
|
indexObjects = {oid: objects[oid] for oid in indexObjectIds} |
|
|
|
except KeyError as e: |
|
|
|
# This should never happen since the bundle is created from the clone with exclusions... |
|
|
|
raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e |
|
|
|
if objects.keys() - (baseBundleObjects | indexObjectIds) != set(): |
|
|
|
# If there is at least one object in the clone that is not in the base bundles or the bundle index... |
|
|
|
raise RuntimeError('Object mismatch between clone and bundles') |
|
|
|
os.remove('tmp.pack') |
|
|
|
os.remove('tmp.idx') |
|
|
|
|
|
|
|
_logger.info('Checking for submodules') |
|
|
|
_, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory) |
|
|
|
if commitsWithSubmodules: |
|
|
|
_logger.warning('Submodules found but extraction not supported') |
|
|
|
|
|
|
|
_logger.info(f'Removing clone') |
|
|
|
shutil.rmtree(directory) |
|
|
|
with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.git.', dir = os.getcwd()) as directory: |
|
|
|
bundle = f'{self._id}_git.bundle' |
|
|
|
if os.path.exists(bundle): |
|
|
|
_logger.fatal(f'{bundle!r} already exists') |
|
|
|
raise FileExistsError(f'{bundle!r} already exists') |
|
|
|
|
|
|
|
_, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version']) |
|
|
|
if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '': |
|
|
|
raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}') |
|
|
|
gitVersion = gitVersion[12:-1] |
|
|
|
|
|
|
|
_logger.info(f'Cloning {self._url} into {directory}') |
|
|
|
startTime = datetime.datetime.utcnow() |
|
|
|
codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'}) |
|
|
|
|
|
|
|
if self._extraBranches: |
|
|
|
for branch, commit in self._extraBranches.items(): |
|
|
|
_logger.info(f'Fetching commit {commit} as {branch}') |
|
|
|
r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False) |
|
|
|
if r == 0: |
|
|
|
r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False) |
|
|
|
if r2 != 0: |
|
|
|
_logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}') |
|
|
|
else: |
|
|
|
_logger.error(f'Failed to fetch {commit}') |
|
|
|
# This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored. |
|
|
|
endTime = datetime.datetime.utcnow() |
|
|
|
|
|
|
|
_logger.info('Collecting repository metadata') |
|
|
|
_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) |
|
|
|
refs = list(map(str.strip, refs.splitlines())) |
|
|
|
_, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory) |
|
|
|
rootCommits = list(filter(None, rootCommits.splitlines())) |
|
|
|
_, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory) |
|
|
|
objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())} |
|
|
|
with open(os.path.join(directory, 'HEAD'), 'r') as fp: |
|
|
|
head = fp.read() |
|
|
|
if not head.startswith('ref: refs/heads/') or not head.endswith('\n'): |
|
|
|
raise RuntimeError(f'Unexpected HEAD content: {head!r}') |
|
|
|
head = head[:-1] # Remove trailing \n |
|
|
|
|
|
|
|
# Check whether there are relevant prior bundles to create an incremental one |
|
|
|
commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')} |
|
|
|
basedOnBundles = {} # dict to keep the order |
|
|
|
baseBundleObjects = set() |
|
|
|
if self._storage: |
|
|
|
_logger.info('Checking for previous bundles') |
|
|
|
|
|
|
|
# A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S. |
|
|
|
# In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn} as the universe and Bi ∩ I as the subsets. |
|
|
|
# Fortunately, solving the actual set cover problem is not necessary. |
|
|
|
# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.) |
|
|
|
# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency. |
|
|
|
|
|
|
|
for oldBundle in self._storage.search_metadata([('Module', type(self).name), ('Root commit', tuple(rootCommits))]): |
|
|
|
_logger.info(f'Previous bundle: {oldBundle!r}') |
|
|
|
with self._storage.open_metadata(oldBundle) as fp: |
|
|
|
idx = GitMetadata.deserialise(fp) |
|
|
|
isMatch = False |
|
|
|
oldObjects = set() # commit and tag IDs in this bundle |
|
|
|
for key, value in idx: |
|
|
|
if key != 'Object': |
|
|
|
continue |
|
|
|
oid, otype = value.split(' ', 1) |
|
|
|
oldObjects.add(oid) |
|
|
|
if otype not in ('commit', 'tag'): |
|
|
|
continue |
|
|
|
if not isMatch and oid in commitsAndTags: |
|
|
|
isMatch = True |
|
|
|
if isMatch: |
|
|
|
basedOnBundles[oldBundle] = True |
|
|
|
baseBundleObjects |= oldObjects |
|
|
|
|
|
|
|
_logger.info(f'Bundling into {bundle}') |
|
|
|
cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'] |
|
|
|
objectsToExclude = baseBundleObjects & commitsAndTags |
|
|
|
input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii') |
|
|
|
status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False) |
|
|
|
if status == 128 and (stderr == 'fatal: Refusing to create empty bundle.\n' or stderr.endswith('\nfatal: Refusing to create empty bundle.\n')): |
|
|
|
# Manually write an empty bundle instead |
|
|
|
# Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats |
|
|
|
_logger.info('Writing empty bundle directly instead') |
|
|
|
with open(bundle, 'xb') as fp: |
|
|
|
fp.write(b'# v2 git bundle\n') # bundle signature |
|
|
|
fp.write(b'\n') # bundle end of prerequisites and refs |
|
|
|
packdata = b'PACK' # pack signature |
|
|
|
packdata += b'\0\0\0\x02' # pack version |
|
|
|
packdata += b'\0\0\0\0' # pack number of objects |
|
|
|
fp.write(packdata) |
|
|
|
fp.write(hashlib.sha1(packdata).digest()) # pack checksum trailer |
|
|
|
elif status != 0: |
|
|
|
raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.') |
|
|
|
|
|
|
|
_logger.info('Indexing bundle') |
|
|
|
# Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway. |
|
|
|
# So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it. |
|
|
|
with open(bundle, 'rb') as fpin: |
|
|
|
# Skip over header |
|
|
|
for line in fpin: |
|
|
|
if line == b'\n': |
|
|
|
break |
|
|
|
# Copy remainder (= packfile) to tmp.pack |
|
|
|
with open('tmp.pack', 'xb') as fpout: |
|
|
|
shutil.copyfileobj(fpin, fpout) |
|
|
|
codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack']) |
|
|
|
with open('tmp.idx', 'rb') as fp: |
|
|
|
_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp) |
|
|
|
indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()} |
|
|
|
try: |
|
|
|
indexObjects = {oid: objects[oid] for oid in indexObjectIds} |
|
|
|
except KeyError as e: |
|
|
|
# This should never happen since the bundle is created from the clone with exclusions... |
|
|
|
raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e |
|
|
|
if objects.keys() - (baseBundleObjects | indexObjectIds) != set(): |
|
|
|
# If there is at least one object in the clone that is not in the base bundles or the bundle index... |
|
|
|
raise RuntimeError('Object mismatch between clone and bundles') |
|
|
|
os.remove('tmp.pack') |
|
|
|
os.remove('tmp.idx') |
|
|
|
|
|
|
|
_logger.info('Checking for submodules') |
|
|
|
_, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory) |
|
|
|
if commitsWithSubmodules: |
|
|
|
_logger.warning('Submodules found but extraction not supported') |
|
|
|
|
|
|
|
metadata = self.create_metadata(bundle, startTime, endTime) |
|
|
|
metadata.append('Git version', gitVersion) |
|
|
|