A VCS repository archival tool
Non puoi selezionare più di 25 argomenti Gli argomenti devono iniziare con una lettera o un numero, possono includere trattini ('-') e possono essere lunghi fino a 35 caratteri.

180 righe
8.5 KiB

  1. import codearchiver.core
  2. import codearchiver.subprocess
  3. import datetime
  4. import functools
  5. import hashlib
  6. import itertools
  7. import logging
  8. import os.path
  9. import shutil
  10. import subprocess
# Module-level logger named after this module; used for all progress and error reporting below.
_logger = logging.getLogger(__name__)
  12. class GitMetadata(codearchiver.core.Metadata):
  13. fields = (
  14. codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
  15. codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
  16. codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
  17. codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
  18. codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True),
  19. )
  20. version = 0
  21. class Git(codearchiver.core.Module):
  22. name = 'git'
  23. MetadataClass = GitMetadata
  24. @staticmethod
  25. def matches(inputUrl):
  26. return inputUrl.url.endswith('.git')
  27. def __init__(self, *args, extraBranches = {}, **kwargs):
  28. super().__init__(*args, **kwargs)
  29. self._extraBranches = extraBranches
  30. def process(self):
  31. directory = self._url.rsplit('/', 1)[1]
  32. if os.path.exists(directory):
  33. _logger.fatal(f'{directory!r} already exists')
  34. raise FileExistsError(f'{directory!r} already exists')
  35. bundle = f'{self._id}.bundle'
  36. if os.path.exists(bundle):
  37. _logger.fatal(f'{bundle!r} already exists')
  38. raise FileExistsError(f'{bundle!r} already exists')
  39. _, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version'])
  40. if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '':
  41. raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}')
  42. gitVersion = gitVersion[12:-1]
  43. _logger.info(f'Cloning {self._url} into {directory}')
  44. startTime = datetime.datetime.utcnow()
  45. codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'})
  46. if self._extraBranches:
  47. for branch, commit in self._extraBranches.items():
  48. _logger.info(f'Fetching commit {commit} as {branch}')
  49. r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False)
  50. if r == 0:
  51. r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False)
  52. if r2 != 0:
  53. _logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}')
  54. else:
  55. _logger.error(f'Failed to fetch {commit}')
  56. # This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
  57. endTime = datetime.datetime.utcnow()
  58. _logger.info('Collecting repository metadata')
  59. _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
  60. refs = list(map(str.strip, refs.splitlines()))
  61. _, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory)
  62. rootCommits = list(filter(None, rootCommits.splitlines()))
  63. _, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory)
  64. objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}
  65. # Check whether there are relevant prior bundles to create an incremental one
  66. commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
  67. basedOnBundles = {} # dict to keep the order
  68. baseBundleObjects = set()
  69. if self._storage:
  70. _logger.info('Checking for previous bundles')
  71. # A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
  72. # In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn} as the universe and Bi ∩ I as the subsets.
  73. # Fortunately, solving the actual set cover problem is not necessary.
  74. # This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
  75. # Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.
  76. for oldBundle in self._storage.search_metadata([('Module', type(self).name), ('Root commit', tuple(rootCommits))]):
  77. _logger.info(f'Previous bundle: {oldBundle!r}')
  78. with self._storage.open_metadata(oldBundle) as fp:
  79. idx = GitMetadata.deserialise(fp)
  80. isMatch = False
  81. oldObjects = set() # commit and tag IDs in this bundle
  82. for key, value in idx:
  83. if key != 'Object':
  84. continue
  85. oid, otype = value.split(' ', 1)
  86. oldObjects.add(oid)
  87. if otype not in ('commit', 'tag'):
  88. continue
  89. if not isMatch and oid in commitsAndTags:
  90. isMatch = True
  91. if isMatch:
  92. basedOnBundles[oldBundle] = True
  93. baseBundleObjects |= oldObjects
  94. _logger.info(f'Bundling into {bundle}')
  95. cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all']
  96. objectsToExclude = baseBundleObjects & commitsAndTags
  97. input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii')
  98. status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False)
  99. if status == 128 and stderr == 'fatal: Refusing to create empty bundle.\n':
  100. # Manually write an empty bundle instead
  101. # Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
  102. _logger.info('Writing empty bundle directly instead')
  103. with open(bundle, 'xb') as fp:
  104. fp.write(b'# v2 git bundle\n') # bundle signature
  105. fp.write(b'\n') # bundle end of prerequisites and refs
  106. packdata = b'PACK' # pack signature
  107. packdata += b'\0\0\0\x02' # pack version
  108. packdata += b'\0\0\0\0' # pack number of objects
  109. fp.write(packdata)
  110. fp.write(hashlib.sha1(packdata).digest()) # pack checksum trailer
  111. elif status != 0:
  112. raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')
  113. _logger.info('Indexing bundle')
  114. # Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
  115. # So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
  116. with open(bundle, 'rb') as fpin:
  117. # Skip over header
  118. for line in fpin:
  119. if line == b'\n':
  120. break
  121. # Copy remainder (= packfile) to tmp.pack
  122. with open('tmp.pack', 'xb') as fpout:
  123. shutil.copyfileobj(fpin, fpout)
  124. codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
  125. with open('tmp.idx', 'rb') as fp:
  126. _, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
  127. indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
  128. try:
  129. indexObjects = {oid: objects[oid] for oid in indexObjectIds}
  130. except KeyError as e:
  131. # This should never happen since the bundle is created from the clone with exclusions...
  132. raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e
  133. if objects.keys() - (baseBundleObjects | indexObjectIds) != set():
  134. # If there is at least one object in the clone that is not in the base bundles or the bundle index...
  135. raise RuntimeError('Object mismatch between clone and bundles')
  136. os.remove('tmp.pack')
  137. os.remove('tmp.idx')
  138. _logger.info('Checking for submodules')
  139. _, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory)
  140. if commitsWithSubmodules:
  141. _logger.warning('Submodules found but extraction not supported')
  142. _logger.info(f'Removing clone')
  143. shutil.rmtree(directory)
  144. metadata = self.create_metadata(bundle, startTime, endTime)
  145. metadata.append('Git version', gitVersion)
  146. for oldBundle in basedOnBundles:
  147. metadata.append('Based on bundle', oldBundle)
  148. for line in refs:
  149. metadata.append('Ref', line)
  150. for commitId in rootCommits:
  151. metadata.append('Root commit', commitId)
  152. for oid, otype in indexObjects.items():
  153. metadata.append('Object', f'{oid} {otype}')
  154. return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])
  155. def __repr__(self):
  156. return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'