A VCS repository archival tool
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

187 lines
8.9 KiB

  1. import codearchiver.core
  2. import codearchiver.subprocess
  3. import datetime
  4. import functools
  5. import hashlib
  6. import itertools
  7. import logging
  8. import os.path
  9. import shutil
  10. import subprocess
# Module-level logger named after this module, so log records can be filtered per codearchiver module.
_logger = logging.getLogger(__name__)
class GitMetadata(codearchiver.core.Metadata):
	# Metadata schema for git bundles produced by the `Git` module in this file.
	fields = (
		# Version string from `git --version` (digits and dots only), recorded once per bundle.
		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
		# Names of earlier bundles an incremental bundle depends on; absent for a full (non-incremental) bundle.
		codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
		# One entry per line of `git show-ref` output from the clone.
		codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
		# Contents of the clone's HEAD file (e.g. 'ref: refs/heads/main'), without the trailing newline.
		codearchiver.core.MetadataField(key = 'Head', required = True, repeatable = False),
		# Parentless commits from `git rev-list --max-parents=0 --all`; used to locate related prior bundles in storage.
		codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
		# 'OID TYPE' pairs for every object contained in this bundle's packfile.
		codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True),
	)
	# Version of this metadata schema.
	version = 0
  22. class Git(codearchiver.core.Module):
  23. name = 'git'
  24. MetadataClass = GitMetadata
  25. @staticmethod
  26. def matches(inputUrl):
  27. return inputUrl.url.endswith('.git')
  28. def __init__(self, *args, extraBranches = {}, **kwargs):
  29. super().__init__(*args, **kwargs)
  30. self._extraBranches = extraBranches
  31. def process(self):
  32. directory = self._url.rsplit('/', 1)[1]
  33. if os.path.exists(directory):
  34. _logger.fatal(f'{directory!r} already exists')
  35. raise FileExistsError(f'{directory!r} already exists')
  36. bundle = f'{self._id}_git.bundle'
  37. if os.path.exists(bundle):
  38. _logger.fatal(f'{bundle!r} already exists')
  39. raise FileExistsError(f'{bundle!r} already exists')
  40. _, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version'])
  41. if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '':
  42. raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}')
  43. gitVersion = gitVersion[12:-1]
  44. _logger.info(f'Cloning {self._url} into {directory}')
  45. startTime = datetime.datetime.utcnow()
  46. codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'})
  47. if self._extraBranches:
  48. for branch, commit in self._extraBranches.items():
  49. _logger.info(f'Fetching commit {commit} as {branch}')
  50. r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False)
  51. if r == 0:
  52. r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False)
  53. if r2 != 0:
  54. _logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}')
  55. else:
  56. _logger.error(f'Failed to fetch {commit}')
  57. # This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
  58. endTime = datetime.datetime.utcnow()
  59. _logger.info('Collecting repository metadata')
  60. _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
  61. refs = list(map(str.strip, refs.splitlines()))
  62. _, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory)
  63. rootCommits = list(filter(None, rootCommits.splitlines()))
  64. _, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory)
  65. objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}
  66. with open(os.path.join(directory, 'HEAD'), 'r') as fp:
  67. head = fp.read()
  68. if not head.startswith('ref: refs/heads/') or not head.endswith('\n'):
  69. raise RuntimeError(f'Unexpected HEAD content: {head!r}')
  70. head = head[:-1] # Remove trailing \n
  71. # Check whether there are relevant prior bundles to create an incremental one
  72. commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
  73. basedOnBundles = {} # dict to keep the order
  74. baseBundleObjects = set()
  75. if self._storage:
  76. _logger.info('Checking for previous bundles')
  77. # A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
  78. # In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn} as the universe and Bi ∩ I as the subsets.
  79. # Fortunately, solving the actual set cover problem is not necessary.
  80. # This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
  81. # Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.
  82. for oldBundle in self._storage.search_metadata([('Module', type(self).name), ('Root commit', tuple(rootCommits))]):
  83. _logger.info(f'Previous bundle: {oldBundle!r}')
  84. with self._storage.open_metadata(oldBundle) as fp:
  85. idx = GitMetadata.deserialise(fp)
  86. isMatch = False
  87. oldObjects = set() # commit and tag IDs in this bundle
  88. for key, value in idx:
  89. if key != 'Object':
  90. continue
  91. oid, otype = value.split(' ', 1)
  92. oldObjects.add(oid)
  93. if otype not in ('commit', 'tag'):
  94. continue
  95. if not isMatch and oid in commitsAndTags:
  96. isMatch = True
  97. if isMatch:
  98. basedOnBundles[oldBundle] = True
  99. baseBundleObjects |= oldObjects
  100. _logger.info(f'Bundling into {bundle}')
  101. cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all']
  102. objectsToExclude = baseBundleObjects & commitsAndTags
  103. input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii')
  104. status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False)
  105. if status == 128 and (stderr == 'fatal: Refusing to create empty bundle.\n' or stderr.endswith('\nfatal: Refusing to create empty bundle.\n')):
  106. # Manually write an empty bundle instead
  107. # Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
  108. _logger.info('Writing empty bundle directly instead')
  109. with open(bundle, 'xb') as fp:
  110. fp.write(b'# v2 git bundle\n') # bundle signature
  111. fp.write(b'\n') # bundle end of prerequisites and refs
  112. packdata = b'PACK' # pack signature
  113. packdata += b'\0\0\0\x02' # pack version
  114. packdata += b'\0\0\0\0' # pack number of objects
  115. fp.write(packdata)
  116. fp.write(hashlib.sha1(packdata).digest()) # pack checksum trailer
  117. elif status != 0:
  118. raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')
  119. _logger.info('Indexing bundle')
  120. # Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
  121. # So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
  122. with open(bundle, 'rb') as fpin:
  123. # Skip over header
  124. for line in fpin:
  125. if line == b'\n':
  126. break
  127. # Copy remainder (= packfile) to tmp.pack
  128. with open('tmp.pack', 'xb') as fpout:
  129. shutil.copyfileobj(fpin, fpout)
  130. codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
  131. with open('tmp.idx', 'rb') as fp:
  132. _, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
  133. indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
  134. try:
  135. indexObjects = {oid: objects[oid] for oid in indexObjectIds}
  136. except KeyError as e:
  137. # This should never happen since the bundle is created from the clone with exclusions...
  138. raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e
  139. if objects.keys() - (baseBundleObjects | indexObjectIds) != set():
  140. # If there is at least one object in the clone that is not in the base bundles or the bundle index...
  141. raise RuntimeError('Object mismatch between clone and bundles')
  142. os.remove('tmp.pack')
  143. os.remove('tmp.idx')
  144. _logger.info('Checking for submodules')
  145. _, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory)
  146. if commitsWithSubmodules:
  147. _logger.warning('Submodules found but extraction not supported')
  148. _logger.info(f'Removing clone')
  149. shutil.rmtree(directory)
  150. metadata = self.create_metadata(bundle, startTime, endTime)
  151. metadata.append('Git version', gitVersion)
  152. for oldBundle in basedOnBundles:
  153. metadata.append('Based on bundle', oldBundle)
  154. for line in refs:
  155. metadata.append('Ref', line)
  156. metadata.append('Head', head)
  157. for commitId in rootCommits:
  158. metadata.append('Root commit', commitId)
  159. for oid, otype in indexObjects.items():
  160. metadata.append('Object', f'{oid} {otype}')
  161. return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])
  162. def __repr__(self):
  163. return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'