A VCS repository archival tool
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

242 lines
13 KiB

  1. import codearchiver.core
  2. import codearchiver.subprocess
  3. import datetime
  4. import functools
  5. import hashlib
  6. import itertools
  7. import logging
  8. import os
  9. import shutil
  10. import subprocess
  11. import tempfile
# Module-level logger, named after this module so log output can be filtered hierarchically.
_logger = logging.getLogger(__name__)
class GitMetadata(codearchiver.core.Metadata):
	'''Metadata schema recorded for each git bundle produced by the `Git` module.'''

	fields = (
		# Version of git that produced the bundle (parsed from `git --version`)
		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
		# Names of earlier bundles this bundle builds on; absent for non-incremental bundles
		codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
		# One `git show-ref` output line per ref in the clone
		codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
		# Contents of the clone's HEAD file (symbolic ref line, trailing newline stripped)
		codearchiver.core.MetadataField(key = 'Head', required = True, repeatable = False),
		# IDs of parentless commits (`git rev-list --max-parents=0 --all`)
		codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
		# One '<oid> <type>' line per object contained in the bundle
		codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True),
	)
	# NOTE(review): presumably the schema version used by Metadata (de)serialisation — confirm against codearchiver.core.Metadata
	version = 0
  23. class Git(codearchiver.core.Module):
  24. name = 'git'
  25. MetadataClass = GitMetadata
  26. @staticmethod
  27. def matches(inputUrl):
  28. return inputUrl.url.endswith('.git')
  29. def __init__(self, *args, extraBranches = {}, **kwargs):
  30. super().__init__(*args, **kwargs)
  31. self._extraBranches = extraBranches
  32. def _find_storage_bundles(self, criteria, checkOids, temporary = False):
  33. '''Search `self._storage` for bundles or temporary metadata matching `criteria` and containing at least one element of `checkOids`. Yields tuples `(name, objects, oids)`.'''
  34. searchMethod = self._storage.search_metadata if not temporary else self._storage.search_temporary_metadata
  35. openMethod = self._storage.open_metadata if not temporary else self._storage.open_temporary_metadata
  36. matchedBundles = {} # bundle name → (objects, oids)
  37. for oldBundle in searchMethod(criteria):
  38. _logger.info(f'Matching bundle: {oldBundle!r}')
  39. with openMethod(oldBundle) as fp:
  40. idx = GitMetadata.deserialise(fp)
  41. isMatch = False
  42. oldObjects = set() # commit and tag lines in this bundle
  43. oldOids = set() # commit and tag IDs in this bundle
  44. for key, value in idx:
  45. if key != 'Object':
  46. continue
  47. oid, otype = value.split(' ', 1)
  48. oldObjects.add(value)
  49. oldOids.add(oid)
  50. if otype not in ('commit', 'tag'):
  51. continue
  52. if not isMatch and oid in checkOids:
  53. isMatch = True
  54. if isMatch:
  55. yield (oldBundle, oldObjects, oldOids)
  56. def process(self):
  57. with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.git.', dir = os.getcwd()) as directory:
  58. bundle = f'{self._id}_git.bundle'
  59. if os.path.exists(bundle):
  60. _logger.fatal(f'{bundle!r} already exists')
  61. raise FileExistsError(f'{bundle!r} already exists')
  62. _, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version'])
  63. if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '':
  64. raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}')
  65. gitVersion = gitVersion[12:-1]
  66. _logger.info(f'Cloning {self._url} into {directory}')
  67. startTime = datetime.datetime.utcnow()
  68. codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'})
  69. if self._extraBranches:
  70. for branch, commit in self._extraBranches.items():
  71. _logger.info(f'Fetching commit {commit} as {branch}')
  72. r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False)
  73. if r == 0:
  74. r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False)
  75. if r2 != 0:
  76. _logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}')
  77. else:
  78. _logger.error(f'Failed to fetch {commit}')
  79. # This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
  80. endTime = datetime.datetime.utcnow()
  81. _logger.info('Collecting repository metadata')
  82. _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
  83. refs = list(map(str.strip, refs.splitlines()))
  84. _, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory)
  85. rootCommits = list(filter(None, rootCommits.splitlines()))
  86. _, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory)
  87. objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}
  88. with open(os.path.join(directory, 'HEAD'), 'r') as fp:
  89. head = fp.read()
  90. if not head.startswith('ref: refs/heads/') or not head.endswith('\n'):
  91. raise RuntimeError(f'Unexpected HEAD content: {head!r}')
  92. head = head[:-1] # Remove trailing \n
  93. metadata = self.create_metadata(bundle, startTime, endTime)
  94. metadata.append('Git version', gitVersion)
  95. for line in refs:
  96. metadata.append('Ref', line)
  97. metadata.append('Head', head)
  98. for commitId in rootCommits:
  99. metadata.append('Root commit', commitId)
  100. # Check whether there are relevant prior bundles to create an incremental one
  101. commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
  102. tmpMetadataDependencies = [] # temporary metadata names this depends on, to be resolved later
  103. baseOids = set() # all oids this depends on (including temporary metadata, but only commits and tags from there)
  104. baseInProgressObjects = set() # 'oid otype' lines for finding the bundles at the end
  105. newCommitsAndTags = set() # oids of commits and tags not covered in previous bundles or existing temporary metadata
  106. temporaryMetadataName = None
  107. if self._storage:
  108. _logger.info('Checking for previous bundles')
  109. # A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
  110. # In other words, in the general case, this is a set cover problem between I = S ∩ (B0 ∪ … ∪ Bn} as the universe and Bi ∩ I as the subsets.
  111. # Fortunately, solving the actual set cover problem is not necessary.
  112. # This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
  113. # Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.
  114. # To support parallel archival of related repositories, this uses other processes' temporary metadata from and writes its own to storage.
  115. # First, obtain all relevant prior bundles.
  116. # Second, obtain all relevant temporary metadata. Make a note of these and also exclude their commits from this bundle. Write own temporary metadata.
  117. # Third, upon completion (below), wait for the depended-on temporary metadata to disappear, search for the corresponding bundles, and finalise own metadata.
  118. with self._storage.lock():
  119. for oldBundleName, oldObjects, oldOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags):
  120. metadata.append('Based on bundle', oldBundleName)
  121. baseOids |= oldOids
  122. for tmpMetadataName, tmpObjects, tmpOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags, temporary = True):
  123. tmpMetadataDependencies.append(tmpMetadataName)
  124. baseOids |= tmpOids
  125. baseInProgressObjects |= tmpObjects
  126. newCommitsAndTags = commitsAndTags - baseOids
  127. for oid in newCommitsAndTags:
  128. metadata.append('Object', f'{oid} {objects[oid]}')
  129. temporaryMetadataName = self._storage.add_temporary_metadata(metadata)
  130. try:
  131. _logger.info(f'Bundling into {bundle}')
  132. cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all']
  133. objectsToExclude = baseOids & commitsAndTags
  134. input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii')
  135. status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False)
  136. if status == 128 and (stderr == 'fatal: Refusing to create empty bundle.\n' or stderr.endswith('\nfatal: Refusing to create empty bundle.\n')):
  137. # Manually write an empty bundle instead
  138. # Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
  139. _logger.info('Writing empty bundle directly instead')
  140. with open(bundle, 'xb') as fp:
  141. fp.write(b'# v2 git bundle\n') # bundle signature
  142. fp.write(b'\n') # bundle end of prerequisites and refs
  143. packdata = b'PACK' # pack signature
  144. packdata += b'\0\0\0\x02' # pack version
  145. packdata += b'\0\0\0\0' # pack number of objects
  146. fp.write(packdata)
  147. fp.write(hashlib.sha1(packdata).digest()) # pack checksum trailer
  148. elif status != 0:
  149. raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')
  150. _logger.info('Indexing bundle')
  151. # Yes, this is stupid, but unfortunately, `git index-pack` can only read from stdin inside a repo and will still write the packfile to disk anyway.
  152. # So sadly, the only way here (for now) is to make a copy of the packfile and then run index-pack on it.
  153. with open(bundle, 'rb') as fpin:
  154. # Skip over header
  155. for line in fpin:
  156. if line == b'\n':
  157. break
  158. # Copy remainder (= packfile) to tmp.pack
  159. with open('tmp.pack', 'xb') as fpout:
  160. shutil.copyfileobj(fpin, fpout)
  161. codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', 'tmp.pack'])
  162. with open('tmp.idx', 'rb') as fp:
  163. _, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
  164. indexObjectIds = {l.rstrip('\n').split(' ', 2)[1] for l in index.splitlines()}
  165. try:
  166. indexObjects = {oid: objects[oid] for oid in indexObjectIds}
  167. except KeyError as e:
  168. # This should never happen since the bundle is created from the clone with exclusions...
  169. raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e
  170. os.remove('tmp.pack')
  171. os.remove('tmp.idx')
  172. _logger.info('Checking for submodules')
  173. _, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory)
  174. if commitsWithSubmodules:
  175. _logger.warning('Submodules found but extraction not supported')
  176. # Ensure that all commits and tags included in the temporary metadata made it into the pack, else data may be lost!
  177. indexCommitsAndTags = {oid for oid, otype in indexObjects.items() if otype in ('commit', 'tag')}
  178. if newCommitsAndTags - indexCommitsAndTags != set():
  179. raise RuntimeError('Bundle does not contain all commits/tags that were written to temporary metadata, aborting due to data loss risk')
  180. for oid, otype in indexObjects.items():
  181. if oid in newCommitsAndTags:
  182. # Already added to metadata earlier
  183. continue
  184. metadata.append('Object', f'{oid} {otype}')
  185. # Bundling completed without issues; wait for depended-on bundles, add them to the metadata, then replace own temporary metadata
  186. if self._storage:
  187. self._storage.wait_temporary_metadata(tmpMetadataDependencies)
  188. with self._storage.lock():
  189. criteria = [('Module', type(self).name), ('Root commit', tuple(rootCommits)), ('Object', tuple(baseInProgressObjects))]
  190. missingObjects = baseInProgressObjects.copy()
  191. for oldBundleName, oldObjects, oldOids in self._find_storage_bundles(criteria, {value.split(' ', 1)[0] for value in baseInProgressObjects}):
  192. metadata.append('Based on bundle', oldBundleName)
  193. baseOids |= oldOids
  194. missingObjects -= oldObjects
  195. # Verification: all commit/tag objects collected from temporary metadata must be covered
  196. if missingObjects:
  197. raise RuntimeError('Resolved temporary metadata bundles do not cover all expected objects')
  198. # Verification: all objects in the clone are either in a base bundle or in the index
  199. # This can only be done here because all oids are needed, not just the commit/tag objects
  200. if objects.keys() - (baseOids | indexObjectIds) != set():
  201. raise RuntimeError('Object mismatch between clone and bundles')
  202. self._storage.replace_temporary_metadata(temporaryMetadataName, bundle, metadata)
  203. except:
  204. # Attempt to remove the temporary metadata, then reraise
  205. if self._storage:
  206. with self._storage.lock():
  207. self._storage.remove_temporary_metadata(temporaryMetadataName)
  208. raise
  209. return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])
  210. def __repr__(self):
  211. return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'