A VCS repository archival tool
import codearchiver.core
import codearchiver.subprocess
import datetime
import functools
import hashlib
import itertools
import logging
import os
import shutil
import subprocess
import tempfile


_logger = logging.getLogger(__name__)
class _HashingFileReader:
	'''A tiny wrapper around a file-like object which calculates the hash of the file as it is being read.'''
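	# skipStart/skipEnd exclude a fixed-size prefix and suffix from the hash.
	# In this module, those are the 12-byte header and the 20-byte SHA-1 trailer of a packfile.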
	def __init__(self, fp, hasher = hashlib.sha1, skipStart = 0, skipEnd = 0):
		self._fp = fp
		self._hasher = hasher()
		self._skipStart = skipStart
		self._skipEnd = skipEnd
		self._buf = b''

	def read(self, n):
		data = self._fp.read(n)
		if self._skipStart > 0:
			# Requires that the first block is bigger than skipStart+skipEnd because it otherwise gets very complicated
			# Yes, this fails if the file is smaller than that, but given what it's used for, that's not an issue.
			if len(data) < self._skipStart + self._skipEnd:
				raise ValueError(f'first read is required to return at least {self._skipStart + self._skipEnd} bytes')
			start = self._skipStart
			self._skipStart = 0
		else:
			start = 0
		bufPlusData = self._buf + data
		if self._skipEnd > 0:
			self._buf = bufPlusData[-self._skipEnd:]
			end = -self._skipEnd
		else:
			end = None
		self._hasher.update(bufPlusData[start:end])
		return data

	def digest(self):
		if self._skipStart > 0 or len(self._buf) != self._skipEnd:
			raise ValueError('data skipping failed')
		return self._hasher.digest()

class GitMetadata(codearchiver.core.Metadata):
	fields = (
		codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
		codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
		codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
		codearchiver.core.MetadataField(key = 'Head', required = True, repeatable = False),
		codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
		codearchiver.core.MetadataField(key = 'Object', required = False, repeatable = True),
	)
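	# Deserialised metadata yields (key, value) pairs; 'Object' values are '<oid> <objecttype>' strings,
	# which is what _find_storage_bundles below relies on.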
	version = 0


class Git(codearchiver.core.Module):
	name = 'git'
	MetadataClass = GitMetadata

	@staticmethod
	def matches(inputUrl):
		return inputUrl.url.endswith('.git')

	def __init__(self, *args, extraBranches = {}, **kwargs):
		super().__init__(*args, **kwargs)
		self._extraBranches = extraBranches
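		# extraBranches maps a name to a commit ID; process() fetches each commit explicitly and
		# pins it under refs/codearchiver/<name> so that it is included in the bundle.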
	def _find_storage_bundles(self, criteria, checkOids, temporary = False):
		'''Search `self._storage` for bundles or temporary metadata matching `criteria` and containing at least one element of `checkOids`. Yields tuples `(name, objects, oids)`.'''
		searchMethod = self._storage.search_metadata if not temporary else self._storage.search_temporary_metadata
		openMethod = self._storage.open_metadata if not temporary else self._storage.open_temporary_metadata
		for oldBundle in searchMethod(criteria):
			_logger.info(f'Matching bundle: {oldBundle!r}')
			with openMethod(oldBundle) as fp:
				idx = GitMetadata.deserialise(fp)
			isMatch = False
			oldObjects = set()  # 'oid otype' object lines in this bundle
			oldOids = set()  # object IDs in this bundle
			for key, value in idx:
				if key != 'Object':
					continue
				oid, otype = value.split(' ', 1)
				oldObjects.add(value)
				oldOids.add(oid)
				if otype not in ('commit', 'tag'):
					continue
				if not isMatch and oid in checkOids:
					isMatch = True
			if isMatch:
				yield (oldBundle, oldObjects, oldOids)
	def process(self):
		with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.git.', dir = os.getcwd()) as directory:
			bundle = f'{self._id}_git.bundle'
			if os.path.exists(bundle):
				_logger.fatal(f'{bundle!r} already exists')
				raise FileExistsError(f'{bundle!r} already exists')

			_, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version'])
			if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '':
				raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}')
			gitVersion = gitVersion[12:-1]

			_logger.info(f'Cloning {self._url} into {directory}')
			startTime = datetime.datetime.utcnow()
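			# --mirror fetches all refs, not just branches; GIT_TERMINAL_PROMPT=0 makes git fail
			# instead of prompting for credentials when the repository requires authentication.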
			codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'})
			if self._extraBranches:
				for branch, commit in self._extraBranches.items():
					_logger.info(f'Fetching commit {commit} as {branch}')
					r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False)
					if r == 0:
						r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False)
						if r2 != 0:
							_logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}')
					else:
						_logger.error(f'Failed to fetch {commit}')
			# This leaves behind a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
			endTime = datetime.datetime.utcnow()

			_logger.info('Collecting repository metadata')
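			# show-ref prints one '<oid> <refname>' line per ref.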
			_, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
			refs = list(map(str.strip, refs.splitlines()))
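			# Root commits (commits without parents) identify the repository's history;
			# they are used below to match prior bundles of the same repository in storage.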
			_, rootCommits, _ = codearchiver.subprocess.run_with_log(['git', 'rev-list', '--max-parents=0', '--all'], cwd = directory)
			rootCommits = list(filter(None, rootCommits.splitlines()))
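			# cat-file --batch-check --batch-all-objects prints one '<oid> <type> <size>' line per object.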
			_, objects, _ = codearchiver.subprocess.run_with_log(['git', 'cat-file', '--batch-check', '--batch-all-objects', '--unordered', '--buffer'], cwd = directory)
			objects = {oid: otype for oid, otype, osize in map(functools.partial(str.split, sep = ' '), objects.splitlines())}
			with open(os.path.join(directory, 'HEAD'), 'r') as fp:
				head = fp.read()
			if not head.startswith('ref: refs/heads/') or not head.endswith('\n'):
				raise RuntimeError(f'Unexpected HEAD content: {head!r}')
			head = head[:-1]  # Remove trailing \n

			metadata = self.create_metadata(bundle, startTime, endTime)
			metadata.append('Git version', gitVersion)
			for line in refs:
				metadata.append('Ref', line)
			metadata.append('Head', head)
			for commitId in rootCommits:
				metadata.append('Root commit', commitId)
			# Check whether there are relevant prior bundles to create an incremental one
			commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
			tmpMetadataDependencies = []  # temporary metadata names this depends on, to be resolved later
			baseOids = set()  # all oids this depends on (including temporary metadata, but only commits and tags from there)
			baseInProgressObjects = set()  # 'oid otype' lines for finding the bundles at the end
			newCommitsAndTags = set()  # oids of commits and tags not covered in previous bundles or existing temporary metadata
			temporaryMetadataName = None
			if self._storage:
				_logger.info('Checking for previous bundles')

				# A note on dependency optimisation: we want the minimal set of previous bundles {B0, …, Bn} that maximises the cover with the current clone S.
				# In other words, in the general case, this is a set cover problem with I = S ∩ (B0 ∪ … ∪ Bn) as the universe and the Bi ∩ I as the subsets.
				# Fortunately, solving the actual set cover problem is not necessary.
				# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
				# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency.

				# To support parallel archival of related repositories, this reads other processes' temporary metadata from storage and writes its own there as well.
				# First, obtain all relevant prior bundles.
				# Second, obtain all relevant temporary metadata. Make a note of these and also exclude their commits from this bundle. Write own temporary metadata.
				# Third, upon completion (below), wait for the depended-on temporary metadata to disappear, search for the corresponding bundles, and finalise own metadata.

				with self._storage.lock():
					for oldBundleName, oldObjects, oldOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags):
						metadata.append('Based on bundle', oldBundleName)
						baseOids |= oldOids
					for tmpMetadataName, tmpObjects, tmpOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags, temporary = True):
						tmpMetadataDependencies.append(tmpMetadataName)
						baseOids |= tmpOids
						baseInProgressObjects |= tmpObjects
					newCommitsAndTags = commitsAndTags - baseOids
					for oid in newCommitsAndTags:
						metadata.append('Object', f'{oid} {objects[oid]}')
					temporaryMetadataName = self._storage.add_temporary_metadata(metadata)
			try:
				_logger.info(f'Bundling into {bundle}')
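				# Objects already covered by prior bundles are excluded by feeding '^<oid>' lines to
				# `git bundle create --stdin` (rev-list syntax: a leading ^ excludes that object and everything reachable from it).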
				cmd = ['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all']
				objectsToExclude = baseOids & commitsAndTags
				del commitsAndTags
				input = ''.join(f'^{o}\n' for o in objectsToExclude).encode('ascii')
				del objectsToExclude
				status, _, stderr = codearchiver.subprocess.run_with_log(cmd, cwd = directory, input = input, check = False)
				del input
				if status == 128 and (stderr == 'fatal: Refusing to create empty bundle.\n' or stderr.endswith('\nfatal: Refusing to create empty bundle.\n')):
					# Manually write an empty bundle instead
					# Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
					_logger.info('Writing empty bundle directly instead')
					with open(bundle, 'xb') as fp:
						fp.write(b'# v2 git bundle\n')  # bundle signature
						fp.write(b'\n')  # bundle end of prerequisites and refs
						packdata = b'PACK'  # pack signature
						packdata += b'\0\0\0\x02'  # pack version
						packdata += b'\0\0\0\0'  # pack number of objects
						fp.write(packdata)
						fp.write(hashlib.sha1(packdata).digest())  # pack checksum trailer
				elif status != 0:
					raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')
				_logger.info('Indexing bundle')
				# The bundle's packfile might contain deltified objects.
				# Those cannot be run through `index-pack` without using `--fix-thin` and running it in a repo containing the missing objects.
				# However, `--fix-thin` has the side effect that it appends those missing objects to the output pack as well, so they also show up in the `show-index` output afterwards.
				# The fact that this always appends is undocumented, so it can't simply be relied on.
				# So this does the following:
				# - Index the pack; as the data is being read, calculate a hash of the pack data without header and trailer checksum
				# - Verify that the corresponding bytes from the index-pack output file have the same hash
				# - Read the index and filter out any entries that lie beyond the size of the bundle packfile minus 20 (trailer checksum)
				# This gets an accurate index of exactly which objects are in this pack; some might depend on data from other bundles, but that's fine.
				# The extra copy to disk is unfortunately unavoidable anyway, but this hash verification at least makes it somewhat valuable.

				# Index with inline hash calculation
				bundleSize = os.path.getsize(bundle)
				with open(bundle, 'rb') as fpin:
					# Skip over the bundle header (terminated by an empty line)
					for line in fpin:
						if line == b'\n':
							break
					packOffset = fpin.tell()
					hashWrapper = _HashingFileReader(fpin, skipStart = 12, skipEnd = 20)
					codearchiver.subprocess.run_with_log(['git', 'index-pack', '-v', '--fix-thin', '--stdin', '../tmp.pack'], input = hashWrapper, cwd = directory)
				bundlePackSize = bundleSize - packOffset - 12 - 20
				bundlePackHash = hashWrapper.digest()

				# Verify hash of first part of the index-pack output pack
				with open('tmp.pack', 'rb') as fp:
					fp.seek(12)  # Skip over the pack header
					indexPackRead = 0
					hasher = hashlib.sha1()
					while indexPackRead < bundlePackSize:
						data = fp.read(min(bundlePackSize - indexPackRead, 1048576))
						if not data:
							break  # Unexpected EOF; the size comparison below will catch this
						indexPackRead += len(data)
						hasher.update(data)
				indexPackHash = hasher.digest()
				if bundlePackHash != indexPackHash or indexPackRead != bundlePackSize:
					raise RuntimeError(f'index pack hash comparison failed: expected {bundlePackHash!r} (size: {bundlePackSize}), got {indexPackHash!r} (size: {indexPackRead})')

				# Parse index
				with open('tmp.idx', 'rb') as fp:
					_, index, _ = codearchiver.subprocess.run_with_log(['git', 'show-index'], input = fp)
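				# show-index prints one '<offset> <oid> <crc32>' line per object; offsets beyond the
				# original pack data belong to objects appended by --fix-thin and are filtered out here.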
				indexObjectIds = {oid for offset, oid, _ in map(lambda l: l.rstrip('\n').split(' ', 2), index.splitlines()) if int(offset) < bundlePackSize}
				del index
				try:
					indexObjects = {oid: objects[oid] for oid in indexObjectIds}
				except KeyError as e:
					# This should never happen since the bundle is created from the clone with exclusions...
					raise RuntimeError(f'Bundle {bundle} contains object not contained in the present clone') from e
				os.remove('tmp.pack')
				os.remove('tmp.idx')
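				# The lowercase d in --diff-filter excludes deletions, i.e. the log below lists every
				# commit in which .gitmodules was added or modified.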
				_logger.info('Checking for submodules')
				_, commitsWithSubmodules, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--format=format:%H', '--diff-filter=d', '--all', '--', '.gitmodules'], cwd = directory)
				if commitsWithSubmodules:
					_logger.warning('Submodules found but extraction not supported')
				del commitsWithSubmodules
				# Ensure that all commits and tags included in the temporary metadata made it into the pack, else data may be lost!
				indexCommitsAndTags = {oid for oid, otype in indexObjects.items() if otype in ('commit', 'tag')}
				if newCommitsAndTags - indexCommitsAndTags != set():
					raise RuntimeError('Bundle does not contain all commits/tags that were written to temporary metadata, aborting due to data loss risk')
				for oid, otype in indexObjects.items():
					if oid in newCommitsAndTags:
						# Already added to the metadata earlier
						continue
					metadata.append('Object', f'{oid} {otype}')
				del indexObjects, indexCommitsAndTags
				# Bundling completed without issues; wait for depended-on bundles, add them to the metadata, then replace own temporary metadata
				if self._storage:
					if tmpMetadataDependencies:
						self._storage.wait_temporary_metadata(tmpMetadataDependencies)
					with self._storage.lock():
						if tmpMetadataDependencies:
							criteria = [('Module', type(self).name), ('Root commit', tuple(rootCommits)), ('Object', tuple(baseInProgressObjects))]
							missingObjects = baseInProgressObjects.copy()
							for oldBundleName, oldObjects, oldOids in self._find_storage_bundles(criteria, {value.split(' ', 1)[0] for value in baseInProgressObjects}):
								metadata.append('Based on bundle', oldBundleName)
								baseOids |= oldOids
								missingObjects -= oldObjects
							# Verification: all commit/tag objects collected from temporary metadata must be covered
							if missingObjects:
								raise RuntimeError('Resolved temporary metadata bundles do not cover all expected objects')
						# Verification: all objects in the clone are either in a base bundle or in the index
						# This can only be done here because all oids are needed, not just the commit/tag objects
						if objects.keys() - (baseOids | indexObjectIds) != set():
							raise RuntimeError('Object mismatch between clone and bundles')
						self._storage.replace_temporary_metadata(temporaryMetadataName, bundle, metadata)
			except:
				# Attempt to remove the temporary metadata, then reraise
				if self._storage:
					with self._storage.lock():
						self._storage.remove_temporary_metadata(temporaryMetadataName)
				raise

		return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])
	def __repr__(self):
		return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'
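# A minimal usage sketch (hypothetical; it assumes an input URL object with a `url` attribute as used
# by `matches` above, plus whatever else codearchiver.core.Module expects — see codearchiver.core and
# the CLI for the actual invocation):
#
#   inputUrl = codearchiver.core.InputURL('https://example.org/some/repo.git')
#   module = Git(inputUrl, extraBranches = {'pr-1': '0123456789abcdef0123456789abcdef01234567'})
#   result = module.process()  # produces <id>_git.bundle plus its metadata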