A VCS repository archival tool
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

git.py 5.6 KiB

1 year ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. import codearchiver.core
  2. import codearchiver.subprocess
  3. import datetime
  4. import functools
  5. import hashlib
  6. import logging
  7. import os.path
  8. import shutil
  9. import subprocess
# Module-level logger, named after this module; used for all progress and error messages below.
_logger = logging.getLogger(__name__)
  11. class GitMetadata(codearchiver.core.Metadata):
  12. fields = (
  13. codearchiver.core.MetadataField(key = 'Git version', required = True, repeatable = False),
  14. codearchiver.core.MetadataField(key = 'Based on bundle', required = False, repeatable = True),
  15. codearchiver.core.MetadataField(key = 'Ref', required = True, repeatable = True),
  16. codearchiver.core.MetadataField(key = 'Root commit', required = True, repeatable = True),
  17. codearchiver.core.MetadataField(key = 'Commit', required = False, repeatable = True),
  18. )
  19. version = 0
  20. class Git(codearchiver.core.Module):
  21. name = 'git'
  22. MetadataClass = GitMetadata
  23. @staticmethod
  24. def matches(inputUrl):
  25. return inputUrl.url.endswith('.git')
  26. def __init__(self, *args, extraBranches = {}, **kwargs):
  27. super().__init__(*args, **kwargs)
  28. self._extraBranches = extraBranches
  29. def process(self):
  30. directory = self._url.rsplit('/', 1)[1]
  31. if os.path.exists(directory):
  32. _logger.fatal(f'{directory!r} already exists')
  33. raise FileExistsError(f'{directory!r} already exists')
  34. bundle = f'{self._id}.bundle'
  35. if os.path.exists(bundle):
  36. _logger.fatal(f'{bundle!r} already exists')
  37. raise FileExistsError(f'{bundle!r} already exists')
  38. _, gitVersion, _ = codearchiver.subprocess.run_with_log(['git', '--version'])
  39. if not gitVersion.startswith('git version ') or not gitVersion.endswith('\n') or gitVersion[12:-1].strip('0123456789.') != '':
  40. raise RuntimeError(f'Unexpected output from `git --version`: {gitVersion!r}')
  41. gitVersion = gitVersion[12:-1]
  42. _logger.info(f'Cloning {self._url} into {directory}')
  43. startTime = datetime.datetime.utcnow()
  44. codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'})
  45. if self._extraBranches:
  46. for branch, commit in self._extraBranches.items():
  47. _logger.info(f'Fetching commit {commit} as {branch}')
  48. r, _, _ = codearchiver.subprocess.run_with_log(['git', 'fetch', '--verbose', '--progress', 'origin', commit], cwd = directory, check = False)
  49. if r == 0:
  50. r2, _, _ = codearchiver.subprocess.run_with_log(['git', 'update-ref', f'refs/codearchiver/{branch}', commit, ''], cwd = directory, check = False)
  51. if r2 != 0:
  52. _logger.error(f'Failed to update-ref refs/codearchiver/{branch} to {commit}')
  53. else:
  54. _logger.error(f'Failed to fetch {commit}')
  55. # This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored.
  56. endTime = datetime.datetime.utcnow()
  57. _logger.info('Collecting repository metadata')
  58. _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
  59. _, commits, _ = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--all', '--format=format:%H% P'], cwd = directory)
  60. commits = list(map(functools.partial(str.split, sep = ' '), commits.splitlines()))
  61. rootCommits = [c[0] for c in commits if len(c) == 1]
  62. # Check whether there are relevant prior bundles to create an incremental one
  63. # Collect their commits shared with this clone (else `git bundle` complains about 'bad object')
  64. commitSet = set(c[0] for c in commits) # For fast lookup
  65. oldCommits = {} # dict to keep the order reasonable
  66. basedOnBundles = {} # ditto
  67. if self._storage:
  68. for oldBundle in self._storage.search_metadata([('Module', type(self).name)] + [('Root commit', c) for c in rootCommits]):
  69. _logger.info(f'Previous bundle: {oldBundle!r}')
  70. with self._storage.open_metadata(oldBundle) as fp:
  71. idx = GitMetadata.deserialise(fp)
  72. for key, value in idx:
  73. if key == 'Commit' and value in commitSet:
  74. oldCommits[value] = True
  75. basedOnBundles[oldBundle] = True
  76. _logger.info(f'Bundling into {bundle}')
  77. status , _, stderr = codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--stdin', '--reflog', '--all'], cwd = directory, input = ''.join(f'^{commit}\n' for commit in oldCommits).encode('ascii'), check = False)
  78. if status == 128 and stderr == 'fatal: Refusing to create empty bundle.\n':
  79. # Manually write an empty bundle instead
  80. # Cf. Documentation/technical/bundle-format.txt and Documentation/technical/pack-format.txt in git's repository for details on the formats
  81. _logger.info('Writing empty bundle directly instead')
  82. with open(bundle, 'wb') as fp:
  83. fp.write(b'# v2 git bundle\n') # bundle signature
  84. fp.write(b'\n') # bundle end of prerequisites and refs
  85. packdata = b'PACK' # pack signature
  86. packdata += b'\0\0\0\x02' # pack version
  87. packdata += b'\0\0\0\0' # pack number of objects
  88. fp.write(packdata)
  89. fp.write(hashlib.sha1(packdata).digest()) # pack checksum trailer
  90. elif status != 0:
  91. raise RuntimeError(f'git bundle creation returned with non-zero exit status {status}.')
  92. _logger.info(f'Removing clone')
  93. shutil.rmtree(directory)
  94. metadata = self.create_metadata(bundle, startTime, endTime)
  95. metadata.append('Git version', gitVersion)
  96. for oldBundle in basedOnBundles:
  97. metadata.append('Based on bundle', oldBundle)
  98. for line in refs.splitlines():
  99. metadata.append('Ref', line)
  100. for commitHash, *parents in commits:
  101. if commitHash not in oldCommits:
  102. metadata.append('Commit', commitHash)
  103. if not parents:
  104. metadata.append('Root commit', commitHash)
  105. return codearchiver.core.Result(id = self._id, files = [(bundle, metadata)])
  106. def __repr__(self):
  107. return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'