	'''Search `self._storage` for bundles (or, if `temporary` is true, temporary metadata) matching `criteria` and containing at least one element of `checkOids`. Yields tuples `(name, objects, oids)`.'''
	searchMethod = self._storage.search_metadata if not temporary else self._storage.search_temporary_metadata
	openMethod = self._storage.open_metadata if not temporary else self._storage.open_temporary_metadata
	matchedBundles = {} # bundle name → (objects, oids)
	for oldBundle in searchMethod(criteria):
		_logger.info(f'Checking bundle: {oldBundle!r}')
		with openMethod(oldBundle) as fp:
			idx = GitMetadata.deserialise(fp)
		isMatch = False
		oldObjects = set() # 'oid otype' lines in this bundle (all object types)
		oldOids = set() # object IDs in this bundle (all object types)
		for key, value in idx:
			if key != 'Object':
				continue
			oid, otype = value.split(' ', 1)
			oldObjects.add(value)
			oldOids.add(oid)
			if otype not in ('commit', 'tag'):
				continue
			if not isMatch and oid in checkOids:
				isMatch = True
		if isMatch:
			yield (oldBundle, oldObjects, oldOids)
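# Illustrative sketch only: the metadata parsed above consists of key/value pairs,
# and the relevant entries have the form ('Object', '<oid> <otype>'), e.g.
# ('Object', '4b825dc642cb6eb9a060e54bf8d69288fbee4904 tree'). A caller might
# consume the generator like this (the criteria and OIDs here are hypothetical):
#   criteria = [('Module', 'git'), ('Root commit', ('0123abcd',))]
#   for name, objects, oids in self._find_storage_bundles(criteria, {'4567cdef'}):
#       _logger.info(f'{name} holds {len(oids)} objects')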
def process(self):
	with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.git.', dir = os.getcwd()) as directory:
		bundle = f'{self._id}_git.bundle'
@@ -80,10 +105,21 @@ class Git(codearchiver.core.Module):
			raise RuntimeError(f'Unexpected HEAD content: {head!r}')
		# Check whether there are relevant prior bundles to create an incremental one
		commitsAndTags = {oid for oid, otype in objects.items() if otype in ('commit', 'tag')}
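		# For illustration (hypothetical OIDs): objects = {'aaaa': 'commit', 'bbbb': 'tag', 'cccc': 'tree'}
		# yields commitsAndTags = {'aaaa', 'bbbb'}; tree and blob objects are ignored here.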
		basedOnBundles = {} # dict to keep the order
		baseBundleObjects = set()
		tmpMetadataDependencies = [] # temporary metadata names this depends on, to be resolved later
		baseOids = set() # all OIDs this bundle depends on, from prior bundles and from temporary metadata (only commits and tags are taken from the latter); see the sketch after this block
		baseInProgressObjects = set() # 'oid otype' lines for finding the bundles at the end
		newCommitsAndTags = set() # OIDs of commits and tags not covered in previous bundles or existing temporary metadata
		temporaryMetadataName = None
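		# Sketch of the idea (not the exact invocation used elsewhere in this module):
		# an incremental bundle excludes everything reachable from the base OIDs, e.g.
		#   git bundle create new.bundle --branches --tags ^<baseOid1> ^<baseOid2>
		# so only objects new since the prior bundles end up in new.bundle.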
		if self._storage:
			_logger.info('Checking for previous bundles')
@@ -93,90 +129,111 @@ class Git(codearchiver.core.Module):
			# This is because the previous bundles must be disjoint: commit/tag objects are never duplicated. (Trees and blobs might be, but deduplicating those isn't possible.)
			# Therefore, any previous bundle that contains at least one commit or tag object in the current clone must be a dependency. An example follows below.
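			# For example (hypothetical OIDs): if bundle A holds commits {c1, c2} and
			# bundle B holds {c3}, then a clone containing {c1, c2, c3, c4} depends on
			# both A and B, and the new incremental bundle only contains c4.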
			for oldBundle in self._storage.search_metadata([('Module', type(self).name), ('Root commit', tuple(rootCommits))]):
				_logger.info(f'Previous bundle: {oldBundle!r}')
				with self._storage.open_metadata(oldBundle) as fp:
					idx = GitMetadata.deserialise(fp)
				isMatch = False
				oldObjects = set() # commit and tag IDs in this bundle
			indexObjectIds = {l.split(' ', 2)[1] for l in index.splitlines()}
			# To support parallel archival of related repositories, this reads other processes' temporary metadata from storage and writes its own temporary metadata to storage.
			# First, obtain all relevant prior bundles.
			# Second, obtain all relevant temporary metadata. Make a note of these and also exclude their commits from this bundle. Write own temporary metadata.
			# Third, upon completion (below), wait for the depended-on temporary metadata to disappear, search for the corresponding bundles, and finalise own metadata.
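			# Step three (not shown in this excerpt) would, roughly: poll storage until
			# none of the names in tmpMetadataDependencies exist any more, then find the
			# finished bundles via the 'oid otype' lines collected in baseInProgressObjects
			# and record them as 'Based on bundle' entries before finalising own metadata.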
			with self._storage.lock():
				for oldBundleName, oldObjects, oldOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags):
					metadata.append('Based on bundle', oldBundleName)
					baseOids |= oldOids
				for tmpMetadataName, tmpObjects, tmpOids in self._find_storage_bundles([('Module', type(self).name), ('Root commit', tuple(rootCommits))], commitsAndTags, temporary = True):
		'''Put a local file and (if provided) its metadata into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
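		# Hypothetical usage sketch (the file name, metadata object, and keyword are invented for illustration):
		#   storage.put('example_git.bundle', metadata = bundleMetadata)
		#   # on success, 'example_git.bundle' no longer exists in the local directory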
@@ -30,9 +45,8 @@ class Storage(abc.ABC):
		for _, subresult in result.submoduleResults:
			self.put_result(subresult)

	@property
	@abc.abstractmethod
	def newFiles(self) -> list[str]:
	def list_new_files(self) -> list[str]:
		'''
		List of all files that have been `.put()` on this instance.
		This may include additional files for storing metadata.
		'''
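		# For illustration (hypothetical names): after putting a bundle and its metadata,
		# this might return ['example_git.bundle', 'example_git_metadata.txt'].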
		with open(os.path.join(self._directory, filename), mode) as fp:
			yield fp
	def add_temporary_metadata(self, metadata):
		# Build a filename based on the current time in nanoseconds and a (truncated) hash of the metadata; this should guarantee uniqueness to a sufficient degree.
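		# A minimal sketch of such a construction (the exact format and the serialise
		# helper are assumptions; time.time_ns and hashlib.sha256 are standard library):
		#   serialised = metadata.serialise().encode('utf-8')
		#   name = f'{time.time_ns()}_{hashlib.sha256(serialised).hexdigest()[:32]}'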