From 811e119835f5ef0728a911e413938c87cdfcad9d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 10 Mar 2023 11:24:22 +0000 Subject: [PATCH] Add retrieval start/end time metadata fields --- codearchiver/core.py | 12 ++++++++++-- codearchiver/modules/git.py | 5 ++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/codearchiver/core.py b/codearchiver/core.py index 30047cd..35a7d89 100644 --- a/codearchiver/core.py +++ b/codearchiver/core.py @@ -91,6 +91,8 @@ class Metadata(list[tuple[str, str]]): MetadataField('ID', required = True, repeatable = False), MetadataField('Input URL', required = True, repeatable = False), MetadataField('Filename', required = True, repeatable = False), + MetadataField('Retrieval start time', required = True, repeatable = False), + MetadataField('Retrieval end time', required = True, repeatable = False), ) '''The fields for this metadata collection''' @@ -376,8 +378,12 @@ class Module(metaclass = ModuleMeta): def process(self) -> Result: '''Perform the relevant retrieval(s)''' - def create_metadata(self, filename: str) -> Metadata: - '''Create a basic Metadata instance appropriate for this module''' + def create_metadata(self, filename: str, startTime: datetime.datetime, endTime: datetime.datetime) -> Metadata: + ''' + Create a basic Metadata instance appropriate for this module + + `startTime` and `endTime` must be in UTC (e.g. `datetime.datetime.utcnow()`). They should reflect the moments just before and after all interaction with the remote system. + ''' if type(self).MetadataClass is None or type(self).name is None: raise RuntimeError('Module lacks an MetadataClass or a name; cannot create metadata') @@ -387,6 +393,8 @@ class Module(metaclass = ModuleMeta): idx.append('ID', self._id) idx.append('Input URL', self._url) idx.append('Filename', filename) + idx.append('Retrieval start time', startTime.strftime('%Y-%m-%d %H:%M:%S.%f UTC')) + idx.append('Retrieval end time', endTime.strftime('%Y-%m-%d %H:%M:%S.%f UTC')) return idx def __repr__(self): diff --git a/codearchiver/modules/git.py b/codearchiver/modules/git.py index f7f704d..323d480 100644 --- a/codearchiver/modules/git.py +++ b/codearchiver/modules/git.py @@ -1,5 +1,6 @@ import codearchiver.core import codearchiver.subprocess +import datetime import functools import hashlib import logging @@ -49,6 +50,7 @@ class Git(codearchiver.core.Module): gitVersion = gitVersion[12:-1] _logger.info(f'Cloning {self._url} into {directory}') + startTime = datetime.datetime.utcnow() codearchiver.subprocess.run_with_log(['git', 'clone', '--verbose', '--progress', '--mirror', self._url, directory], env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'}) if self._extraBranches: @@ -62,6 +64,7 @@ class Git(codearchiver.core.Module): else: _logger.error(f'Failed to fetch {commit}') # This leaves over a FETCH_HEAD file, but git-bundle does not care about that, so it can safely be ignored. + endTime = datetime.datetime.utcnow() _logger.info('Collecting repository metadata') _, refs, _ = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory) @@ -104,7 +107,7 @@ class Git(codearchiver.core.Module): _logger.info(f'Removing clone') shutil.rmtree(directory) - metadata = self.create_metadata(bundle) + metadata = self.create_metadata(bundle, startTime, endTime) metadata.append('Git version', gitVersion) for oldBundle in basedOnBundles: metadata.append('Based on bundle', oldBundle)