From 9fa36654f5590087dd6e2de7ebbae7e30fe99310 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Mon, 27 Mar 2023 20:40:57 +0000
Subject: [PATCH] Add undocumented --write-artefacts-fd-3 for codearchiver-bot

When --write-artefacts-fd-3 is given, the name of every file that this
process put into storage (metadata files included) is written to file
descriptor 3, one name per line, after processing has finished. FD 3 is
closed once all names have been written.

Storage implementations expose these names through the new `newFiles`
property, which returns a copy of the internal list. DirectoryStorage
records a metadata file by name, in the same form as the data file that
was passed to put(), rather than joined with the storage directory.

DirectoryStorage.put() and DirectoryStorage.open() raise ValueError for
filenames containing LF.
---
 codearchiver/cli.py     |  8 ++++++++
 codearchiver/storage.py | 27 +++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/codearchiver/cli.py b/codearchiver/cli.py
index 1affcc8..de698fd 100644
--- a/codearchiver/cli.py
+++ b/codearchiver/cli.py
@@ -143,6 +143,8 @@ def parse_args():
 	parser.add_argument('--version', action = 'version', version = f'codearchiver {codearchiver.version.__version__}')
 	parser.add_argument('-v', '--verbose', '--verbosity', dest = 'verbosity', action = 'count', default = 0, help = 'Increase output verbosity')
 	parser.add_argument('--dump-locals', dest = 'dumpLocals', action = 'store_true', default = False, help = 'Dump local variables on serious log messages (warnings or higher)')
+	# Undocumented option to write one line for each artefact filename produced by this process to FD 3.
+	parser.add_argument('--write-artefacts-fd-3', dest = 'writeArtefactsFd3', action = 'store_true', help = argparse.SUPPRESS)
 	parser.add_argument('url', help = 'Target URL')
 	args = parser.parse_args()
 	return args
@@ -190,6 +192,8 @@ def main():
 	import codearchiver.storage
 	with _dump_locals_on_exception():
 		inputUrl = codearchiver.core.InputURL(args.url)
+		if args.writeArtefactsFd3:
+			artefactsFd = os.fdopen(3, 'w')
 		storage = codearchiver.storage.DirectoryStorage(os.getcwd())
 		module = codearchiver.core.get_module_instance(inputUrl, storage = storage)
 		with tempfile.TemporaryDirectory(prefix = 'tmp.codearchiver.', dir = os.getcwd()) as td:
@@ -199,6 +203,10 @@ def main():
 				result = module.process()
 			finally:
 				os.chdir('..')
+		if args.writeArtefactsFd3:
+			with artefactsFd:
+				for filename in storage.newFiles:
+					print(filename, file = artefactsFd)
 
 if __name__ == '__main__':
 	main()
diff --git a/codearchiver/storage.py b/codearchiver/storage.py
index 26d6e55..d94db42 100644
--- a/codearchiver/storage.py
+++ b/codearchiver/storage.py
@@ -13,6 +13,12 @@ _logger = logging.getLogger(__name__)
 
 
 class Storage(abc.ABC):
+	'''
+	Interface for storage backing the codearchiver collection.
+	This serves primarily to aid deduplication by locating prior archives of the same or closely related repositories.
+	Filenames must not contain LF.
+	'''
+
 	@abc.abstractmethod
 	def put(self, filename: str, metadata: typing.Optional['codearchiver.core.Metadata'] = None):
 		'''Put a local file and (if provided) its metadata into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
@@ -24,6 +30,15 @@ class Storage(abc.ABC):
 		for _, subresult in result.submoduleResults:
 			self.put_result(subresult)
 
+	@property
+	@abc.abstractmethod
+	def newFiles(self) -> list[str]:
+		'''
+		List of all files that have been `.put()` on this instance.
+		This may include additional files for storing metadata.
+		'''
+		# The return value must be a copy of the state.
+
 	@abc.abstractmethod
 	def search_metadata(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]:
 		'''
@@ -47,6 +62,7 @@ class DirectoryStorage(Storage):
 	def __init__(self, directory):
 		super().__init__()
 		self._directory = directory
+		self._newFiles = []
 
 	def _check_directory(self):
 		exists = os.path.exists(self._directory)
@@ -60,11 +76,14 @@ class DirectoryStorage(Storage):
 
 	def put(self, filename, metadata = None):
 		self._ensure_directory()
+		if '\n' in filename:
+			raise ValueError(fr'filenames cannot contain \n: {filename!r}')
 		#FIXME: Race condition
 		if os.path.exists((targetFilename := os.path.join(self._directory, os.path.basename(filename)))):
 			raise FileExistsError(f'{targetFilename} already exists')
 		_logger.info(f'Moving {filename} to {self._directory}')
 		shutil.move(filename, self._directory)
+		self._newFiles.append(filename)
 		if not metadata:
 			return
 		metadataFilename = os.path.join(self._directory, f'{filename}_codearchiver_metadata.txt')
@@ -72,6 +91,11 @@ class DirectoryStorage(Storage):
 		_logger.info(f'Writing metadata for {filename} to {metadataFilename}')
 		with open(metadataFilename, 'x') as fp:
 			fp.write(metadata.serialise())
+		self._newFiles.append(f'{filename}_codearchiver_metadata.txt')
+
+	@property
+	def newFiles(self):
+		return self._newFiles.copy()
 
 	def search_metadata(self, criteria):
 		_logger.info(f'Searching metadata by criteria: {criteria!r}')
@@ -82,6 +106,7 @@ class DirectoryStorage(Storage):
 		files.sort()
 		for metadataFilename in files:
 			metadataFilename = metadataFilename[escapedDirPrefixLen:]
+			assert '\n' not in metadataFilename
 			_logger.info(f'Searching metadata {metadataFilename}')
 			with self.open(metadataFilename, 'r') as fp:
 				idx = codearchiver.core.Metadata.deserialise(fp, validate = False)
@@ -97,5 +122,7 @@ class DirectoryStorage(Storage):
 
 	@contextlib.contextmanager
 	def open(self, filename, mode = 'rb'):
+		if '\n' in filename:
+			raise ValueError(fr'filenames cannot contain \n: {filename!r}')
 		with open(os.path.join(self._directory, filename), mode) as fp:
 			yield fp