Kaynağa Gözat

Add indices for files

tags/v1.0
JustAnotherArchivist 1 yıl önce
ebeveyn
işleme
0f1f5abc64
3 değiştirilmiş dosya ile 102 ekleme ve 8 silme
  1. +69
    -2
      codearchiver/core.py
  2. +22
    -1
      codearchiver/modules/git.py
  3. +11
    -5
      codearchiver/storage.py

+ 69
- 2
codearchiver/core.py Dosyayı Görüntüle

@@ -2,8 +2,11 @@ import abc
import collections import collections
#import codearchiver.modules # In get_module_class #import codearchiver.modules # In get_module_class
import codearchiver.version import codearchiver.version
import contextlib
import dataclasses import dataclasses
import functools
import logging import logging
import os
import queue import queue
import requests import requests
import time import time
@@ -59,13 +62,77 @@ class Result:
id: str id: str
'''A unique ID for this result''' '''A unique ID for this result'''


files: list[str] = dataclasses.field(default_factory = list)
'''List of filenames produced by the run'''
files: list[tuple[str, typing.Optional['Index']]] = dataclasses.field(default_factory = list)
'''List of filenames produced by the run, optionally with an index'''


submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list) submoduleResults: list[tuple['Module', 'Result']] = dataclasses.field(default_factory = list)
'''List of related submodules and their results''' '''List of related submodules and their results'''




class IndexValidationError(ValueError):
    '''Raised when an index contains unrecognised keys, lacks required keys, or repeats unrepeatable keys'''


@dataclasses.dataclass
class IndexField:
    '''Specification of one key that may appear in an index'''

    # Name of the key as it appears in index entries
    key: str
    # Whether at least one entry with this key must be present
    required: bool
    # Whether more than one entry with this key is permitted
    repeatable: bool

class Index(list[tuple[str, str]]):
    '''An index (key-value mapping, possibly with repeated keys) of a file produced by a module

    Entries are `(key, value)` tuples kept in insertion order. Subclasses declare the permitted keys through `fields`.
    The serialised form is one `key: value` line per entry; keys must therefore not contain `': '`, and neither keys nor values may contain newlines, for a serialise/deserialise round trip to be lossless.'''

    fields: 'list[IndexField]' = []
    '''The fields for this index'''

    def append(self, *args):
        '''Append an entry, given either as a single `(key, value)` tuple or as separate `key, value` arguments'''

        if len(args) == 1:
            args = args[0]
        return super().append(args)

    def validate(self):
        '''Check that all keys and values in the index conform to the specification

        Raises IndexValidationError if a key is not listed in `fields`, a required key is missing, or a non-repeatable key occurs more than once.'''

        keyCounts = collections.Counter(key for key, _ in self)
        keys = set(keyCounts)

        permittedKeys = set(field.key for field in type(self).fields)
        unrecognisedKeys = keys - permittedKeys
        if unrecognisedKeys:
            raise IndexValidationError(f'Unrecognised key(s): {", ".join(sorted(unrecognisedKeys))}')

        requiredKeys = set(field.key for field in type(self).fields if field.required)
        missingRequiredKeys = requiredKeys - keys
        if missingRequiredKeys:
            raise IndexValidationError(f'Missing required key(s): {", ".join(sorted(missingRequiredKeys))}')

        repeatableKeys = set(field.key for field in type(self).fields if field.repeatable)
        repeatedKeys = set(key for key, count in keyCounts.items() if count > 1)
        repeatedUnrepeatableKeys = repeatedKeys - repeatableKeys
        if repeatedUnrepeatableKeys:
            raise IndexValidationError(f'Repeated unrepeatable key(s): {", ".join(sorted(repeatedUnrepeatableKeys))}')

    def serialise(self) -> str:
        '''Convert the index to a string suitable for e.g. a simple text file storage

        The index is validated first, so IndexValidationError is raised for a non-conforming index.'''

        self.validate()
        return ''.join(f'{key}: {value}\n' for key, value in self)

    @classmethod
    def deserialise(cls, f: typing.Union[str, bytes, os.PathLike, typing.TextIO]):
        '''Import a serialised index from a filename or file-like object

        A file-like object is read but not closed. Raises IndexValidationError if a line lacks the `': '` separator or if the resulting index fails validation.'''

        if isinstance(f, (str, bytes, os.PathLike)):
            cm = open(f, 'r')
        else:
            cm = contextlib.nullcontext(f)
        entries = []
        with cm as fp:
            for line in fp:
                # Strip only an actual line terminator; the last line of a file may lack one.
                if line.endswith('\n'):
                    line = line[:-1]
                # Split on the first separator only so that values may themselves contain ': '.
                key, separator, value = line.partition(': ')
                if not separator:
                    raise IndexValidationError(f'Malformed index line: {line!r}')
                entries.append((key, value))
        o = cls(entries)
        o.validate()
        return o


class HttpError(Exception): class HttpError(Exception):
'''An HTTP request failed too many times.''' '''An HTTP request failed too many times.'''




+ 22
- 1
codearchiver/modules/git.py Dosyayı Görüntüle

@@ -1,6 +1,7 @@
import codearchiver.core import codearchiver.core
import codearchiver.subprocess import codearchiver.subprocess
import datetime import datetime
import functools
import logging import logging
import os.path import os.path
import shutil import shutil
@@ -10,6 +11,14 @@ import subprocess
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)




class GitIndex(codearchiver.core.Index):
    '''Index of a Git bundle: its refs, all of its commits, and its root commits'''

    # Every key is both mandatory and allowed to occur any number of times.
    fields = [
        codearchiver.core.IndexField(key = key, required = True, repeatable = True)
        for key in ('Ref', 'Root commit', 'Commit')
    ]


class Git(codearchiver.core.Module): class Git(codearchiver.core.Module):
name = 'git' name = 'git'


@@ -51,10 +60,22 @@ class Git(codearchiver.core.Module):
logger.info(f'Bundling into {bundle}') logger.info(f'Bundling into {bundle}')
codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--all'], cwd = directory) codearchiver.subprocess.run_with_log(['git', 'bundle', 'create', '--progress', f'../{bundle}', '--all'], cwd = directory)


logger.info(f'Collecting repository metadata for index')
_, refs = codearchiver.subprocess.run_with_log(['git', 'show-ref'], cwd = directory)
_, commits = codearchiver.subprocess.run_with_log(['git', 'log', '--reflog', '--format=format:%H% P'], cwd = directory)

logger.info(f'Removing clone') logger.info(f'Removing clone')
shutil.rmtree(directory) shutil.rmtree(directory)


return codearchiver.core.Result(id = self._id, files = [bundle])
index = GitIndex()
for line in refs.splitlines():
index.append('Ref', line)
for commitHash, *parents in map(functools.partial(str.split, sep = ' '), commits.splitlines()):
index.append('Commit', commitHash)
if not parents:
index.append('Root commit', commitHash)

return codearchiver.core.Result(id = self._id, files = [(bundle, index)])


def __repr__(self): def __repr__(self):
return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})' return f'{type(self).__module__}.{type(self).__name__}({self._inputUrl!r}, extraBranches = {self._extraBranches!r})'

+ 11
- 5
codearchiver/storage.py Dosyayı Görüntüle

@@ -12,13 +12,13 @@ _logger = logging.getLogger(__name__)


class Storage(abc.ABC): class Storage(abc.ABC):
@abc.abstractmethod
def put(self, filename: str, index: typing.Optional[codearchiver.core.Index] = None):
    '''Store the local file `filename` and, when `index` is provided, that index along with it. On successful completion, the local input file is removed. If an error occurs, a partial copy may remain in storage.'''


def put_result(self, result: codearchiver.core.Result):
    '''Store everything a module's Result refers to: first each of its own files (with its index, if any) via `put`, then, recursively, the results of its submodules. The semantics are as for `put`; the exact behaviour regarding partial copies and leftover files on errors is undefined.'''
    for filename, fileIndex in result.files:
        self.put(filename, fileIndex)
    for _module, submoduleResult in result.submoduleResults:
        self.put_result(submoduleResult)


@@ -43,10 +43,16 @@ class DirectoryStorage(Storage):
if not self._check_directory(): if not self._check_directory():
os.makedirs(self._directory) os.makedirs(self._directory)


def put(self, filename, index = None):
    '''Move the local file `filename` into the storage directory and, if a non-empty `index` is given, write its serialised form next to it as `<basename>.codearchiver-index`.

    The index file is opened in exclusive-creation mode, so an already existing index file raises FileExistsError. The file is moved before the index is written; if writing the index fails, the moved file remains in storage without an index.'''
    self._ensure_directory()
    _logger.info(f'Moving {filename} to {self._directory}')
    shutil.move(filename, self._directory)
    if not index:
        return
    # shutil.move stores the file under its basename inside the directory, so the index has to be named
    # after the basename as well; joining the raw filename would misplace the index for nested or absolute paths.
    indexFilename = os.path.join(self._directory, f'{os.path.basename(filename)}.codearchiver-index')
    _logger.info(f'Writing index for {filename} to {indexFilename}')
    with open(indexFilename, 'x') as fp:
        fp.write(index.serialise())


@contextlib.contextmanager @contextlib.contextmanager
def open(self, filename): def open(self, filename):


Yükleniyor…
İptal
Kaydet