A VCS repository archival tool
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

229 lines
9.0 KiB

  1. import abc
  2. import codearchiver.core
  3. import collections.abc
  4. import contextlib
  5. import filelock
  6. import glob
  7. import hashlib
  8. import logging
  9. import os
  10. import os.path
  11. import shutil
  12. import time
  13. import typing
  14. _logger = logging.getLogger(__name__)
  15. class Storage(abc.ABC):
  16. '''
  17. Interface for storage backing the codearchiver collection
  18. This serves primarily to aid deduplication by locating prior archives of the same or closely related repositories.
  19. Filenames must not contain LF.
  20. '''
  21. @abc.abstractmethod
  22. @contextlib.contextmanager
  23. def lock(self, blocking = True) -> typing.Iterator[bool]:
  24. '''
  25. Acquire a lock on the storage.
  26. If `blocking`, this method blocks until the lock can be acquired.
  27. Yields whether the lock was acquired. If `blocking`, this is always `True`.
  28. Once the context manager is exited, the lock shall be released.
  29. Other methods must only be called while holding the lock unless noted otherwise. The `Storage` class may or may not enforce this.
  30. '''
  31. @abc.abstractmethod
  32. def put(self, filename: str, metadata: typing.Optional['codearchiver.core.Metadata'] = None):
  33. '''Put a local file and (if provided) its metadata into storage. If an error occurs, a partial copy may remain in storage. If it completes, the local input file is removed.'''
  34. def put_result(self, result: 'codearchiver.core.Result'):
  35. '''Put a module's Result into storage. The semantics are as for `put`, and the exact behaviour regarding partial copies and leftover files on errors is undefined.'''
  36. for fn, metadata in result.files:
  37. self.put(fn, metadata)
  38. for _, subresult in result.submoduleResults:
  39. self.put_result(subresult)
  40. @abc.abstractmethod
  41. def list_new_files(self) -> list[str]:
  42. '''
  43. List of all files that have been `.put()` on this instance.
  44. This may include additional files for storing metadata.
  45. '''
  46. # The return value must be a copy of the state.
  47. @abc.abstractmethod
  48. def search_metadata(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]:
  49. '''
  50. Search all metadata in storage by criteria.
  51. Refer to `codearchiver.core.Metadata.matches` for the semantics of `criteria`.
  52. Yields all filenames where all criteria match in lexicographical order.
  53. '''
  54. @abc.abstractmethod
  55. @contextlib.contextmanager
  56. def open_metadata(self, filename: str) -> typing.TextIO:
  57. '''Open the metadata for a file in serialised form.'''
  58. @abc.abstractmethod
  59. @contextlib.contextmanager
  60. def open(self, filename: str, mode: typing.Optional[str] = 'rb') -> typing.Iterator[typing.Union[typing.BinaryIO, typing.TextIO]]:
  61. '''Open a file from storage. The mode must be r or rb.'''
  62. @abc.abstractmethod
  63. def add_temporary_metadata(self, metadata: 'codearchiver.core.Metadata') -> str:
  64. '''
  65. Add a temporary metadata record, to be replaced by permanent data or removed depending on the further processing.
  66. This is intended to allow for parallel deduplication.
  67. Every call to this method MUST be paired with a call to either `replace_temporary_metadata` or `remove_temporary_metadata`.
  68. Returns a unique name for this temporary record for use in the other `*_temporary_metadata` methods.
  69. '''
  70. # The name must be unique in perpetuity, i.e. it must never be reused.
  71. @abc.abstractmethod
  72. def search_temporary_metadata(self, criteria: list[tuple[str, typing.Union[str, tuple[str]]]]) -> collections.abc.Iterator[str]:
  73. '''Same as `search_metadata`, but for the temporary metadata written by `add_temporary_metadata`.'''
  74. @abc.abstractmethod
  75. def open_temporary_metadata(self, name: str) -> typing.TextIO:
  76. '''Open temporary metadata.'''
  77. @abc.abstractmethod
  78. def replace_temporary_metadata(self, name: str, filename: str, metadata: 'codearchiver.core.Metadata'):
  79. '''Replace the temporary metadata with a matching proper file and accompanying metadata.'''
  80. @abc.abstractmethod
  81. def remove_temporary_metadata(self, name: str):
  82. '''Remove the temporary metadata without adding a matching proper file instead, e.g. in case of an error.'''
  83. @abc.abstractmethod
  84. def wait_temporary_metadata(self, names: list[str], sleepTime: typing.Optional[float] = None):
  85. '''
  86. Block until all temporary metadata in `names` are gone.
  87. `sleepTime` is the time to wait between attempts to check for existence, used for storage layers that do not support direct monitoring.
  88. The caller should afterwards use `search_metadata` with appropriate `criteria` to find matching permanent files.
  89. This method must be called without holding the global storage lock.
  90. '''
  91. class DirectoryStorage(Storage):
  92. def __init__(self, directory):
  93. super().__init__()
  94. self._directory = directory
  95. self._newFiles = []
  96. self._lock = filelock.FileLock(os.path.join(self._directory, '.lock'))
  97. @contextlib.contextmanager
  98. def lock(self, blocking = True):
  99. try:
  100. with self._lock.acquire(blocking = blocking):
  101. yield True
  102. except filelock.Timeout:
  103. yield False
  104. def _check_directory(self):
  105. exists = os.path.exists(self._directory)
  106. if exists and not os.path.isdir(self._directory):
  107. raise NotADirectoryError(self._directory)
  108. return exists
  109. def _ensure_directory(self):
  110. if not self._check_directory():
  111. os.makedirs(self._directory)
  112. def put(self, filename, metadata = None):
  113. self._ensure_directory()
  114. if '\n' in filename:
  115. raise ValueError(fr'filenames cannot contain \n: {filename!r}')
  116. #FIXME: Race condition
  117. if os.path.exists((targetFilename := os.path.join(self._directory, os.path.basename(filename)))):
  118. raise FileExistsError(f'{targetFilename} already exists')
  119. _logger.info(f'Moving {filename} to {self._directory}')
  120. shutil.move(filename, self._directory)
  121. self._newFiles.append(filename)
  122. if not metadata:
  123. return
  124. metadataFilename = os.path.join(self._directory, f'{filename}_codearchiver_metadata.txt')
  125. # No need to check for existence here thanks to the 'x' mode
  126. _logger.info(f'Writing metadata for {filename} to {metadataFilename}')
  127. with open(metadataFilename, 'x') as fp:
  128. fp.write(metadata.serialise())
  129. self._newFiles.append(metadataFilename)
  130. def list_new_files(self):
  131. return self._newFiles.copy()
  132. def search_metadata(self, criteria, _suffix = '_codearchiver_metadata.txt'):
  133. _logger.info(f'Searching metadata by criteria: {criteria!r}')
  134. # Replace this with `root_dir` when dropping Python 3.9 support
  135. escapedDirPrefix = os.path.join(glob.escape(self._directory), '')
  136. escapedDirPrefixLen = len(escapedDirPrefix)
  137. escapedSuffix = glob.escape(_suffix)
  138. files = glob.glob(f'{escapedDirPrefix}*{escapedSuffix}')
  139. files.sort()
  140. for metadataFilename in files:
  141. metadataFilename = metadataFilename[escapedDirPrefixLen:]
  142. assert '\n' not in metadataFilename
  143. _logger.info(f'Searching metadata {metadataFilename}')
  144. with self.open(metadataFilename, 'r') as fp:
  145. idx = codearchiver.core.Metadata.deserialise(fp, validate = False)
  146. if idx.matches(criteria):
  147. _logger.info(f'Found metadata match {metadataFilename}')
  148. yield metadataFilename[:-len(_suffix)]
  149. _logger.info('Done searching metadata')
  150. @contextlib.contextmanager
  151. def open_metadata(self, filename):
  152. with self.open(f'{filename}_codearchiver_metadata.txt', 'r') as fp:
  153. yield fp
  154. @contextlib.contextmanager
  155. def open(self, filename, mode = 'rb'):
  156. if '\n' in filename:
  157. raise ValueError(fr'filenames cannot contain \n: {filename!r}')
  158. with open(os.path.join(self._directory, filename), mode) as fp:
  159. yield fp
  160. def add_temporary_metadata(self, metadata):
  161. # Build a filename based on the current time in nanoseconds and a (truncated) hash of the metadata; this should guaranteed uniqueness to a sufficient degree.
  162. serialised = metadata.serialise().encode('utf-8')
  163. metadataHash = hashlib.sha512(serialised).hexdigest()[:128]
  164. filename = f'tmp_{time.time_ns()}_{metadataHash}_codearchiver_temporary_metadata.txt'
  165. self._ensure_directory()
  166. _logger.info(f'Writing temporary metadata to {filename}')
  167. with open(os.path.join(self._directory, filename), 'xb') as fp:
  168. fp.write(serialised)
  169. _logger.info('Done writing temporary metadata file')
  170. return filename
  171. def search_temporary_metadata(self, criteria):
  172. yield from self.search_metadata(criteria, _suffix = '_codearchiver_temporary_metadata.txt')
  173. @contextlib.contextmanager
  174. def open_temporary_metadata(self, name):
  175. with self.open(f'{name}_codearchiver_temporary_metadata.txt', 'r') as fp:
  176. yield fp
  177. def replace_temporary_metadata(self, name, filename, metadata):
  178. self.put(filename, metadata)
  179. self.remove_temporary_metadata(name)
  180. def remove_temporary_metadata(self, name):
  181. if not name.endswith('_codearchiver_temporary_metadata.txt'):
  182. raise RuntimeError('invalid temporary metadata name provided')
  183. _logger.info(f'Removing temporary metadata file {name}')
  184. os.remove(os.path.join(self._directory, name))
  185. def wait_temporary_metadata(self, names, sleepTime = 5):
  186. _logger.info(f'Waiting for temporary metadata: {names!r}')
  187. remaining = set(names)
  188. while remaining:
  189. with self.lock(blocking = False) as locked:
  190. if locked:
  191. remaining = set(filename for filename in remaining if os.path.exists(os.path.join(self._directory, filename)))
  192. if not remaining:
  193. break
  194. time.sleep(sleepTime)
  195. _logger.info('All temporary metadata files gone')