The little things give you away... A collection of various small helper stuff
#!/usr/bin/env python3
# Only external dependency: requests
import argparse
import base64
import collections
import configparser
import contextlib
import functools
import hashlib
import io
import itertools
import json
import logging
import os
import pprint
import re
import requests
import sys
import time
try:
    import tqdm
except ImportError:
    tqdm = None
import types


logger = logging.getLogger()


class UploadError(Exception):
    def __init__(self, message, r = None, uploadId = None, parts = None):
        self.message = message
        self.r = r
        self.uploadId = uploadId
        self.parts = parts


class PreventCompletionError(UploadError):
    'Raised in place of completing the upload when --no-complete is active'


def get_ia_access_secret(configFile = None):
    if configFile is None:
        # This part of the code is identical (except for style changes) to the one in internetarchive and was written from scratch by JustAnotherArchivist in May 2021.
        xdgConfigHome = os.environ.get('XDG_CONFIG_HOME')
        if not xdgConfigHome or not os.path.isabs(xdgConfigHome) or not os.path.isdir(xdgConfigHome):
            # Per the XDG Base Dir specification, this should be $HOME/.config. Unfortunately, $HOME does not exist on all systems. Therefore, we use ~/.config here.
            # On a POSIX-compliant system, where $HOME must always be set, the XDG spec will be followed precisely.
            xdgConfigHome = os.path.join(os.path.expanduser('~'), '.config')
        for candidate in [os.path.join(xdgConfigHome, 'internetarchive', 'ia.ini'),
                          os.path.join(os.path.expanduser('~'), '.config', 'ia.ini'),
                          os.path.join(os.path.expanduser('~'), '.ia')]:
            if os.path.isfile(candidate):
                configFile = candidate
                break
        # (End of the identical code)
    elif not os.path.isfile(configFile):
        configFile = None
    if not configFile:
        raise RuntimeError('Could not find ia configuration file; did you run `ia configure`?')
    config = configparser.RawConfigParser()
    config.read(configFile)
    if 's3' not in config or 'access' not in config['s3'] or 'secret' not in config['s3']:
        raise RuntimeError('Could not read configuration; did you run `ia configure`?')
    access = config['s3']['access']
    secret = config['s3']['secret']
    return access, secret


def metadata_to_headers(metadata):
    # metadata is a dict or a list of 2-tuples.
    # Returns the headers for the IA S3 request as a dict.
    headers = {}
    counters = collections.defaultdict(int)  # How often each metadata key has been seen
    if isinstance(metadata, dict):
        metadata = metadata.items()
    for key, value in metadata:
        headers[f'x-archive-meta{counters[key]:02d}-{key.replace("_", "--")}'] = value.encode('utf-8')
        counters[key] += 1
    return headers
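
# Example of the resulting header mapping (derived from the code above):
# repeated keys get an incrementing two-digit index, and underscores in keys
# are spelled '--' in the header name:
#   metadata_to_headers({'title': 'Example'})
#     -> {'x-archive-meta00-title': b'Example'}
#   metadata_to_headers([('subject', 'a'), ('subject', 'b')])
#     -> {'x-archive-meta00-subject': b'a', 'x-archive-meta01-subject': b'b'}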


def readinto_size_limit(fin, fout, size, blockSize = 1048576):
    # Copy at most `size` bytes from fin to fout in blocks of up to blockSize.
    while size:
        d = fin.read(min(blockSize, size))
        if not d:
            break
        fout.write(d)
        size -= len(d)


@contextlib.contextmanager
def file_progress_bar(f, mode, description, size = None):
    if size is None:
        pos = f.tell()
        f.seek(0, io.SEEK_END)
        size = f.tell() - pos
        f.seek(pos, io.SEEK_SET)
    if tqdm is not None:
        with tqdm.tqdm(total = size, unit = 'iB', unit_scale = True, unit_divisor = 1024, desc = description) as t:
            wrappedFile = tqdm.utils.CallbackIOWrapper(t.update, f, mode)
            yield wrappedFile
    else:
        # Simple fallback progress indicator that rewrites a status line (via carriage return) with the elapsed time and size in MiB on every read or write
        processedSize = 0
        startTime = time.time()

        def _progress(inc):
            nonlocal processedSize
            processedSize += inc
            proc = f'{processedSize / size * 100 :.0f}%, ' if size else ''
            of = f' of {size / 1048576 :.2f}' if size else ''
            print(f'\r{description}: {proc}{processedSize / 1048576 :.2f}{of} MiB, {time.time() - startTime :.1f} s', end = '', file = sys.stderr)

        class Wrapper:
            def __init__(self, wrapped):
                object.__setattr__(self, '_wrapped', wrapped)

            def __getattr__(self, name):
                return getattr(self._wrapped, name)

            def __setattr__(self, name, value):
                return setattr(self._wrapped, name, value)

        func = getattr(f, mode)

        @functools.wraps(func)
        def _readwrite(self, *args, **kwargs):
            nonlocal mode
            res = func(*args, **kwargs)
            if mode == 'write':
                data, args = args[0], args[1:]
            else:
                data = res
            _progress(len(data))
            return res

        wrapper = Wrapper(f)
        object.__setattr__(wrapper, mode, types.MethodType(_readwrite, wrapper))
        yield wrapper
        print(f'\rdone {description}, {processedSize / 1048576 :.2f} MiB in {time.time() - startTime :.1f} seconds', file = sys.stderr)  # EOL when it's done
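
# Fallback design note: when tqdm is unavailable, the code above hand-rolls the
# equivalent of tqdm.utils.CallbackIOWrapper. The Wrapper proxy forwards all
# attribute access to the real file object, and only the requested 'read' or
# 'write' method is replaced: it is bound onto the instance with
# types.MethodType and installed via object.__setattr__ to bypass the
# forwarding __setattr__.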


@contextlib.contextmanager
def maybe_file_progress_bar(progress, f, *args, **kwargs):
    if progress:
        with file_progress_bar(f, *args, **kwargs) as r:
            yield r
    else:
        yield f
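

# The upload uses the standard S3 multipart flow as implemented by IA S3:
#   1. POST {url}?uploads creates the upload and returns an upload ID;
#   2. PUT {url}?partNumber=N&uploadId=... uploads each part, returning an ETag;
#   3. POST {url}?uploadId=... with an XML list of (part number, ETag) pairs
#      completes the upload.
# Input comes from stdin, so each part is buffered in memory before uploading.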
def upload(item, filename, metadata, *, iaConfigFile = None, partSize = 100*1024*1024, tries = 3, queueDerive = True, keepOldVersion = True, complete = True, uploadId = None, parts = None, progress = True):
    f = sys.stdin.buffer

    # Read `ia` config
    access, secret = get_ia_access_secret(iaConfigFile)

    url = f'https://s3.us.archive.org/{item}/{filename}'
    headers = {'Authorization': f'LOW {access}:{secret}'}

    if uploadId is None:
        # Initiate multipart upload
        logger.info(f'Initiating multipart upload for {filename} in {item}')
        metadataHeaders = metadata_to_headers(metadata)
        r = requests.post(f'{url}?uploads', headers = {**headers, 'x-amz-auto-make-bucket': '1', **metadataHeaders})
        if r.status_code != 200:
            raise UploadError(f'Could not initiate multipart upload; got status {r.status_code} from IA S3', r = r)
        # Fight me!
        m = re.search(r'<uploadid>([^<]*)</uploadid>', r.text, re.IGNORECASE)
        if not m or not m[1]:
            raise UploadError('Could not find upload ID in IA S3 response', r = r)
        uploadId = m[1]
    logger.info(f'Got upload ID {uploadId}')

    # Upload the data in parts
    if parts is None:
        parts = []
    for partNumber in itertools.count(start = len(parts) + 1):
        data = io.BytesIO()
        with maybe_file_progress_bar(progress, data, 'write', 'reading input') as w:
            readinto_size_limit(f, w, partSize)
        data.seek(0)
        size = len(data.getbuffer())
        if not size:
            # We're done!
            break
        logger.info(f'Uploading part {partNumber} ({size} bytes)')
        logger.info('Calculating MD5')
        h = hashlib.md5(data.getbuffer())
        logger.info(f'MD5: {h.hexdigest()}')
        contentMd5 = base64.b64encode(h.digest()).decode('ascii')
        for attempt in range(1, tries + 1):
            if attempt > 1:
                logger.info(f'Retrying part {partNumber}')
                data.seek(0)  # Rewind the buffer; the previous attempt may have consumed it
            try:
                with maybe_file_progress_bar(progress, data, 'read', 'uploading', size = size) as w:
                    r = requests.put(f'{url}?partNumber={partNumber}&uploadId={uploadId}', headers = {**headers, 'Content-MD5': contentMd5}, data = w)
            except (ConnectionError, requests.exceptions.RequestException) as e:
                r = None  # No response to attach if the request itself failed
                err = f'error {type(e).__module__}.{type(e).__name__} {e!s}'
            else:
                if r.status_code == 200:
                    break
                err = f'status {r.status_code}'
            sleepTime = min(3 ** attempt, 30)
            retrying = f', retrying after {sleepTime} seconds' if attempt < tries else ''
            logger.error(f'Got {err} from IA S3 on uploading part {partNumber}{retrying}')
            if attempt == tries:
                raise UploadError(f'Got {err} from IA S3 on uploading part {partNumber}', r = r, uploadId = uploadId, parts = parts)
            time.sleep(sleepTime)
        logger.info(f'Upload OK, ETag: {r.headers["ETag"]}')
        parts.append((partNumber, r.headers['ETag']))

    # If --no-complete is used, raise the special error to be caught in main for pretty printing.
    if not complete:
        logger.info('Not completing upload')
        raise PreventCompletionError('', uploadId = uploadId, parts = parts)

    # Complete upload
    logger.info('Completing upload')
    # FUCKING FIGHT ME!
    completeData = '<CompleteMultipartUpload>' + ''.join(f'<Part><PartNumber>{partNumber}</PartNumber><ETag>{etag}</ETag></Part>' for partNumber, etag in parts) + '</CompleteMultipartUpload>'
    completeData = completeData.encode('utf-8')
    extraHeaders = {'x-archive-queue-derive': '1' if queueDerive else '0', 'x-archive-keep-old-version': '1' if keepOldVersion else '0'}
    for attempt in range(1, tries + 1):
        if attempt > 1:
            logger.info('Retrying completion request')
        r = requests.post(f'{url}?uploadId={uploadId}', headers = {**headers, **extraHeaders}, data = completeData)
        if r.status_code == 200:
            break
        retrying = ', retrying' if attempt < tries else ''
        logger.error(f'Could not complete upload; got status {r.status_code} from IA S3{retrying}')
        if attempt == tries:
            raise UploadError(f'Could not complete upload; got status {r.status_code} from IA S3', r = r, uploadId = uploadId, parts = parts)
    logger.info('Done!')


def abort(item, filename, uploadId, *, iaConfigFile = None, tries = 3):
    # Read `ia` config
    access, secret = get_ia_access_secret(iaConfigFile)

    url = f'https://s3.us.archive.org/{item}/{filename}'
    headers = {'Authorization': f'LOW {access}:{secret}'}

    # Delete upload
    logger.info(f'Aborting upload {uploadId}')
    for attempt in range(1, tries + 1):
        if attempt > 1:
            logger.info('Retrying abort request')
        r = requests.delete(f'{url}?uploadId={uploadId}', headers = headers)
        if r.status_code == 204:
            break
        retrying = ', retrying' if attempt < tries else ''
        logger.error(f'Could not abort upload; got status {r.status_code} from IA S3{retrying}')
        if attempt == tries:
            raise UploadError(f'Could not abort upload; got status {r.status_code} from IA S3', r = r, uploadId = uploadId)
    logger.info('Done!')


def main():
    def metadata(x):
        if ':' not in x:
            raise ValueError
        return x.split(':', 1)

    def size(x):
        try:
            return int(x)
        except ValueError:
            pass
        if x.endswith('M'):
            return int(x[:-1]) * 1024 ** 2
        elif x.endswith('G'):
            return int(x[:-1]) * 1024 ** 3
        raise ValueError

    def parts(x):
        try:
            o = json.loads(base64.b64decode(x))
        except json.JSONDecodeError as e:
            raise ValueError from e
        if not isinstance(o, list) or not all(isinstance(e, list) and len(e) == 2 for e in o):
            raise ValueError
        if [i for i, _ in o] != list(range(1, len(o) + 1)):
            raise ValueError
        return o
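
    # The --parts value round-trips the data logged on error in the except
    # handler below: base64-encoded JSON of [partNumber, etag] pairs with
    # consecutive part numbers starting at 1, i.e. the base64 encoding of
    # something like [[1,"etag1"],[2,"etag2"]] (placeholder ETags).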

    parser = argparse.ArgumentParser()
    parser.add_argument('--partsize', dest = 'partSize', type = size, default = size('100M'), help = 'size of each chunk to buffer in memory and upload (default: 100M = 100 MiB)')
    parser.add_argument('--no-derive', dest = 'queueDerive', action = 'store_false', help = 'disable queueing a derive task')
    parser.add_argument('--clobber', dest = 'keepOldVersion', action = 'store_false', help = 'enable clobbering existing files')
    parser.add_argument('--ia-config-file', dest = 'iaConfigFile', metavar = 'FILE', help = 'path to the ia CLI config file (default: search the same paths as ia)')
    parser.add_argument('--tries', type = int, default = 3, metavar = 'N', help = 'number of attempts on S3 errors (default: 3)')
    parser.add_argument('--no-complete', dest = 'complete', action = 'store_false', help = 'disable completing the upload when stdin is exhausted')
    parser.add_argument('--no-progress', dest = 'progress', action = 'store_false', help = 'disable progress bar')
    parser.add_argument('--upload-id', dest = 'uploadId', help = 'upload ID when resuming or aborting an upload')
    parser.add_argument('--parts', type = parts, help = 'previous parts data for resumption; can only be used with --upload-id')
    parser.add_argument('--abort', action = 'store_true', help = 'aborts an upload; can only be used with --upload-id; most other options are ignored when this is used')
    parser.add_argument('item', help = 'identifier of the target item')
    parser.add_argument('filename', help = 'filename to store the data to')
    parser.add_argument('metadata', nargs = '*', type = metadata, help = "metadata for the item in the form 'key:value'; only has an effect if the item doesn't exist yet")
    args = parser.parse_args()
    if (args.parts or args.abort) and not args.uploadId:
        parser.error('--parts and --abort can only be used together with --upload-id')
    if args.uploadId and (args.parts is not None) == bool(args.abort):
        parser.error('--upload-id requires exactly one of --parts and --abort')
    logging.basicConfig(level = logging.INFO, format = '{asctime}.{msecs:03.0f} {levelname} {name} {message}', datefmt = '%Y-%m-%d %H:%M:%S', style = '{')

    try:
        if not args.abort:
            upload(
                args.item,
                args.filename,
                args.metadata,
                iaConfigFile = args.iaConfigFile,
                partSize = args.partSize,
                tries = args.tries,
                queueDerive = args.queueDerive,
                keepOldVersion = args.keepOldVersion,
                complete = args.complete,
                uploadId = args.uploadId,
                parts = args.parts,
                progress = args.progress,
            )
        else:
            abort(
                args.item,
                args.filename,
                args.uploadId,
                iaConfigFile = args.iaConfigFile,
                tries = args.tries,
            )
    except (RuntimeError, UploadError) as e:
        if isinstance(e, PreventCompletionError):
            level = logging.INFO
        else:
            logger.exception('Unhandled exception raised')
            level = logging.WARNING
        if isinstance(e, UploadError):
            if e.r is not None:
                logger.info(pprint.pformat(vars(e.r.request)), exc_info = False)
                logger.info(pprint.pformat(vars(e.r)), exc_info = False)
            if e.uploadId:
                logger.log(level, f'Upload ID for resumption or abortion: {e.uploadId}', exc_info = False)
            if e.parts:
                parts = base64.b64encode(json.dumps(e.parts, separators = (',', ':')).encode('ascii')).decode('ascii')
                logger.log(level, f'Previous parts data for resumption: {parts}', exc_info = False)


if __name__ == '__main__':
    main()
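

# Illustrative usage (the script filename 'ia-upload-stream.py' and all item
# and file names here are assumptions, not taken from the repository):
#   cat dump.warc.gz | ./ia-upload-stream.py my-item dump.warc.gz title:'My dump'
# On an error after some parts were uploaded, the script logs an upload ID and
# parts data. To resume, pipe in only the not-yet-uploaded remainder of the
# stream (the script does not skip already-uploaded bytes itself):
#   tail -c +N dump.warc.gz | ./ia-upload-stream.py --upload-id UPLOADID --parts PARTSDATA my-item dump.warc.gz
# Or abort the upload:
#   ./ia-upload-stream.py --abort --upload-id UPLOADID my-item dump.warc.gz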