The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

143 regels
4.9 KiB

  1. #!/usr/bin/env python3
  2. import html
  3. import http.client
  4. import json
  5. import os
  6. import re
  7. import shlex
  8. import ssl
  9. import sys
  10. import urllib.parse
  11. RESPONSE_PATTERN = re.compile(r'''^<\?xml version=(["'])1\.0\1 encoding=(["'])UTF-8\2\?>''' '\n?' r'''<ListBucketResult xmlns=(["'])http://(?:s3\.amazonaws\.com/doc/2006-03-01/|doc\.s3\.amazonaws\.com/2006-03-01)\3>'''.encode('ascii'))
  12. # Arguments
  13. i = 1
  14. withListUrls = False
  15. listUrlsFD = None
  16. startMarker = None
  17. format = None
  18. defaultFormat = '{url}'
  19. jsonl = False
  20. args = []
  21. while i < len(sys.argv):
  22. arg = sys.argv[i]
  23. if arg == '--help':
  24. print('s3-bucket-list [options] BUCKETURL', file = sys.stderr)
  25. print('', file = sys.stderr)
  26. print('Options:', file = sys.stderr)
  27. print(f' --format FORMAT Modify the output format; FORMAT defaults to {defaultFormat!r}; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)', file = sys.stderr)
  28. print( ' --jsonl Enable JSONL output format; cannot be used if --format is present', file = sys.stderr)
  29. print( ' --marker KEY Start after a particular key instead of from the beginning', file = sys.stderr)
  30. print( ' --with-list-urls Enables printing the list URLs retrieved to FD 3', file = sys.stderr)
  31. sys.exit(1)
  32. elif arg == '--with-list-urls':
  33. withListUrls = True
  34. try:
  35. listUrlsFD = os.fdopen(3, 'w', buffering = 1)
  36. except OSError:
  37. print('Error: FD 3 not open', file = sys.stderr)
  38. sys.exit(1)
  39. elif arg == '--marker':
  40. startMarker = sys.argv[i + 1]
  41. i += 1
  42. elif arg == '--format':
  43. format = sys.argv[i + 1]
  44. i += 1
  45. elif arg == '--jsonl':
  46. jsonl = True
  47. else:
  48. args.append(arg)
  49. i += 1
  50. assert not jsonl or format is None, '--jsonl and --format options are mutually exclusive'
  51. if format is None:
  52. format = defaultFormat
  53. assert len(args) == 1, 'Need one argument: bucket URL'
  54. baseUrl = args[0]
  55. assert baseUrl.startswith('http://') or baseUrl.startswith('https://'), 'Argument does not look like an HTTP URL'
  56. if '/' not in baseUrl.split('://', 1)[1] or not baseUrl.endswith('/'):
  57. baseUrl = f'{baseUrl}/'
  58. hostname = baseUrl.split('://', 1)[1].split('/', 1)[0]
  59. conn = http.client.HTTPSConnection(hostname, context = ssl._create_unverified_context())
  60. params = {}
  61. if startMarker is not None:
  62. params['marker'] = startMarker
  63. attempt = 1
  64. while True:
  65. queryString = urllib.parse.urlencode(params)
  66. url = f'{baseUrl}{"?" + queryString if queryString else ""}'
  67. if withListUrls:
  68. print(f'{url}', file = listUrlsFD)
  69. conn.request('GET', url[url.index('/', 8):])
  70. resp = conn.getresponse()
  71. body = resp.read()
  72. if b'<Error><Code>InternalError</Code><Message>We encountered an internal error. Please try again.</Message>' in body:
  73. print(f'Got internal error on {url} on attempt {attempt}; {"retrying" if attempt < 10 else "aborting"}', file = sys.stderr)
  74. if attempt >= 10:
  75. if 'marker' in params:
  76. print(f'To retry, use --marker {shlex.quote(params["marker"])}', file = sys.stderr)
  77. break
  78. attempt += 1
  79. continue
  80. if not RESPONSE_PATTERN.match(body):
  81. raise RuntimeError(f'Invalid body: {body[:200]}...')
  82. if b'<Marker></Marker>' in body[:200] and 'marker' in params:
  83. raise RuntimeError('Marker loop (empty marker in response despite providing one)')
  84. # No risk, no fun!
  85. contents = body.split(b'<Contents>')
  86. assert all(content.startswith(b'<Key>') for content in contents[1:])
  87. assert all(content.endswith(b'</Contents>') for content in contents[1:-1])
  88. assert contents[-1].endswith(b'</Contents></ListBucketResult>')
  89. contents[-1] = contents[-1][:-len('</ListBucketResult>')]
  90. for content in contents[1:]:
  91. key = html.unescape(content[5 : content.index(b'</Key>')].decode('utf-8')) # 5 = len(b'<Key>')
  92. fields = {}
  93. url = f'{baseUrl}{urllib.parse.quote(key)}'
  94. fields['URL'] = url
  95. tags = content.split(b'>')
  96. assert len(tags) % 2 == 0
  97. assert tags[-1] == b''
  98. assert tags[-2] == b'</Contents'
  99. openTags = [] # Current open tag hierarchy
  100. for tag in tags[:-2]:
  101. if tag.startswith(b'<'):
  102. openTags.append(tag[1:])
  103. continue
  104. assert openTags
  105. if tag.endswith(b'</' + openTags[-1]):
  106. k = b'>'.join(openTags).decode('utf-8')
  107. assert k not in fields, f'{k!r} encountered twice (previous value: {fields[k]!r})'
  108. fields[k] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
  109. openTags.pop()
  110. continue
  111. assert False
  112. if 'Size' in fields:
  113. fields['Size'] = int(fields['Size'])
  114. try:
  115. if jsonl:
  116. print(json.dumps(fields))
  117. else:
  118. print(format.format(**fields, key = key, url = url, size = fields.get('Size')))
  119. except BrokenPipeError:
  120. sys.exit(0)
  121. lastKey = key
  122. truncated = True if b'<IsTruncated>true</IsTruncated>' in body else (False if b'<IsTruncated>false</IsTruncated>' in body else None)
  123. assert truncated in (True, False)
  124. if not truncated:
  125. break
  126. if 'marker' in params and params['marker'] == lastKey:
  127. raise RuntimeError('Marker loop (same last key as previous marker)')
  128. params['marker'] = lastKey
  129. attempt = 1