The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

139 lines
4.8 KiB

  1. #!/usr/bin/env python3
  2. import html
  3. import http.client
  4. import json
  5. import os
  6. import shlex
  7. import ssl
  8. import sys
  9. import urllib.parse
  10. # Arguments
  11. i = 1
  12. withListUrls = False
  13. listUrlsFD = None
  14. startMarker = None
  15. format = None
  16. defaultFormat = '{url}'
  17. jsonl = False
  18. args = []
  19. while i < len(sys.argv):
  20. arg = sys.argv[i]
  21. if arg == '--help':
  22. print('s3-bucket-list [options] BUCKETURL', file = sys.stderr)
  23. print('', file = sys.stderr)
  24. print('Options:', file = sys.stderr)
  25. print(f' --format FORMAT Modify the output format; FORMAT defaults to {defaultFormat!r}; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)', file = sys.stderr)
  26. print( ' --jsonl Enable JSONL output format; cannot be used if --format is present', file = sys.stderr)
  27. print( ' --marker KEY Start after a particular key instead of from the beginning', file = sys.stderr)
  28. print( ' --with-list-urls Enables printing the list URLs retrieved to FD 3', file = sys.stderr)
  29. sys.exit(1)
  30. elif arg == '--with-list-urls':
  31. withListUrls = True
  32. try:
  33. listUrlsFD = os.fdopen(3, 'w')
  34. except OSError:
  35. print('Error: FD 3 not open', file = sys.stderr)
  36. sys.exit(1)
  37. elif arg == '--marker':
  38. startMarker = sys.argv[i + 1]
  39. i += 1
  40. elif arg == '--format':
  41. format = sys.argv[i + 1]
  42. i += 1
  43. elif arg == '--jsonl':
  44. jsonl = True
  45. else:
  46. args.append(arg)
  47. i += 1
  48. assert not jsonl or format is None, '--jsonl and --format options are mutually exclusive'
  49. if format is None:
  50. format = defaultFormat
  51. assert len(args) == 1, 'Need one argument: bucket URL'
  52. baseUrl = args[0]
  53. assert baseUrl.startswith('http://') or baseUrl.startswith('https://'), 'Argument does not look like an HTTP URL'
  54. if '/' not in baseUrl.split('://', 1)[1] or not baseUrl.endswith('/'):
  55. baseUrl = f'{baseUrl}/'
  56. hostname = baseUrl.split('://', 1)[1].split('/', 1)[0]
  57. conn = http.client.HTTPSConnection(hostname, context = ssl._create_unverified_context())
  58. params = {}
  59. if startMarker is not None:
  60. params['marker'] = startMarker
  61. attempt = 1
  62. while True:
  63. queryString = urllib.parse.urlencode(params)
  64. url = f'{baseUrl}{"?" + queryString if queryString else ""}'
  65. if withListUrls:
  66. print(f'{url}', file = listUrlsFD)
  67. conn.request('GET', url[url.index('/', 8):])
  68. resp = conn.getresponse()
  69. body = resp.read()
  70. if b'<Error><Code>InternalError</Code><Message>We encountered an internal error. Please try again.</Message>' in body:
  71. print(f'Got internal error on {url} on attempt {attempt}; {"retrying" if attempt < 10 else "aborting"}', file = sys.stderr)
  72. if attempt >= 10:
  73. if 'marker' in params:
  74. print(f'To retry, use --marker {shlex.quote(params["marker"])}', file = sys.stderr)
  75. break
  76. attempt += 1
  77. continue
  78. if not body.startswith(b'<?xml version="1.0" encoding="UTF-8"?>\n<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">') and not body.startswith(b"<?xml version='1.0' encoding='UTF-8'?><ListBucketResult xmlns='http://doc.s3.amazonaws.com/2006-03-01'>"):
  79. raise RuntimeError(f'Invalid body: {body[:200]}...')
  80. if b'<Marker></Marker>' in body[:200] and 'marker' in params:
  81. raise RuntimeError('Marker loop (empty marker in response despite providing one)')
  82. # No risk, no fun!
  83. contents = body.split(b'<Contents>')
  84. assert all(content.startswith(b'<Key>') for content in contents[1:])
  85. assert all(content.endswith(b'</Contents>') for content in contents[1:-1])
  86. assert contents[-1].endswith(b'</Contents></ListBucketResult>')
  87. contents[-1] = contents[-1][:-len('</ListBucketResult>')]
  88. for content in contents[1:]:
  89. key = html.unescape(content[5 : content.index(b'</Key>')].decode('utf-8')) # 5 = len(b'<Key>')
  90. fields = {}
  91. url = f'{baseUrl}{urllib.parse.quote(key)}'
  92. fields['URL'] = url
  93. tags = content.split(b'>')
  94. assert len(tags) % 2 == 0
  95. assert tags[-1] == b''
  96. assert tags[-2] == b'</Contents'
  97. openTags = [] # Current open tag hierarchy
  98. for tag in tags[:-2]:
  99. if tag.startswith(b'<'):
  100. openTags.append(tag[1:])
  101. continue
  102. assert openTags
  103. if tag.endswith(b'</' + openTags[-1]):
  104. k = b'>'.join(openTags).decode('utf-8')
  105. assert k not in fields, f'{k!r} encountered twice (previous value: {fields[k]!r})'
  106. fields[k] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
  107. openTags.pop()
  108. continue
  109. assert False
  110. if 'Size' in fields:
  111. fields['Size'] = int(fields['Size'])
  112. try:
  113. if jsonl:
  114. print(json.dumps(fields))
  115. else:
  116. print(format.format(**fields, key = key, url = url, size = fields.get('Size')))
  117. except BrokenPipeError:
  118. sys.exit(0)
  119. lastKey = key
  120. truncated = True if b'<IsTruncated>true</IsTruncated>' in body else (False if b'<IsTruncated>false</IsTruncated>' in body else None)
  121. assert truncated in (True, False)
  122. if not truncated:
  123. break
  124. if 'marker' in params and params['marker'] == lastKey:
  125. raise RuntimeError('Marker loop (same last key as previous marker)')
  126. params['marker'] = lastKey
  127. attempt = 1