diff --git a/s3-bucket-find-direct-url b/s3-bucket-find-direct-url index b83606c..62eae40 100755 --- a/s3-bucket-find-direct-url +++ b/s3-bucket-find-direct-url @@ -2,27 +2,55 @@ import re import requests import sys +import urllib3 RESPONSE_PATTERN = re.compile(r'''^<\?xml version=(["'])1\.0\1 encoding=(["'])UTF-8\2\?>''' '\n?' r'''''') NAME_PATTERN = re.compile(r'([^<]*)') KEY_PATTERN = re.compile(r'([^<]*)') MTIME_PATTERN = re.compile(r'([^<]*)') +REDIRECT_PATTERN = re.compile(r'''^<\?xml version=(["'])1\.0\1 encoding=(["'])UTF-8\2\?>''' '\n?' r'PermanentRedirect') +REDIRECT_TARGET_ENDPOINT_PATTERN = re.compile(r'([^<]*)') +REDIRECT_TARGET_BUCKET_PATTERN = re.compile(r'([^<]*)') PROVIDERS = { - 'amazon': ['https://s3.amazonaws.com/{}/'], + 'amazon': ['https://s3.amazonaws.com/{}/', 'https://{}.s3.amazonaws.com/'], 'google': ['https://storage.googleapis.com/{}/'], 'scaleway': ['https://s3.nl-ams.scw.cloud/{}/', 'https://s3.fr-par.scw.cloud/{}/'], 'wasabi': ['https://s3.wasabisys.com/{}/'], } -def find(url, providers): +# AWS S3 buckets whose names contain a . are broken because AWS can't be bothered to serve valid TLS certs for them. +urllib3.disable_warnings() + + +def fetch_with_redirect(url): print(f'Fetching {url}', file = sys.stderr) - r = requests.get(url, timeout = 60) + r = requests.get(url, verify = False, timeout = 60) print(f'{r.status_code} {url}', file = sys.stderr) body = r.text - if not RESPONSE_PATTERN.match(body): + if r.status_code == 301 and REDIRECT_PATTERN.match(body): + m = REDIRECT_TARGET_ENDPOINT_PATTERN.search(body) + if not m: + raise RuntimeError('Could not get redirect endpoint') + endpoint = m.group(1) + m = REDIRECT_TARGET_BUCKET_PATTERN.search(body) + if not m: + raise RuntimeError('Could not get redirect bucket') + bucket = m.group(1) + print(f'Redirect to endpoint {endpoint!r} bucket {bucket!r}') + url = f'https://{endpoint}/{bucket}/' + print(f'Fetching {url}') + r = requests.get(url, timeout = 60) + print(f'{r.status_code} {url}', file = sys.stderr) + body = r.text + if r.status_code == 200 and not RESPONSE_PATTERN.match(body): raise RuntimeError(f'Invalid body: {body[:200]}...') + return r, url, body + + +def find(url, providers): + _, _, body = fetch_with_redirect(url) # Get bucket name m = NAME_PATTERN.search(body) @@ -48,14 +76,9 @@ def find(url, providers): for provider in providers: for testUrlTemplate in PROVIDERS[provider]: testUrl = testUrlTemplate.format(name) - print(f'Fetching {testUrl}', file = sys.stderr) - r = requests.get(testUrl, timeout = 60) - print(f'{r.status_code} {testUrl}', file = sys.stderr) + r, testUrl, body = fetch_with_redirect(testUrl) if r.status_code != 200: continue - body = r.text - if not RESPONSE_PATTERN.match(body): - raise RuntimeError(f'Invalid body: {body[:200]}...') # Compare first object if not firstKey: