#!/usr/bin/env python3
"""Search for the S3 bucket that is available at a URL (e.g. CDN or proxy).

The bucket listing served at the given URL is fetched, the bucket name is read
from it, and the endpoints of known providers are probed for a bucket with the
same name. A candidate is reported as a match when its first object has the
same key and modification time as the first object of the original listing.

Provenance: added by JustAnotherArchivist in commit
d2afd1309dedc65e6fe2afb6c45cb29d3dea197f ("Add s3-bucket-find-direct-url"),
dated Sat, 26 Mar 2022 00:37:29 +0000.
"""
import re
import sys


# Start of a bucket listing: XML declaration (either quote style), an optional
# newline, then the <ListBucketResult> root element.
RESPONSE_PATTERN = re.compile(
    r'''^<\?xml version=(["'])1\.0\1 encoding=(["'])UTF-8\2\?>'''
    '\n?'
    r'''<ListBucketResult[\s>]'''
)
# Each of these captures the text content of the first such element in the body.
NAME_PATTERN = re.compile(r'<Name>([^<]*)</Name>')
KEY_PATTERN = re.compile(r'<Key>([^<]*)</Key>')
MTIME_PATTERN = re.compile(r'<LastModified>([^<]*)</LastModified>')
# Provider name -> URL templates; '{}' is replaced with the bucket name.
PROVIDERS = {
    'amazon': ['https://s3.amazonaws.com/{}/'],
    'google': ['https://storage.googleapis.com/{}/'],
    'scaleway': ['https://s3.nl-ams.scw.cloud/{}/', 'https://s3.fr-par.scw.cloud/{}/'],
}


def _first_object(body):
    """Return ``(key, mtime)`` of the first object in a listing body.

    ``key`` is None when the body contains no <Key> element; ``mtime`` is None
    when a key was found but the body contains no <LastModified> element.
    """
    m = KEY_PATTERN.search(body)
    if not m:
        return None, None
    key = m.group(1)
    m = MTIME_PATTERN.search(body)
    return key, (m.group(1) if m else None)


def find(url, providers):
    """Probe the given providers for the bucket whose listing is served at *url*.

    Progress is written to stderr; every match is printed to stdout.

    Args:
        url: URL that serves an S3 bucket listing.
        providers: iterable of provider names (keys of ``PROVIDERS``).

    Returns:
        List of provider URLs whose first object matches the one at *url*.

    Raises:
        KeyError: a provider name is not in ``PROVIDERS`` (checked before any
            request is made).
        RuntimeError: a response is not a bucket listing, the bucket name is
            missing or contains an XML entity, or the original listing has a
            key but no modification time.
    """
    providers = tuple(providers)
    for provider in providers:
        if provider not in PROVIDERS:
            raise KeyError(provider)

    # Imported here rather than at module level so that usage/help output and
    # argument validation work even when the third-party package is missing.
    import requests

    print(f'Fetching {url}', file = sys.stderr)
    r = requests.get(url, timeout = 60)
    print(f'{r.status_code} {url}', file = sys.stderr)
    body = r.text
    if not RESPONSE_PATTERN.match(body):
        raise RuntimeError(f'Invalid body: {body[:200]}...')

    # Get bucket name
    m = NAME_PATTERN.search(body)
    if not m:
        raise RuntimeError('Could not find bucket name')
    name = m.group(1)
    if '&' in name:
        # XML entities are not decoded, so such a name cannot be put into a URL.
        raise RuntimeError(f'Unsupported bucket name: {name!r}')

    # Get name and mtime of first object
    firstKey, firstMtime = _first_object(body)
    if firstKey is None:
        print('Warning: no key found, cannot verify that it is the same bucket', file = sys.stderr)
    elif firstMtime is None:
        raise RuntimeError('Got key but no mtime')

    # Start searching
    matches = []
    for provider in providers:
        for testUrlTemplate in PROVIDERS[provider]:
            testUrl = testUrlTemplate.format(name)
            print(f'Fetching {testUrl}', file = sys.stderr)
            try:
                r = requests.get(testUrl, timeout = 60)
            except requests.RequestException as e:
                # One unreachable endpoint must not abort the remaining probes.
                print(f'Error fetching {testUrl}: {e!r}', file = sys.stderr)
                continue
            print(f'{r.status_code} {testUrl}', file = sys.stderr)
            if r.status_code != 200:
                continue
            body = r.text
            if not RESPONSE_PATTERN.match(body):
                raise RuntimeError(f'Invalid body: {body[:200]}...')

            # Compare first object; without a reference key nothing can be
            # verified, but the status line above is still printed.
            if not firstKey:
                continue
            testFirstKey, testFirstMtime = _first_object(body)
            if testFirstKey is None:
                print(f'No key in {testUrl}', file = sys.stderr)
                continue
            if testFirstMtime is None:
                print(f'Got key but no mtime in {testUrl}', file = sys.stderr)
                continue

            if (firstKey, firstMtime) == (testFirstKey, testFirstMtime):
                print(f'Found the bucket: {url} == {testUrl}')
                matches.append(testUrl)
    return matches


def main(argv):
    """Command-line entry point; *argv* has the layout of ``sys.argv``."""
    if not 2 <= len(argv) <= 3 or argv[1] in ('--help', '-h'):
        print('Usage: s3-bucket-find-direct-url URL [PROVIDER]', file = sys.stderr)
        print("Searches for an S3 bucket that's available at URL (e.g. CDN or proxy), optionally filtered by PROVIDER", file = sys.stderr)
        print(f'Providers: {", ".join(PROVIDERS)}', file = sys.stderr)
        sys.exit(1)

    url = argv[1]
    if len(argv) == 3:
        if argv[2] not in PROVIDERS:
            print(f'Unknown provider: {argv[2]}', file = sys.stderr)
            print(f'Providers: {", ".join(PROVIDERS)}', file = sys.stderr)
            sys.exit(1)
        providers = (argv[2],)
    else:
        providers = tuple(PROVIDERS)
    find(url, providers)


if __name__ == '__main__':
    main(sys.argv)