diff --git a/s3-bucket-list b/s3-bucket-list index c2ac623..352e65e 100755 --- a/s3-bucket-list +++ b/s3-bucket-list @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import html import http.client +import json import os import shlex import ssl @@ -13,7 +14,9 @@ i = 1 withListUrls = False listUrlsFD = None startMarker = None -format = '{url}' +format = None +defaultFormat = '{url}' +jsonl = False args = [] while i < len(sys.argv): arg = sys.argv[i] @@ -21,7 +24,8 @@ while i < len(sys.argv): print('s3-bucket-list [options] BUCKETURL', file = sys.stderr) print('', file = sys.stderr) print('Options:', file = sys.stderr) - print(f' --format FORMAT Modify the output format; FORMAT defaults to {format!r}; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)', file = sys.stderr) + print(f' --format FORMAT Modify the output format; FORMAT defaults to {defaultFormat!r}; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)', file = sys.stderr) + print( ' --jsonl Enable JSONL output format; cannot be used if --format is present', file = sys.stderr) print( ' --marker KEY Start after a particular key instead of from the beginning', file = sys.stderr) print( ' --with-list-urls Enables printing the list URLs retrieved to FD 3', file = sys.stderr) sys.exit(1) @@ -38,9 +42,14 @@ while i < len(sys.argv): elif arg == '--format': format = sys.argv[i + 1] i += 1 + elif arg == '--jsonl': + jsonl = True else: args.append(arg) i += 1 +assert not jsonl or format is None, '--jsonl and --format options are mutually exclusive' +if format is None: + format = defaultFormat assert len(args) == 1, 'Need one argument: bucket URL' baseUrl = args[0] assert baseUrl.startswith('http://') or baseUrl.startswith('https://'), 'Argument does not look like an HTTP URL' @@ -84,29 +93,36 @@ while True: contents[-1] = contents[-1][:-len('')] for content in contents[1:]: key = html.unescape(content[5 : content.index(b'')].decode('utf-8')) # 5 = len(b'') + fields = {} url = f'{baseUrl}{urllib.parse.quote(key)}' + fields['URL'] = url tags = content.split(b'>') assert len(tags) % 2 == 0 assert tags[-1] == b'' assert tags[-2] == b''.join(openTags).decode('utf-8')] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8')) + k = b'>'.join(openTags).decode('utf-8') + assert k not in fields, f'{k!r} encountered twice (previous value: {fields[k]!r})' + fields[k] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8')) openTags.pop() continue assert False - size = int(fields['Size']) if 'Size' in fields else None + if 'Size' in fields: + fields['Size'] = int(fields['Size']) try: - print(format.format(**fields, key = key, url = url, size = size)) + if jsonl: + print(json.dumps(fields)) + else: + print(format.format(**fields, key = key, url = url, size = fields.get('Size'))) except BrokenPipeError: sys.exit(0) lastKey = key diff --git a/s3-bucket-list-qwarc b/s3-bucket-list-qwarc index a5aba65..6c9449e 100755 --- a/s3-bucket-list-qwarc +++ b/s3-bucket-list-qwarc @@ -1,5 +1,5 @@ #!/bin/bash -format='{url}' +defaultFormat='{url}' function usage_exit { echo 's3-bucket-list-qwarc [options] BUCKETURL [MARKER...]' >&2 @@ -8,7 +8,8 @@ function usage_exit { echo >&2 echo 'Options:' >&2 echo ' --concurrency N Start qwarc with a concurrency of N; the default is to run all markers at once' >&2 - echo " --format FORMAT Modify the output format; FORMAT defaults to '${format}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2 + echo " --format FORMAT Modify the output format; FORMAT defaults to '${defaultFormat}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2 + echo ' --jsonl Enable JSONL output; cannot be used with --format' >&2 echo ' --no-start-marker Disable the automatic marker to start from the beginning of the list' >&2 echo ' --no-end-marker Disable the automatic marker to scan to the end of the list' >&2 echo ' --with-list-urls Enables printing the list URLs retrieved to FD 3' >&2 @@ -19,6 +20,8 @@ concurrency= listUrls= noStartMarker= noEndMarker= +format= +jsonl= cmd="$(printf '%q ' 's3-bucket-list-qwarc' "$@" | sed 's, $,,')" while [[ $# -gt 0 ]] @@ -34,6 +37,9 @@ do then format="$2" shift + elif [[ "$1" == '--jsonl' ]] + then + jsonl=1 elif [[ "$1" == '--no-start-marker' ]] then noStartMarker=1 @@ -68,6 +74,16 @@ then exit 1 fi +if [[ "${jsonl}" && "${format}" ]] +then + echo 'Error: --jsonl and --format options are mutually exclusive' >&2 + exit 1 +fi +if [[ -z "${format}" ]] +then + format="${defaultFormat}" +fi + # Validate and process bucket URL if [[ "${bucketUrl}" != http://* && "${bucketUrl}" != https://* ]] then @@ -117,6 +133,7 @@ readarray code <')].decode('utf-8')) # 5 = len(b'') if marker2 is not None and key >= marker2: break + fields = {} url = f'{bucketUrl}{urllib.parse.quote(key)}' + fields['URL'] = url tags = content.split(b'>') assert len(tags) % 2 == 0 assert tags[-1] == b'' assert tags[-2] == b''.join(openTags).decode('utf-8')] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8')) + k = b'>'.join(openTags).decode('utf-8') + assert k not in fields, f'{k!r} encountered twice (previous value: {fields[k]!r})' + fields[k] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8')) openTags.pop() continue assert False - size = int(fields['Size']) if 'Size' in fields else None + if 'Size' in fields: + fields['Size'] = int(fields['Size']) - s = format.format(**fields, key = key, url = url, size = size) + if jsonl: + s = json.dumps(fields) + else: + s = format.format(**fields, key = key, url = url, size = fields.get('Size')) self.logger.info(f'Output: {s!r}') print(s) lastKey = key @@ -244,6 +268,7 @@ rsync -a "$0" s3-bucket-list-qwarc envvars=() envvars+=(S3BL_CMD="${cmd}") envvars+=(S3_FORMAT="${format}") +envvars+=(S3_JSONL="${jsonl}") envvars+=(S3_BUCKET_URL="${bucketUrl}") envvars+=(S3_MARKERS_FILENAME="${prefix}-markers") if [[ "${listUrls}" ]]; then envvars+=(S3_WITH_LIST_URLS="${listUrls}"); fi