print(f' --format FORMAT Modify the output format; FORMAT defaults to {format!r}; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)', file = sys.stderr)
print(f' --format FORMAT Modify the output format; FORMAT defaults to {defaultFormat!r}; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)', file = sys.stderr)
print( ' --jsonl Enable JSONL output format; cannot be used if --format is present', file = sys.stderr)
print( ' --marker KEY Start after a particular key instead of from the beginning', file = sys.stderr)
print( ' --with-list-urls Enables printing the list URLs retrieved to FD 3', file = sys.stderr)
sys.exit(1)
@@ -38,9 +42,14 @@ while i < len(sys.argv):
elif arg == '--format':
format = sys.argv[i + 1]
i += 1
elif arg == '--jsonl':
jsonl = True
else:
args.append(arg)
i += 1
# --jsonl and --format cannot be combined. This relies on format still being None
# unless --format was given (assumes format is initialised to None before option
# parsing -- not visible in this excerpt; TODO confirm).
# NOTE(review): these checks use assert, which Python strips when run with -O.
assert not jsonl or format is None, '--jsonl and --format options are mutually exclusive'
# No explicit --format: fall back to the default format string. This must come after
# the check above, otherwise the default would look like an explicit --format.
if format is None:
format = defaultFormat
# Exactly one positional argument is accepted: the bucket URL.
assert len(args) == 1, 'Need one argument: bucket URL'
baseUrl = args[0]
# Only the scheme prefix is checked here; the rest of the URL is not validated.
assert baseUrl.startswith('http://') or baseUrl.startswith('https://'), 'Argument does not look like an HTTP URL'
echo ' --concurrency N Start qwarc with a concurrency of N; the default is to run all markers at once' >&2
echo " --format FORMAT Modify the output format; FORMAT defaults to '${format}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2
echo " --format FORMAT Modify the output format; FORMAT defaults to '${defaultFormat}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2
echo ' --jsonl Enable JSONL output; cannot be used with --format' >&2
echo ' --no-start-marker Disable the automatic marker to start from the beginning of the list' >&2
echo ' --no-end-marker Disable the automatic marker to scan to the end of the list' >&2
echo ' --with-list-urls Enables printing the list URLs retrieved to FD 3' >&2
@@ -19,6 +20,8 @@ concurrency=
# Option state, initially empty. format, jsonl and noStartMarker are set by the
# option-parsing loop below; the others are presumably set there as well (not all
# branches are visible in this excerpt).
listUrls=
noStartMarker=
noEndMarker=
format=
jsonl=
# Snapshot of the invocation as one shell-quoted string: printf applies '%q ' to the
# script name and each argument (quoted word plus a space), and sed strips the final
# trailing space. It is taken here, before the loop below shifts "$@" away.
# NOTE(review): where cmd is consumed is not visible in this excerpt.
cmd="$(printf '%q ' 's3-bucket-list-qwarc' "$@" | sed 's, $,,')"
while [[ $# -gt 0 ]]
@@ -34,6 +37,9 @@ do
then
format="$2"
shift
elif [[ "$1" == '--jsonl' ]]
then
jsonl=1
elif [[ "$1" == '--no-start-marker' ]]
then
noStartMarker=1
@@ -68,6 +74,16 @@ then
exit 1
fi
# --jsonl and --format cannot be combined. Inside [[ ]], a bare string is true when it
# is non-empty, so this fires only when both options were given with non-empty values.
if [[ "${jsonl}" && "${format}" ]]
then
echo 'Error: --jsonl and --format options are mutually exclusive' >&2
exit 1
fi
# No --format given (or given as an empty string): fall back to the default format.
# This runs after the check above so that a defaulted format is never mistaken for an
# explicit --format.
if [[ -z "${format}" ]]
then
format="${defaultFormat}"
fi
# Validate and process bucket URL
if [[ "${bucketUrl}" != http://* && "${bucketUrl}" != https://* ]]
then
@@ -117,6 +133,7 @@ readarray code <<EOF
format = os.environ['S3_FORMAT']
jsonl = os.environ['S3_JSONL'] == '1'
bucketUrl = yarl.URL(os.environ['S3_BUCKET_URL'])
withListUrls = bool(os.environ.get('S3_WITH_LIST_URLS')) # True if present and non-empty