Browse Source

Add JSONL output option for S3 listing

master
JustAnotherArchivist 2 years ago
parent
commit
22f2e68356
2 changed files with 53 additions and 12 deletions
  1. s3-bucket-list (+22, -6)
  2. s3-bucket-list-qwarc (+31, -6)

s3-bucket-list (+22, -6) View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import html
import http.client
import json
import os
import shlex
import ssl
@@ -13,7 +14,9 @@ i = 1
withListUrls = False
listUrlsFD = None
startMarker = None
format = '{url}'
format = None
defaultFormat = '{url}'
jsonl = False
args = []
while i < len(sys.argv):
arg = sys.argv[i]
@@ -21,7 +24,8 @@ while i < len(sys.argv):
print('s3-bucket-list [options] BUCKETURL', file = sys.stderr)
print('', file = sys.stderr)
print('Options:', file = sys.stderr)
print(f' --format FORMAT Modify the output format; FORMAT defaults to {format!r}; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)', file = sys.stderr)
print(f' --format FORMAT Modify the output format; FORMAT defaults to {defaultFormat!r}; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)', file = sys.stderr)
print( ' --jsonl Enable JSONL output format; cannot be used if --format is present', file = sys.stderr)
print( ' --marker KEY Start after a particular key instead of from the beginning', file = sys.stderr)
print( ' --with-list-urls Enables printing the list URLs retrieved to FD 3', file = sys.stderr)
sys.exit(1)
@@ -38,9 +42,14 @@ while i < len(sys.argv):
elif arg == '--format':
format = sys.argv[i + 1]
i += 1
elif arg == '--jsonl':
jsonl = True
else:
args.append(arg)
i += 1
assert not jsonl or format is None, '--jsonl and --format options are mutually exclusive'
if format is None:
format = defaultFormat
assert len(args) == 1, 'Need one argument: bucket URL'
baseUrl = args[0]
assert baseUrl.startswith('http://') or baseUrl.startswith('https://'), 'Argument does not look like an HTTP URL'
@@ -84,29 +93,36 @@ while True:
contents[-1] = contents[-1][:-len('</ListBucketResult>')]
for content in contents[1:]:
key = html.unescape(content[5 : content.index(b'</Key>')].decode('utf-8')) # 5 = len(b'<Key>')
fields = {}
url = f'{baseUrl}{urllib.parse.quote(key)}'
fields['URL'] = url

tags = content.split(b'>')
assert len(tags) % 2 == 0
assert tags[-1] == b''
assert tags[-2] == b'</Contents'
openTags = [] # Current open tag hierarchy
fields = {}
for tag in tags[:-2]:
if tag.startswith(b'<'):
openTags.append(tag[1:])
continue
assert openTags
if tag.endswith(b'</' + openTags[-1]):
fields[b'>'.join(openTags).decode('utf-8')] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
k = b'>'.join(openTags).decode('utf-8')
assert k not in fields, f'{k!r} encountered twice (previous value: {fields[k]!r})'
fields[k] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
openTags.pop()
continue
assert False

size = int(fields['Size']) if 'Size' in fields else None
if 'Size' in fields:
fields['Size'] = int(fields['Size'])

try:
print(format.format(**fields, key = key, url = url, size = size))
if jsonl:
print(json.dumps(fields))
else:
print(format.format(**fields, key = key, url = url, size = fields.get('Size')))
except BrokenPipeError:
sys.exit(0)
lastKey = key


s3-bucket-list-qwarc (+31, -6) View File

@@ -1,5 +1,5 @@
#!/bin/bash
format='{url}'
defaultFormat='{url}'

function usage_exit {
echo 's3-bucket-list-qwarc [options] BUCKETURL [MARKER...]' >&2
@@ -8,7 +8,8 @@ function usage_exit {
echo >&2
echo 'Options:' >&2
echo ' --concurrency N Start qwarc with a concurrency of N; the default is to run all markers at once' >&2
echo " --format FORMAT Modify the output format; FORMAT defaults to '${format}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2
echo " --format FORMAT Modify the output format; FORMAT defaults to '${defaultFormat}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2
echo ' --jsonl Enable JSONL output; cannot be used with --format' >&2
echo ' --no-start-marker Disable the automatic marker to start from the beginning of the list' >&2
echo ' --no-end-marker Disable the automatic marker to scan to the end of the list' >&2
echo ' --with-list-urls Enables printing the list URLs retrieved to FD 3' >&2
@@ -19,6 +20,8 @@ concurrency=
listUrls=
noStartMarker=
noEndMarker=
format=
jsonl=

cmd="$(printf '%q ' 's3-bucket-list-qwarc' "$@" | sed 's, $,,')"
while [[ $# -gt 0 ]]
@@ -34,6 +37,9 @@ do
then
format="$2"
shift
elif [[ "$1" == '--jsonl' ]]
then
jsonl=1
elif [[ "$1" == '--no-start-marker' ]]
then
noStartMarker=1
@@ -68,6 +74,16 @@ then
exit 1
fi

if [[ "${jsonl}" && "${format}" ]]
then
echo 'Error: --jsonl and --format options are mutually exclusive' >&2
exit 1
fi
if [[ -z "${format}" ]]
then
format="${defaultFormat}"
fi

# Validate and process bucket URL
if [[ "${bucketUrl}" != http://* && "${bucketUrl}" != https://* ]]
then
@@ -117,6 +133,7 @@ readarray code <<EOF


format = os.environ['S3_FORMAT']
jsonl = os.environ['S3_JSONL'] == '1'
bucketUrl = yarl.URL(os.environ['S3_BUCKET_URL'])
withListUrls = bool(os.environ.get('S3_WITH_LIST_URLS')) # True if present and non-empty
markersFilename = os.environ['S3_MARKERS_FILENAME']
@@ -187,28 +204,35 @@ readarray code <<EOF
key = html.unescape(content[5 : content.index(b'</Key>')].decode('utf-8')) # 5 = len(b'<Key>')
if marker2 is not None and key >= marker2:
break
fields = {}
url = f'{bucketUrl}{urllib.parse.quote(key)}'
fields['URL'] = url

tags = content.split(b'>')
assert len(tags) % 2 == 0
assert tags[-1] == b''
assert tags[-2] == b'</Contents'
openTags = [] # Current open tag hierarchy
fields = {}
for tag in tags[:-2]:
if tag.startswith(b'<'):
openTags.append(tag[1:])
continue
assert openTags
if tag.endswith(b'</' + openTags[-1]):
fields[b'>'.join(openTags).decode('utf-8')] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
k = b'>'.join(openTags).decode('utf-8')
assert k not in fields, f'{k!r} encountered twice (previous value: {fields[k]!r})'
fields[k] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
openTags.pop()
continue
assert False

size = int(fields['Size']) if 'Size' in fields else None
if 'Size' in fields:
fields['Size'] = int(fields['Size'])

s = format.format(**fields, key = key, url = url, size = size)
if jsonl:
s = json.dumps(fields)
else:
s = format.format(**fields, key = key, url = url, size = fields.get('Size'))
self.logger.info(f'Output: {s!r}')
print(s)
lastKey = key
@@ -244,6 +268,7 @@ rsync -a "$0" s3-bucket-list-qwarc
envvars=()
envvars+=(S3BL_CMD="${cmd}")
envvars+=(S3_FORMAT="${format}")
envvars+=(S3_JSONL="${jsonl}")
envvars+=(S3_BUCKET_URL="${bucketUrl}")
envvars+=(S3_MARKERS_FILENAME="${prefix}-markers")
if [[ "${listUrls}" ]]; then envvars+=(S3_WITH_LIST_URLS="${listUrls}"); fi


Loading…
Cancel
Save