From 398cbfdcda21f349d9d7413fe3060c7abaf475bc Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Sat, 26 Sep 2020 17:41:19 +0000
Subject: [PATCH] Add s3-bucket-list-qwarc, rewritten s3-bucket-list on top of qwarc
---
 s3-bucket-list-qwarc | 241 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 241 insertions(+)
 create mode 100755 s3-bucket-list-qwarc

diff --git a/s3-bucket-list-qwarc b/s3-bucket-list-qwarc
new file mode 100755
index 0000000..dcfae80
--- /dev/null
+++ b/s3-bucket-list-qwarc
@@ -0,0 +1,241 @@
+#!/bin/bash
+format='{url}' # Default output format; replaced by the value of --format
+# usage_exit STATUS: print the help text to stderr, then exit with STATUS.
+function usage_exit {
+	echo 's3-bucket-list-qwarc [options] BUCKETURL [MARKER...]' >&2
+	echo >&2
+	echo 'List the contents of an open S3 (or S3-like) bucket, possibly in parallel using multiple markers' >&2
+	echo >&2
+	echo 'Options:' >&2
+	echo ' --concurrency N Start qwarc with a concurrency of N; the default is to run all markers at once' >&2
+	echo " --format FORMAT Modify the output format; FORMAT defaults to '${format}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2
+	echo ' --no-start-marker Disable the automatic marker to start from the beginning of the list' >&2
+	echo ' --no-end-marker Disable the automatic marker to scan to the end of the list' >&2
+	echo ' --with-list-urls Enables printing the list URLs retrieved to FD 3' >&2
+	exit $1
+}
+# Option state; an empty value means the option was not given.
+concurrency=
+listUrls=
+noStartMarker=
+noEndMarker=
+# Keep a shell-quoted copy of the full command line (trailing space removed) before the arguments are shifted away.
+cmd="$(printf '%q ' 's3-bucket-list-qwarc' "$@" | sed 's, $,,')"
+while [[ $# -gt 0 ]] # Parse leading options; stop at the first argument that is not a known option
+do
+	if [[ "$1" == '--help' || "$1" == '-h' ]]
+	then
+		usage_exit 0
+	elif [[ "$1" == '--concurrency' ]]
+	then
+		declare -i concurrency="$2"
+		shift # Also consume the option's value
+	elif [[ "$1" == '--format' ]]
+	then
+		format="$2"
+		shift # Also consume the option's value
+	elif [[ "$1" == '--no-start-marker' ]]
+	then
+		noStartMarker=1
+	elif [[ "$1" == '--no-end-marker' ]]
+	then
+		noEndMarker=1
+	elif [[ "$1" == '--with-list-urls' ]]
+	then
+		listUrls='yes'
+	else
+		break
+	fi
+	shift
+done
+# The first remaining argument is the bucket URL.
+bucketUrl="$1"
+shift
+
+# Remaining arguments are markers
+# Default concurrency: number of given markers, plus one per automatic marker still enabled, minus one (presumably one job per pair of adjacent markers; confirm against the spec file).
+if [[ -z "${concurrency}" ]]
+then
+	declare -i concurrency=$#
+	if [[ -z "${noStartMarker}" ]]; then concurrency+=1; fi
+	if [[ -z "${noEndMarker}" ]]; then concurrency+=1; fi
+	concurrency+=-1 # Because the obvious -= doesn't work...
+fi
+# Probe FD 3 by attempting a redirection to it; error output from the probe is discarded.
+if [[ "${listUrls}" ]] && ! { command >&3; } 2>/dev/null
+then
+	echo 'Error: --with-list-urls requires FD 3 to be open' >&2
+	exit 1
+fi
+
+# Validate and process bucket URL
+if [[ "${bucketUrl}" != http://* && "${bucketUrl}" != https://* ]]
+then
+	echo 'Invalid bucket URL: must be HTTP or HTTPS' >&2
+	exit 1
+fi
+if [[ "${bucketUrl}" == *'?'* ]]
+then
+	echo 'Invalid bucket URL: must not have a query' >&2
+	exit 1
+fi
+if [[ "${bucketUrl}" != *://*/* || "${bucketUrl}" != */ ]] # No slash after the host part, or no trailing slash: append one
+then
+	bucketUrl="${bucketUrl}/"
+fi
+
+# Construct prefix for files and output
+prefix="${bucketUrl#*://}" # Remove protocol
+while [[ "${prefix}" == */ ]]; do prefix="${prefix%/}"; done # Remove trailing slashes
+prefix="${prefix//\//_}" # Replace slashes with underscores
+
+# Ensure no collisions; nullglob (enabled only inside the command substitution) makes the glob expand to nothing when no file matches
+if [[ "$(shopt -s nullglob; echo "${prefix}"*)" ]]
+then
+	echo "Error: there already exist files with the prefix '${prefix}' in this directory." >&2
+	exit 1
+fi
+
+# Write the qwarc spec file
+# Indentation... Inspired by https://stackoverflow.com/a/33817423 NOTE(review): in this copy the start of the here-document (redirection operator and the beginning of the Python code) and the XML tags inside the byte-string literals are missing, and the Python indentation below is reconstructed from syntax; restore from the original commit before applying.
+readarray code <InternalErrorWe encountered an internal error. Please try again.' in body:
+					self.logger.error(f'Got internal error on {url} on attempt {attempt}; {"retrying" if attempt < 10 else "aborting"}')
+					if attempt >= 10:
+						if 'marker' in params:
+							self.logger.error(f'To retry, use marker {marker!r}')
+						break
+					attempt += 1
+					continue
+				if not body.startswith(b'\n') and \
+				   not body.startswith(b""):
+					self.logger.error(f'Invalid body on marker {marker!r}: {body[:200]}...')
+					break
+
+				if b'' in body[:200] and marker is not None:
+					self.logger.error('Marker loop (empty marker in response despite providing one)')
+					break
+
+				# No risk, no fun!
+				contents = body.split(b'')
+				assert all(content.startswith(b'') for content in contents[1:])
+				assert all(content.endswith(b'') for content in contents[1:-1])
+				assert contents[-1].endswith(b'')
+				contents[-1] = contents[-1][:-len('')]
+				for content in contents[1:]:
+					key = html.unescape(content[5 : content.index(b'')].decode('utf-8')) # 5 = len(b'')
+					if marker2 is not None and key >= marker2:
+						break
+					url = f'{bucketUrl}{urllib.parse.quote(key)}'
+
+					tags = content.split(b'>')
+					assert len(tags) % 2 == 0
+					assert tags[-1] == b''
+					assert tags[-2] == b''.join(openTags).decode('utf-8')] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
+							openTags.pop()
+							continue
+						assert False
+
+					size = int(fields['Size']) if 'Size' in fields else None
+
+					s = format.format(**fields, key = key, url = url, size = size)
+					self.logger.info(f'Output: {s!r}')
+					print(s)
+					lastKey = key
+				if marker2 is not None and lastKey >= marker2:
+					break
+
+				truncated = True if b'true' in body else (False if b'false' in body else None)
+				assert truncated in (True, False)
+
+				if not truncated:
+					break
+				if marker is not None and marker == lastKey:
+					self.logger.error('Marker loop (same last key as previous marker)')
+					break
+				marker = lastKey
+				attempt = 1
+
+
+	specDependencies = qwarc.utils.SpecDependencies(
+		files = [os.environ['S3_MARKERS_FILENAME']],
+		extra = {'env': {k: os.environ.get(k) for k in ('S3BL_CMD', 'S3_FORMAT', 'S3_BUCKET_URL', 'S3_MARKERS_FILENAME', 'S3_WITH_LIST_URLS')}}
+	)
+EOF
+printf '%s' "${code[@]# }" >"${prefix}.py" # That's a tab character after the hash. It strips one leading tab from every line read into 'code'. NOTE(review): the character shows up as a space in this copy; confirm the tab is intact.
+
+# Generate the markers file: one marker per line; an empty first/last line is written unless --no-start-marker/--no-end-marker was given
+{ if [[ -z "${noStartMarker}" ]]; then echo; fi; printf '%s\n' "$@"; if [[ -z "${noEndMarker}" ]]; then echo; fi; } >"${prefix}-markers"
+
+# Lift-off! Settings are handed to the qwarc process through environment variables; all output files are named after the prefix.
+S3BL_CMD="${cmd}" \
+	S3_FORMAT="${format}" \
+	S3_BUCKET_URL="${bucketUrl}" \
+	S3_MARKERS_FILENAME="${prefix}-markers" \
+	S3_WITH_LIST_URLS="${listUrls}" \
+	qwarc --concurrency "${concurrency}" --database "${prefix}.db" --log "${prefix}.log" --warc "${prefix}" "${prefix}.py"