#!/bin/bash
# JustAnotherArchivist / little-things
# Default output format; can be overridden with --format.
format='{url}'

# Print the usage text to stderr and terminate the script.
# Globals:   format (read) - shown as the default of --format
# Arguments: $1 - exit status; defaults to 0 when omitted
# Outputs:   usage text on stderr, nothing on stdout
function usage_exit {
	printf '%s\n' \
		's3-bucket-list-qwarc [options] BUCKETURL [MARKER...]' \
		'' \
		'List the contents of an open S3 (or S3-like) bucket, possibly in parallel using multiple markers' \
		'' \
		'Options:' \
		'  --concurrency N    Start qwarc with a concurrency of N; the default is to run all markers at once' \
		"  --format FORMAT    Modify the output format; FORMAT defaults to '${format}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" \
		'  --no-start-marker  Disable the automatic marker to start from the beginning of the list' \
		'  --no-end-marker    Disable the automatic marker to scan to the end of the list' \
		'  --with-list-urls   Enables printing the list URLs retrieved to FD 3' \
		>&2
	# Quoted so the status is never word-split; explicit default instead of relying on the previous command's status.
	exit "${1:-0}"
}

# Option state; an empty value means "not given on the command line".
concurrency=
listUrls=
noStartMarker=
noEndMarker=

# Shell-quoted copy of the complete invocation, captured before any argument is consumed.
cmd="$(printf '%q ' 's3-bucket-list-qwarc' "$@")"
cmd="${cmd% }" # Drop the separator left behind after the last word

# Parse the leading options of the argument list.
# Globals:   concurrency, format, noStartMarker, noEndMarker, listUrls (written)
#            optionArgCount (written) - number of leading arguments consumed as options
# Arguments: the script's command-line arguments
# Exits:     via usage_exit on --help/-h; with status 1 when an option value is missing or invalid
function parse_options {
	local -i argCount=$#
	while [[ $# -gt 0 ]]
	do
		case "$1" in
			--help | -h)
				usage_exit 0
				;;
			--concurrency)
				if [[ $# -lt 2 ]]
				then
					echo 'Error: --concurrency requires a value' >&2
					exit 1
				fi
				# Plain decimal digits only; anything else would otherwise be evaluated as an arithmetic expression.
				if [[ ! "$2" =~ ^[0-9]+$ ]]
				then
					echo "Error: invalid --concurrency value '$2': must be a non-negative integer" >&2
					exit 1
				fi
				concurrency=$(( 10#$2 )) # Force base 10 so that leading zeros are not read as octal
				shift
				;;
			--format)
				if [[ $# -lt 2 ]]
				then
					echo 'Error: --format requires a value' >&2
					exit 1
				fi
				format="$2"
				shift
				;;
			--no-start-marker)
				noStartMarker=1
				;;
			--no-end-marker)
				noEndMarker=1
				;;
			--with-list-urls)
				listUrls='yes'
				;;
			*)
				# First non-option argument: the rest is the bucket URL and the markers
				break
				;;
		esac
		shift
	done
	optionArgCount=$(( argCount - $# ))
}

parse_options "$@"
shift "${optionArgCount}"

# The first remaining argument is the bucket URL; everything after it is a marker.
bucketUrl="$1"
shift

# Without an explicit --concurrency, run all ranges at once:
# the number of marker lines (given markers plus the automatic start/end ones) minus one.
if [[ -z "${concurrency}" ]]
then
	declare -i concurrency=$(( $# - 1 ))
	[[ "${noStartMarker}" ]] || concurrency+=1
	[[ "${noEndMarker}" ]] || concurrency+=1
fi

if [[ -n "${listUrls}" ]]
then
	# Redirecting to a closed descriptor fails, so this no-op probes FD 3 without writing anything to it.
	if ! { command >&3; } 2>/dev/null
	then
		echo 'Error: --with-list-urls requires FD 3 to be open' >&2
		exit 1
	fi
fi

# Validate and process bucket URL
# Only HTTP and HTTPS URLs are accepted.
case "${bucketUrl}" in
	http://* | https://*)
		;;
	*)
		echo 'Invalid bucket URL: must be HTTP or HTTPS' >&2
		exit 1
		;;
esac
# A query string is rejected.
case "${bucketUrl}" in
	*'?'*)
		echo 'Invalid bucket URL: must not have a query' >&2
		exit 1
		;;
esac
# Append a slash unless the URL already has a path and ends in one.
if ! [[ "${bucketUrl}" == *://*/* && "${bucketUrl}" == */ ]]
then
	bucketUrl+='/'
fi

# Construct prefix for files and output: the URL without its protocol and
# trailing slashes, with the remaining slashes turned into underscores.
prefix="${bucketUrl#*://}"
until [[ "${prefix}" != */ ]]
do
	prefix="${prefix%/}"
done
prefix="${prefix//\//_}"

# Ensure no collisions: with nullglob set (in the subshell only), the glob
# expands to nothing unless some file name starts with the prefix.
if [[ -n "$(shopt -s nullglob; echo "${prefix}"*)" ]]
then
	echo "Error: there already exist files with the prefix '${prefix}' in this directory." >&2
	exit 1
fi

# Write the qwarc spec file
# Indentation... Inspired by https://stackoverflow.com/a/33817423
# The delimiter is quoted so that the shell copies the Python source verbatim
# (no parameter expansion and no backslash processing inside the here-document).
readarray code <<'EOF'
	#!/usr/bin/env python3
	import html
	import json
	import logging
	import os
	import qwarc
	import qwarc.utils
	import shlex
	import urllib.parse
	import yarl


	# Configuration is passed in through the environment by the wrapper script.
	format = os.environ['S3_FORMAT']
	bucketUrl = yarl.URL(os.environ['S3_BUCKET_URL'])
	withListUrls = bool(os.environ.get('S3_WITH_LIST_URLS')) # True if present and non-empty
	markersFilename = os.environ['S3_MARKERS_FILENAME']
	if withListUrls:
		try:
			listUrlsFD = os.fdopen(3, 'w')
		except OSError:
			logging.critical('FD 3 is not open')
			raise


	class S3ListBucket(qwarc.Item):
		itemType = 's3listbucket'
		# itemValue = ('marker1', 'marker2') encoded as JSON

		@classmethod
		def generate(cls):
			# Serialise every marker pair as compact JSON
			yield from map(lambda x: json.dumps(x, separators = (',', ':')), cls._generate())

		@classmethod
		def _generate(cls):
			# Pair up consecutive lines of the markers file; an empty line becomes None.
			with open(markersFilename, 'r') as fp:
				it = iter(fp)
				lastLine = next(it).strip() or None
				for line in it:
					line = line.strip() or None
					yield (lastLine, line)
					lastLine = line

		async def process(self):
			marker1, marker2 = json.loads(self.itemValue)
			marker = marker1
			attempt = 1 # Attempt counter for the current page; must exist before the first internal-error check
			while True:
				url = bucketUrl.with_query({'marker': marker} if marker is not None else {})
				if withListUrls:
					self.logger.info(f'List URL: {str(url)!r}')
					print(f'{url}', file = listUrlsFD)
				response = await self.fetch(url)
				if response.status != 200:
					self.logger.error(f'Could not fetch page on marker {marker!r}')
					break
				body = await response.read()
				# Isn't this a 503?
				if b'<Error><Code>InternalError</Code><Message>We encountered an internal error. Please try again.</Message>' in body:
					self.logger.error(f'Got internal error on {url} on attempt {attempt}; {"retrying" if attempt < 10 else "aborting"}')
					if attempt >= 10:
						if marker is not None:
							self.logger.error(f'To retry, use marker {marker!r}')
						break
					attempt += 1
					continue
				if not body.startswith(b'<?xml version="1.0" encoding="UTF-8"?>\n<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">') and \
				   not body.startswith(b"<?xml version='1.0' encoding='UTF-8'?><ListBucketResult xmlns='http://doc.s3.amazonaws.com/2006-03-01'>"):
					self.logger.error(f'Invalid body on marker {marker!r}: {body[:200]}...')
					break

				if b'<Marker></Marker>' in body[:200] and marker is not None:
					self.logger.error('Marker loop (empty marker in response despite providing one)')
					break

				# No risk, no fun!
				contents = body.split(b'<Contents>')
				assert all(content.startswith(b'<Key>') for content in contents[1:])
				assert all(content.endswith(b'</Contents>') for content in contents[1:-1])
				assert contents[-1].endswith(b'</Contents></ListBucketResult>')
				contents[-1] = contents[-1][:-len('</ListBucketResult>')]
				for content in contents[1:]:
					key = html.unescape(content[5 : content.index(b'</Key>')].decode('utf-8')) # 5 = len(b'<Key>')
					# NOTE(review): if the server treats the marker as exclusive, a key equal to marker2 is listed by neither neighbouring range - confirm
					if marker2 is not None and key >= marker2:
						break
					url = f'{bucketUrl}{urllib.parse.quote(key)}'

					# Flatten the entry into a dict mapping 'Tag' (or 'Outer>Inner' for nested tags) to the unescaped text
					tags = content.split(b'>')
					assert len(tags) % 2 == 0
					assert tags[-1] == b''
					assert tags[-2] == b'</Contents'
					openTags = [] # Current open tag hierarchy
					fields = {}
					for tag in tags[:-2]:
						if tag.startswith(b'<'):
							openTags.append(tag[1:])
							continue
						assert openTags
						if tag.endswith(b'</' + openTags[-1]):
							fields[b'>'.join(openTags).decode('utf-8')] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
							openTags.pop()
							continue
						assert False

					size = int(fields['Size']) if 'Size' in fields else None

					s = format.format(**fields, key = key, url = url, size = size)
					self.logger.info(f'Output: {s!r}')
					print(s)
				lastKey = key
				if marker2 is not None and lastKey >= marker2:
					break

				truncated = True if b'<IsTruncated>true</IsTruncated>' in body else (False if b'<IsTruncated>false</IsTruncated>' in body else None)
				assert truncated in (True, False)

				if not truncated:
					break
				if marker is not None and marker == lastKey:
					self.logger.error('Marker loop (same last key as previous marker)')
					break
				# Continue with the next page and give it a fresh set of attempts
				marker = lastKey
				attempt = 1


	specDependencies = qwarc.utils.SpecDependencies(
		files = [os.environ['S3_MARKERS_FILENAME']],
		extra = {'env': {k: os.environ.get(k) for k in ('S3BL_CMD', 'S3_FORMAT', 'S3_BUCKET_URL', 'S3_MARKERS_FILENAME', 'S3_WITH_LIST_URLS')}}
	)
EOF
printf '%s' "${code[@]#	}" >"${prefix}.py"  # That's a tab character after the hash.

# Generate the markers file: one marker per line, where an empty line stands for "no marker"
# (the spec file turns empty lines into None).
# printf emits its format once even when it gets no arguments, so it is skipped when no markers
# were given; otherwise a spurious empty line would create a second, identical unbounded range.
{
	if [[ -z "${noStartMarker}" ]]; then echo; fi
	if [[ $# -gt 0 ]]; then printf '%s\n' "$@"; fi
	if [[ -z "${noEndMarker}" ]]; then echo; fi
} >"${prefix}-markers"

# Collect environment variables read by the spec file
envvars=(
	S3BL_CMD="${cmd}"
	S3_FORMAT="${format}"
	S3_BUCKET_URL="${bucketUrl}"
	S3_MARKERS_FILENAME="${prefix}-markers"
)
# Only set when requested: the spec file treats a missing or empty value as "off"
if [[ "${listUrls}" ]]; then envvars+=(S3_WITH_LIST_URLS="${listUrls}"); fi

# Lift-off!
env "${envvars[@]}" qwarc --concurrency "${concurrency}" --database "${prefix}.db" --log "${prefix}.log" --warc "${prefix}" "${prefix}.py"