The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

253 lines
7.9 KiB

  1. #!/bin/bash
  2. format='{url}'
  3. function usage_exit {
  4. echo 's3-bucket-list-qwarc [options] BUCKETURL [MARKER...]' >&2
  5. echo >&2
  6. echo 'List the contents of an open S3 (or S3-like) bucket, possibly in parallel using multiple markers' >&2
  7. echo >&2
  8. echo 'Options:' >&2
  9. echo ' --concurrency N Start qwarc with a concurrency of N; the default is to run all markers at once' >&2
  10. echo " --format FORMAT Modify the output format; FORMAT defaults to '${format}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2
  11. echo ' --no-start-marker Disable the automatic marker to start from the beginning of the list' >&2
  12. echo ' --no-end-marker Disable the automatic marker to scan to the end of the list' >&2
  13. echo ' --with-list-urls Enables printing the list URLs retrieved to FD 3' >&2
  14. exit $1
  15. }
  16. concurrency=
  17. listUrls=
  18. noStartMarker=
  19. noEndMarker=
  20. cmd="$(printf '%q ' 's3-bucket-list-qwarc' "$@" | sed 's, $,,')"
  21. while [[ $# -gt 0 ]]
  22. do
  23. if [[ "$1" == '--help' || "$1" == '-h' ]]
  24. then
  25. usage_exit 0
  26. elif [[ "$1" == '--concurrency' ]]
  27. then
  28. declare -i concurrency="$2"
  29. shift
  30. elif [[ "$1" == '--format' ]]
  31. then
  32. format="$2"
  33. shift
  34. elif [[ "$1" == '--no-start-marker' ]]
  35. then
  36. noStartMarker=1
  37. elif [[ "$1" == '--no-end-marker' ]]
  38. then
  39. noEndMarker=1
  40. elif [[ "$1" == '--with-list-urls' ]]
  41. then
  42. listUrls='yes'
  43. else
  44. break
  45. fi
  46. shift
  47. done
  48. bucketUrl="$1"
  49. shift
  50. # Remaining arguments are markers
  51. if [[ -z "${concurrency}" ]]
  52. then
  53. declare -i concurrency=$#
  54. if [[ -z "${noStartMarker}" ]]; then concurrency+=1; fi
  55. if [[ -z "${noEndMarker}" ]]; then concurrency+=1; fi
  56. concurrency+=-1 # Because the obvious -= doesn't work...
  57. fi
  58. if [[ "${listUrls}" ]] && ! { command >&3; } 2>/dev/null
  59. then
  60. echo 'Error: --with-list-urls requires FD 3 to be open' >&2
  61. exit 1
  62. fi
  63. # Validate and process bucket URL
  64. if [[ "${bucketUrl}" != http://* && "${bucketUrl}" != https://* ]]
  65. then
  66. echo 'Invalid bucket URL: must be HTTP or HTTPS' >&2
  67. exit 1
  68. fi
  69. if [[ "${bucketUrl}" == *'?'* ]]
  70. then
  71. echo 'Invalid bucket URL: must not have a query' >&2
  72. exit 1
  73. fi
  74. if [[ "${bucketUrl}" != *://*/* || "${bucketUrl}" != */ ]]
  75. then
  76. bucketUrl="${bucketUrl}/"
  77. fi
  78. # Construct prefix for files and output
  79. prefix="${bucketUrl#*://}" # Remove protocol
  80. while [[ "${prefix}" == */ ]]; do prefix="${prefix%/}"; done # Remove trailing slashes
  81. prefix="${prefix//\//_}" # Replace slashes with underscores
  82. # Ensure no collisions
  83. if [[ -e s3-bucket-list-qwarc ]]
  84. then
  85. echo 'Error: s3-bucket-list-qwarc exists in this directory.' >&2
  86. exit 1
  87. fi
  88. if [[ "$(shopt -s nullglob; echo "${prefix}"*)" ]]
  89. then
  90. echo "Error: there already exist files with the prefix '${prefix}' in this directory." >&2
  91. exit 1
  92. fi
  93. # Write the qwarc spec file
  94. # Indentation... Inspired by https://stackoverflow.com/a/33817423
  95. readarray code <<EOF
  96. #!/usr/bin/env python3
  97. import html
  98. import json
  99. import logging
  100. import os
  101. import qwarc
  102. import qwarc.utils
  103. import shlex
  104. import urllib.parse
  105. import yarl
  106. format = os.environ['S3_FORMAT']
  107. bucketUrl = yarl.URL(os.environ['S3_BUCKET_URL'])
  108. withListUrls = bool(os.environ.get('S3_WITH_LIST_URLS')) # True if present and non-empty
  109. markersFilename = os.environ['S3_MARKERS_FILENAME']
  110. if withListUrls:
  111. try:
  112. listUrlsFD = os.fdopen(3, 'w')
  113. except OSError:
  114. logging.critical('FD 3 is not open')
  115. raise
  116. class S3ListBucket(qwarc.Item):
  117. itemType = 's3listbucket'
  118. # itemValue = ('marker1', 'marker2') encoded as JSON
  119. @classmethod
  120. def generate(cls):
  121. yield from map(lambda x: json.dumps(x, separators = (',', ':')), cls._generate())
  122. @classmethod
  123. def _generate(cls):
  124. with open(markersFilename, 'r') as fp:
  125. it = iter(fp)
  126. lastLine = next(it).strip() or None
  127. for line in it:
  128. line = line.strip() or None
  129. yield (lastLine, line)
  130. lastLine = line
  131. async def process(self):
  132. marker1, marker2 = json.loads(self.itemValue)
  133. marker = marker1
  134. while True:
  135. url = bucketUrl.with_query({'marker': marker} if marker is not None else {})
  136. if withListUrls:
  137. self.logger.info(f'List URL: {str(url)!r}')
  138. print(f'{url}', file = listUrlsFD)
  139. response = await self.fetch(url)
  140. if response.status != 200:
  141. self.logger.error(f'Could not fetch page on marker {marker!r}')
  142. break
  143. body = await response.read()
  144. # Isn't this a 503?
  145. if b'<Error><Code>InternalError</Code><Message>We encountered an internal error. Please try again.</Message>' in body:
  146. self.logger.error(f'Got internal error on {url} on attempt {attempt}; {"retrying" if attempt < 10 else "aborting"}')
  147. if attempt >= 10:
  148. if 'marker' in params:
  149. self.logger.error(f'To retry, use marker {marker!r}')
  150. break
  151. attempt += 1
  152. continue
  153. if not body.startswith(b'<?xml version="1.0" encoding="UTF-8"?>\n<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">') and \
  154. not body.startswith(b"<?xml version='1.0' encoding='UTF-8'?><ListBucketResult xmlns='http://doc.s3.amazonaws.com/2006-03-01'>"):
  155. self.logger.error(f'Invalid body on marker {marker!r}: {body[:200]}...')
  156. break
  157. if b'<Marker></Marker>' in body[:200] and marker is not None:
  158. self.logger.error('Marker loop (empty marker in response despite providing one)')
  159. break
  160. # No risk, no fun!
  161. contents = body.split(b'<Contents>')
  162. assert all(content.startswith(b'<Key>') for content in contents[1:])
  163. assert all(content.endswith(b'</Contents>') for content in contents[1:-1])
  164. assert contents[-1].endswith(b'</Contents></ListBucketResult>')
  165. contents[-1] = contents[-1][:-len('</ListBucketResult>')]
  166. for content in contents[1:]:
  167. key = html.unescape(content[5 : content.index(b'</Key>')].decode('utf-8')) # 5 = len(b'<Key>')
  168. if marker2 is not None and key >= marker2:
  169. break
  170. url = f'{bucketUrl}{urllib.parse.quote(key)}'
  171. tags = content.split(b'>')
  172. assert len(tags) % 2 == 0
  173. assert tags[-1] == b''
  174. assert tags[-2] == b'</Contents'
  175. openTags = [] # Current open tag hierarchy
  176. fields = {}
  177. for tag in tags[:-2]:
  178. if tag.startswith(b'<'):
  179. openTags.append(tag[1:])
  180. continue
  181. assert openTags
  182. if tag.endswith(b'</' + openTags[-1]):
  183. fields[b'>'.join(openTags).decode('utf-8')] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
  184. openTags.pop()
  185. continue
  186. assert False
  187. size = int(fields['Size']) if 'Size' in fields else None
  188. s = format.format(**fields, key = key, url = url, size = size)
  189. self.logger.info(f'Output: {s!r}')
  190. print(s)
  191. lastKey = key
  192. if marker2 is not None and lastKey >= marker2:
  193. break
  194. truncated = True if b'<IsTruncated>true</IsTruncated>' in body else (False if b'<IsTruncated>false</IsTruncated>' in body else None)
  195. assert truncated in (True, False)
  196. if not truncated:
  197. break
  198. if marker is not None and marker == lastKey:
  199. self.logger.error('Marker loop (same last key as previous marker)')
  200. break
  201. marker = lastKey
  202. attempt = 1
  203. specDependencies = qwarc.utils.SpecDependencies(
  204. files = ['s3-bucket-list-qwarc', os.environ['S3_MARKERS_FILENAME']],
  205. extra = {'env': {k: os.environ.get(k) for k in ('S3BL_CMD', 'S3_FORMAT', 'S3_BUCKET_URL', 'S3_MARKERS_FILENAME', 'S3_WITH_LIST_URLS')}}
  206. )
  207. EOF
  208. printf '%s' "${code[@]# }" >"${prefix}.py" # That's a tab character after the hash.
  209. # Generate the markers file
  210. { if [[ -z "${noStartMarker}" ]]; then echo; fi; printf '%s\n' "$@"; if [[ -z "${noEndMarker}" ]]; then echo; fi; } >"${prefix}-markers"
  211. # Copy this script
  212. rsync -a "$0" s3-bucket-list-qwarc
  213. # Collect environment variables
  214. envvars=()
  215. envvars+=(S3BL_CMD="${cmd}")
  216. envvars+=(S3_FORMAT="${format}")
  217. envvars+=(S3_BUCKET_URL="${bucketUrl}")
  218. envvars+=(S3_MARKERS_FILENAME="${prefix}-markers")
  219. if [[ "${listUrls}" ]]; then envvars+=(S3_WITH_LIST_URLS="${listUrls}"); fi
  220. # Lift-off!
  221. env "${envvars[@]}" qwarc --concurrency "${concurrency}" --database "${prefix}.db" --log "${prefix}.log" --warc "${prefix}" "${prefix}.py"