The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

245 lines
7.7 KiB

  1. #!/bin/bash
  2. format='{url}'
  3. function usage_exit {
  4. echo 's3-bucket-list-qwarc [options] BUCKETURL [MARKER...]' >&2
  5. echo >&2
  6. echo 'List the contents of an open S3 (or S3-like) bucket, possibly in parallel using multiple markers' >&2
  7. echo >&2
  8. echo 'Options:' >&2
  9. echo ' --concurrency N Start qwarc with a concurrency of N; the default is to run all markers at once' >&2
  10. echo " --format FORMAT Modify the output format; FORMAT defaults to '${format}'; available fields: url, key, size, and all fields returned by S3 (e.g. LastModified)" >&2
  11. echo ' --no-start-marker Disable the automatic marker to start from the beginning of the list' >&2
  12. echo ' --no-end-marker Disable the automatic marker to scan to the end of the list' >&2
  13. echo ' --with-list-urls Enables printing the list URLs retrieved to FD 3' >&2
  14. exit $1
  15. }
  16. concurrency=
  17. listUrls=
  18. noStartMarker=
  19. noEndMarker=
  20. cmd="$(printf '%q ' 's3-bucket-list-qwarc' "$@" | sed 's, $,,')"
  21. while [[ $# -gt 0 ]]
  22. do
  23. if [[ "$1" == '--help' || "$1" == '-h' ]]
  24. then
  25. usage_exit 0
  26. elif [[ "$1" == '--concurrency' ]]
  27. then
  28. declare -i concurrency="$2"
  29. shift
  30. elif [[ "$1" == '--format' ]]
  31. then
  32. format="$2"
  33. shift
  34. elif [[ "$1" == '--no-start-marker' ]]
  35. then
  36. noStartMarker=1
  37. elif [[ "$1" == '--no-end-marker' ]]
  38. then
  39. noEndMarker=1
  40. elif [[ "$1" == '--with-list-urls' ]]
  41. then
  42. listUrls='yes'
  43. else
  44. break
  45. fi
  46. shift
  47. done
  48. bucketUrl="$1"
  49. shift
  50. # Remaining arguments are markers
  51. if [[ -z "${concurrency}" ]]
  52. then
  53. declare -i concurrency=$#
  54. if [[ -z "${noStartMarker}" ]]; then concurrency+=1; fi
  55. if [[ -z "${noEndMarker}" ]]; then concurrency+=1; fi
  56. concurrency+=-1 # Because the obvious -= doesn't work...
  57. fi
  58. if [[ "${listUrls}" ]] && ! { command >&3; } 2>/dev/null
  59. then
  60. echo 'Error: --with-list-urls requires FD 3 to be open' >&2
  61. exit 1
  62. fi
  63. # Validate and process bucket URL
  64. if [[ "${bucketUrl}" != http://* && "${bucketUrl}" != https://* ]]
  65. then
  66. echo 'Invalid bucket URL: must be HTTP or HTTPS' >&2
  67. exit 1
  68. fi
  69. if [[ "${bucketUrl}" == *'?'* ]]
  70. then
  71. echo 'Invalid bucket URL: must not have a query' >&2
  72. exit 1
  73. fi
  74. if [[ "${bucketUrl}" != *://*/* || "${bucketUrl}" != */ ]]
  75. then
  76. bucketUrl="${bucketUrl}/"
  77. fi
  78. # Construct prefix for files and output
  79. prefix="${bucketUrl#*://}" # Remove protocol
  80. while [[ "${prefix}" == */ ]]; do prefix="${prefix%/}"; done # Remove trailing slashes
  81. prefix="${prefix//\//_}" # Replace slashes with underscores
  82. # Ensure no collisions
  83. if [[ "$(shopt -s nullglob; echo "${prefix}"*)" ]]
  84. then
  85. echo "Error: there already exist files with the prefix '${prefix}' in this directory." >&2
  86. exit 1
  87. fi
  88. # Write the qwarc spec file
  89. # Indentation... Inspired by https://stackoverflow.com/a/33817423
  90. readarray code <<EOF
  91. #!/usr/bin/env python3
  92. import html
  93. import json
  94. import logging
  95. import os
  96. import qwarc
  97. import qwarc.utils
  98. import shlex
  99. import urllib.parse
  100. import yarl
  101. format = os.environ['S3_FORMAT']
  102. bucketUrl = yarl.URL(os.environ['S3_BUCKET_URL'])
  103. withListUrls = bool(os.environ.get('S3_WITH_LIST_URLS')) # True if present and non-empty
  104. markersFilename = os.environ['S3_MARKERS_FILENAME']
  105. if withListUrls:
  106. try:
  107. listUrlsFD = os.fdopen(3, 'w')
  108. except OSError:
  109. logging.critical('FD 3 is not open')
  110. raise
  111. class S3ListBucket(qwarc.Item):
  112. itemType = 's3listbucket'
  113. # itemValue = ('marker1', 'marker2') encoded as JSON
  114. @classmethod
  115. def generate(cls):
  116. yield from map(lambda x: json.dumps(x, separators = (',', ':')), cls._generate())
  117. @classmethod
  118. def _generate(cls):
  119. with open(markersFilename, 'r') as fp:
  120. it = iter(fp)
  121. lastLine = next(it).strip() or None
  122. for line in it:
  123. line = line.strip() or None
  124. yield (lastLine, line)
  125. lastLine = line
  126. async def process(self):
  127. marker1, marker2 = json.loads(self.itemValue)
  128. marker = marker1
  129. while True:
  130. url = bucketUrl.with_query({'marker': marker} if marker is not None else {})
  131. if withListUrls:
  132. self.logger.info(f'List URL: {str(url)!r}')
  133. print(f'{url}', file = listUrlsFD)
  134. response = await self.fetch(url)
  135. if response.status != 200:
  136. self.logger.error(f'Could not fetch page on marker {marker!r}')
  137. break
  138. body = await response.read()
  139. # Isn't this a 503?
  140. if b'<Error><Code>InternalError</Code><Message>We encountered an internal error. Please try again.</Message>' in body:
  141. self.logger.error(f'Got internal error on {url} on attempt {attempt}; {"retrying" if attempt < 10 else "aborting"}')
  142. if attempt >= 10:
  143. if 'marker' in params:
  144. self.logger.error(f'To retry, use marker {marker!r}')
  145. break
  146. attempt += 1
  147. continue
  148. if not body.startswith(b'<?xml version="1.0" encoding="UTF-8"?>\n<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">') and \
  149. not body.startswith(b"<?xml version='1.0' encoding='UTF-8'?><ListBucketResult xmlns='http://doc.s3.amazonaws.com/2006-03-01'>"):
  150. self.logger.error(f'Invalid body on marker {marker!r}: {body[:200]}...')
  151. break
  152. if b'<Marker></Marker>' in body[:200] and marker is not None:
  153. self.logger.error('Marker loop (empty marker in response despite providing one)')
  154. break
  155. # No risk, no fun!
  156. contents = body.split(b'<Contents>')
  157. assert all(content.startswith(b'<Key>') for content in contents[1:])
  158. assert all(content.endswith(b'</Contents>') for content in contents[1:-1])
  159. assert contents[-1].endswith(b'</Contents></ListBucketResult>')
  160. contents[-1] = contents[-1][:-len('</ListBucketResult>')]
  161. for content in contents[1:]:
  162. key = html.unescape(content[5 : content.index(b'</Key>')].decode('utf-8')) # 5 = len(b'<Key>')
  163. if marker2 is not None and key >= marker2:
  164. break
  165. url = f'{bucketUrl}{urllib.parse.quote(key)}'
  166. tags = content.split(b'>')
  167. assert len(tags) % 2 == 0
  168. assert tags[-1] == b''
  169. assert tags[-2] == b'</Contents'
  170. openTags = [] # Current open tag hierarchy
  171. fields = {}
  172. for tag in tags[:-2]:
  173. if tag.startswith(b'<'):
  174. openTags.append(tag[1:])
  175. continue
  176. assert openTags
  177. if tag.endswith(b'</' + openTags[-1]):
  178. fields[b'>'.join(openTags).decode('utf-8')] = html.unescape(tag[:-(len(openTags[-1]) + 2)].decode('utf-8'))
  179. openTags.pop()
  180. continue
  181. assert False
  182. size = int(fields['Size']) if 'Size' in fields else None
  183. s = format.format(**fields, key = key, url = url, size = size)
  184. self.logger.info(f'Output: {s!r}')
  185. print(s)
  186. lastKey = key
  187. if marker2 is not None and lastKey >= marker2:
  188. break
  189. truncated = True if b'<IsTruncated>true</IsTruncated>' in body else (False if b'<IsTruncated>false</IsTruncated>' in body else None)
  190. assert truncated in (True, False)
  191. if not truncated:
  192. break
  193. if marker is not None and marker == lastKey:
  194. self.logger.error('Marker loop (same last key as previous marker)')
  195. break
  196. marker = lastKey
  197. attempt = 1
  198. specDependencies = qwarc.utils.SpecDependencies(
  199. files = [os.environ['S3_MARKERS_FILENAME']],
  200. extra = {'env': {k: os.environ.get(k) for k in ('S3BL_CMD', 'S3_FORMAT', 'S3_BUCKET_URL', 'S3_MARKERS_FILENAME', 'S3_WITH_LIST_URLS')}}
  201. )
  202. EOF
  203. printf '%s' "${code[@]# }" >"${prefix}.py" # That's a tab character after the hash.
  204. # Generate the markers file
  205. { if [[ -z "${noStartMarker}" ]]; then echo; fi; printf '%s\n' "$@"; if [[ -z "${noEndMarker}" ]]; then echo; fi; } >"${prefix}-markers"
  206. # Collect environment variables
  207. envvars=()
  208. envvars+=(S3BL_CMD="${cmd}")
  209. envvars+=(S3_FORMAT="${format}")
  210. envvars+=(S3_BUCKET_URL="${bucketUrl}")
  211. envvars+=(S3_MARKERS_FILENAME="${prefix}-markers")
  212. if [[ "${listUrls}" ]]; then envvars+=(S3_WITH_LIST_URLS="${listUrls}"); fi
  213. # Lift-off!
  214. env "${envvars[@]}" qwarc --concurrency "${concurrency}" --database "${prefix}.db" --log "${prefix}.log" --warc "${prefix}" "${prefix}.py"