The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

52 lines
1.6 KiB

  #!/bin/bash
  #
  # cdx-chunk: print the byte offsets at which a WARC can be split into chunks
  # of roughly SIZE bytes, based on the record offsets listed in its CDX file.
  #
  # Usage: cdx-chunk CDXFILE SIZE
  #   CDXFILE  CDX file for a single WARC; '-' reads (uncompressed) data from
  #            stdin, a name ending in .gz is decompressed on the fly.
  #   SIZE     positive decimal integer, optionally suffixed with M (MiB) or
  #            G (GiB).
  #
  # Output: one offset per line, each marking the start of a new chunk.
  # Exit status: 0 on success, non-zero on bad arguments or when reading or
  # decompressing CDXFILE fails.

  # Make the pipeline fail if any stage fails, so that a missing or corrupt
  # CDXFILE is reported through the exit status instead of silently producing
  # empty output.
  set -o pipefail

  #######################################
  # Print the usage text.
  # Outputs: writes the help text to stderr
  #######################################
  usage() {
    printf '%s\n' \
      'Usage: cdx-chunk CDXFILE SIZE' \
      'Returns offsets at which to split the WARC corresponding to CDXFILE such that each chunk is about SIZE bytes large.' \
      'CDXFILE must be a IA-style modern CDX file (CDX N b a m s k r M S V g) for a single WARC file. DO NOT PASS ITEM CDX FILES!' \
      'If CDXFILE ends with .gz, it is automatically decompressed. A dash may be passed to read from stdin, in which case it must be decompressed already.' \
      'SIZE is an integer, optionally with a trailing M or G to designate MiB or GiB, respectively.' \
      'The output is one integer per line, which designates the offset at which a new chunk begins. For example, if the first line is 1042, the first 1042 bytes are one chunk and the 1043rd byte begins the second chunk.' \
      'Note that chunks may be much bigger than SIZE if there are large records in the WARC.' \
      >&2
  }

  #######################################
  # Convert a SIZE argument into a byte count.
  # The argument is validated with a regex before any arithmetic is done, so
  # it is never evaluated as an arbitrary arithmetic expression.
  # Arguments: $1 - decimal digits, optionally followed by M or G
  # Outputs:   writes the size in bytes to stdout
  # Returns:   0 on success; 1 if the value is malformed, zero, or overflows
  #######################################
  parse_size() {
    local spec=$1 digits unit
    local -i value multiplier=1 bytes

    # At most 18 digits so that the bare number always fits in 64 bits.
    [[ "${spec}" =~ ^([0-9]{1,18})([MG]?)$ ]] || return 1
    digits=${BASH_REMATCH[1]}
    unit=${BASH_REMATCH[2]}

    # Force base 10; otherwise a leading zero would be read as octal.
    value=$((10#${digits}))
    case "${unit}" in
      M) multiplier=$((1024 * 1024)) ;;
      G) multiplier=$((1024 * 1024 * 1024)) ;;
    esac
    bytes=$((value * multiplier))

    # Reject zero and detect wrap-around of the multiplication.
    if ((bytes <= 0 || bytes / multiplier != value)); then
      return 1
    fi
    printf '%d\n' "${bytes}"
  }

  #######################################
  # Write the uncompressed CDX data to stdout.
  # Arguments: $1 - path to the CDX file, or '-' for stdin
  # Returns:   exit status of the command that read the data
  #######################################
  read_cdx() {
    local file=$1

    if [[ "${file}" == '-' ]]; then
      cat
    elif [[ "${file}" == *.gz ]]; then
      # Try to use zstdcat if available since it has much better performance.
      # NOTE(review): this relies on the installed zstd being able to read
      # gzip data - confirm on the target systems.
      if command -v zstdcat &>/dev/null; then
        zstdcat -- "${file}"
      else
        zcat -- "${file}"
      fi
    else
      cat -- "${file}"
    fi
  }

  #######################################
  # Turn CDX lines on stdin into chunk boundaries on stdout.
  # The first line (the CDX header) is skipped; the tenth field of every other
  # line is used as the record offset. Offsets are sorted numerically and one
  # is printed whenever it lies at least SIZE bytes past the previous
  # boundary (the first boundary is measured from offset 0).
  # Arguments: $1 - chunk size in bytes
  #######################################
  chunk_offsets() {
    local size=$1

    awk 'NR > 1 { print $10 }' \
      | LC_ALL=C sort -n \
      | awk -v size="${size}" '($1 - lastBoundary) >= size { print; lastBoundary = $1; }'
  }

  #######################################
  # Entry point.
  # Arguments: $1 - CDXFILE, $2 - SIZE
  # Returns:   0 on success, 1 on usage or size errors, otherwise the failing
  #            pipeline status
  #######################################
  main() {
    local size

    if (($# != 2)); then
      usage
      return 1
    fi

    if ! size=$(parse_size "$2"); then
      echo "Error: invalid size" >&2
      return 1
    fi

    read_cdx "$1" | chunk_offsets "${size}"
  }

  # Last command of the script: its status becomes the script's exit status.
  main "$@"