|
|
@@ -0,0 +1,51 @@ |
|
|
|
#!/bin/bash
#
# cdx-chunk: compute byte offsets at which a WARC file can be split into
# chunks of roughly SIZE bytes, based on the record offsets in its CDX index.
#
# Usage: cdx-chunk CDXFILE SIZE
#   CDXFILE  CDX index of a single WARC; '-' reads uncompressed data from
#            stdin, a name ending in .gz is decompressed on the fly.
#   SIZE     positive decimal integer, optionally suffixed with M (MiB) or
#            G (GiB).
#
# Output: one offset per line on stdout. Diagnostics go to stderr.
# Exit status: 0 on success, non-zero on usage errors or if any pipeline
# stage (reading, decompressing, sorting) fails.

# Without pipefail, a failure to read or decompress CDXFILE would be hidden
# by the successful exit status of the last awk stage.
set -o pipefail

# Print the help text to stderr.
usage() {
  echo 'Usage: cdx-chunk CDXFILE SIZE' >&2
  echo 'Returns offsets at which to split the WARC corresponding to CDXFILE such that each chunk is about SIZE bytes large.' >&2
  echo 'CDXFILE must be a IA-style modern CDX file (CDX N b a m s k r M S V g) for a single WARC file. DO NOT PASS ITEM CDX FILES!' >&2
  echo 'If CDXFILE ends with .gz, it is automatically decompressed. A dash may be passed to read from stdin, in which case it must be decompressed already.' >&2
  echo 'SIZE is an integer, optionally with a trailing M or G to designate MiB or GiB, respectively.' >&2
  echo 'The output is one integer per line, which designates the offset at which a new chunk begins. For example, if the first line is 1042, the first 1042 bytes are one chunk and the 1043rd byte begins the second chunk.' >&2
  echo 'Note that chunks may be much bigger than SIZE if there are large records in the WARC.' >&2
}

# Convert a SIZE specification into a number of bytes.
# Arguments: $1 - decimal digits, optionally followed by M or G
# Outputs:   the size in bytes on stdout
# Returns:   0 on success, 1 if the specification is malformed or not > 0
parse_size() {
  local spec="$1"
  local -i multiplier=1
  local -i bytes

  # Validate the shape first: the value is used in shell arithmetic below,
  # and arithmetic evaluation of arbitrary text would accept expressions
  # (including command substitutions inside array subscripts).
  if [[ ! "${spec}" =~ ^([0-9]+)([MG]?)$ ]]
  then
    return 1
  fi

  case "${BASH_REMATCH[2]}" in
    M) multiplier=$((1024 * 1024)) ;;
    G) multiplier=$((1024 * 1024 * 1024)) ;;
  esac

  # 10# forces base 10 so that a leading zero is not read as octal.
  bytes=$((10#${BASH_REMATCH[1]} * multiplier))

  # Rejects zero as well as values that wrapped around to a negative number.
  if ((bytes <= 0))
  then
    return 1
  fi

  printf '%d\n' "${bytes}"
}

# Write the uncompressed CDX data to stdout.
# Arguments: $1 - path to the CDX file, or '-' to pass stdin through
read_cdx() {
  local path="$1"

  if [[ "${path}" == '-' ]]
  then
    cat
  elif [[ "${path}" == *.gz ]]
  then
    # Try to use zstdcat if available since it has much better performance.
    # NOTE(review): this relies on the installed zstd having been built with
    # gzip support; confirm on the target systems.
    if command -v zstdcat &>/dev/null
    then
      zstdcat -- "${path}"
    else
      zcat -- "${path}"
    fi
  else
    # '--' keeps a file name starting with a dash from being read as an option.
    cat -- "${path}"
  fi
}

# Entry point: validate the arguments and print the chunk boundaries.
# Arguments: $1 - CDXFILE, $2 - SIZE
main() {
  local file
  local size

  if [[ $# -ne 2 ]]
  then
    usage
    exit 1
  fi

  file="$1"

  # Declaration and assignment are separate so that the exit status of
  # parse_size is what the condition tests.
  if ! size=$(parse_size "$2")
  then
    echo "Error: invalid size" >&2
    exit 1
  fi

  # Pipeline stages:
  #   1. emit the CDX data
  #   2. skip the header line and keep field 10 of each record
  #   3. sort the offsets numerically
  #   4. print an offset whenever it lies at least SIZE bytes past the
  #      previously printed boundary (initially 0)
  # LC_ALL=C makes the numeric sort independent of the caller's locale.
  read_cdx "${file}" \
    | awk 'NR > 1 { print $10 }' \
    | LC_ALL=C sort -n \
    | awk -v size="${size}" '($1 - lastBoundary) >= size { print; lastBoundary = $1; }'
}

main "$@"