|
|
@@ -19,9 +19,14 @@ then |
|
|
|
source ./config.sh || exit 1 |
|
|
|
fi |
|
|
|
|
|
|
|
BYTES_PER_CHUNK=$((1024*1024*MEGABYTES_PER_CHUNK)) |
|
|
|
|
|
|
|
mkdir -p "$CHUNKER_WORKING_DIR" || exit 1 |
|
|
|
mkdir -p "$PACKING_QUEUE_DIR" || exit 1 |
|
|
|
|
|
|
|
mkdir -p "$CHUNKER_WORKING_DIR/current" || exit 1 |
|
|
|
cur_size=$( du -B1 -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" ) |
|
|
|
|
|
|
|
# find every .warc.gz in the upload directory |
|
|
|
find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \ |
|
|
|
| while read filename |
|
|
@@ -32,6 +37,8 @@ do |
|
|
|
continue |
|
|
|
fi |
|
|
|
|
|
|
|
cur_size=$((cur_size + $( du -B1 -s $filename | grep -oE "^[0-9]+" ))) |
|
|
|
|
|
|
|
# move to the current/ directory |
|
|
|
echo "Moving ${filename}" |
|
|
|
mkdir -p "$CHUNKER_WORKING_DIR/current" |
|
|
@@ -39,12 +46,12 @@ do |
|
|
|
|
|
|
|
# if the current/ directory is large enough, |
|
|
|
# rename it to archive-XXXXX and start a new current/ |
|
|
|
cur_size=$( du -BM -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" ) |
|
|
|
if [[ $cur_size -gt $MEGABYTES_PER_CHUNK ]] |
|
|
|
if [[ $cur_size -gt $BYTES_PER_CHUNK ]] |
|
|
|
then |
|
|
|
timestamp=$( date +'%Y%m%d%H%M%S' ) |
|
|
|
echo "Current archive is full, moving to ${timestamp}." |
|
|
|
mv "$CHUNKER_WORKING_DIR/current" "$PACKING_QUEUE_DIR/${timestamp}" |
|
|
|
cur_size=0 |
|
|
|
fi |
|
|
|
done |
|
|
|
|