diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b25c15b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*~
diff --git a/chunker b/chunker
index 7075179..0c4ec01 100755
--- a/chunker
+++ b/chunker
@@ -28,7 +28,7 @@ mkdir -p "$CHUNKER_WORKING_DIR/current" || exit 1
 cur_size=$( du -B1 -s "$CHUNKER_WORKING_DIR/current" | grep -oE "^[0-9]+" )
 
-# find every .warc.gz in the upload directory
-find "$INCOMING_UPLOADS_DIR" -type f -name "*.warc.gz" \
+# find every .warc.gz or .warc.zst in the upload directory
+find "$INCOMING_UPLOADS_DIR" -type f -regex ".+\.warc\.\(gz\|zst\)$" \
 | while read filename
 do
   # skip partial uploads
diff --git a/config.example.sh b/config.example.sh
index 26be8ca..9875f1d 100755
--- a/config.example.sh
+++ b/config.example.sh
@@ -34,6 +34,9 @@ OFFLOAD_TARGET="rsync://somewhere-far-away:portnum/module-name/directory/"
 # it is also possible to create a list of targets and the offloader will pick one at random and retry others on failure
 # simply comment out the line above and put all rsync target urls separated by newline in a file called "offload_targets"
 
+# the API for requesting the ZSTD dictionaries
+ZST_DICTIONARY_API="API URL"
+
 ###############
 # DIRECTORIES #
 ###############
diff --git a/megawarc b/megawarc
index f77638d..5468d80 160000
--- a/megawarc
+++ b/megawarc
@@ -1 +1 @@
-Subproject commit f77638dbf7d0c4a7dd301217ee04fbc6a3c3ebbf
+Subproject commit 5468d80e35b3dcb85d36624580c813326af706fe
diff --git a/pack-one b/pack-one
index b798e77..ea496c6 100755
--- a/pack-one
+++ b/pack-one
@@ -84,7 +84,7 @@ echo "$( date ): Starting megawarc for item $ITEM" >> packer.log
 mkdir -p $PACKER_WORKING_MEGAWARC_DIR/$ITEM
 
 # megawarcs use relative paths
 cd "$PACKER_WORKING_CHUNKS_DIR/"
-$MEGAWARC --verbose pack $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
+$MEGAWARC --verbose pack --server $ZST_DICTIONARY_API $PACKER_WORKING_MEGAWARC_DIR/$ITEM/${FILE_PREFIX}${ITEM} $ITEM
 result=$?
 cd "$WORKING_DIR"
diff --git a/upload-one b/upload-one
index 360dfd9..6359c1a 100755
--- a/upload-one
+++ b/upload-one
@@ -65,13 +65,13 @@ echo "$( date ): Start uploading for item $ITEM" >> uploader.log
 # upload megawarc
 size_hint=$( du --bytes -s "${UPLOADER_WORKING_DIR}/${ITEM}" | grep -oE "^[0-9]+" )
 # (upload the large files first to optimise S3 snowballing)
-for ext in warc.gz tar json.gz
+
+find "${UPLOADER_WORKING_DIR}/${ITEM}" -type f -regextype posix-egrep -regex ".+\.megawarc\.(warc\.(gz|zst)|tar|json\.gz)$" -printf "%f\n" \
+| while read filename
 do
-  test "${ext}" == "tar" && ! test -f "${FILE_PREFIX}${ITEM}.megawarc.${ext}" && continue # skip non-existing tar files
   result=1
   while [[ $result -ne 0 ]]
   do
-    filename="${FILE_PREFIX}${ITEM}.megawarc.${ext}"
     curl -v --location --fail \
       --speed-limit 1 --speed-time 900 \
       --header "x-archive-queue-derive:1" \
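
A quick illustration (not part of the patch): the chunker hunk uses find's plain -regex, which GNU findutils parses with its default emacs-style syntax, so alternation has to be written with escapes as \(gz\|zst\); the upload-one hunk instead passes -regextype posix-egrep, where the unescaped (gz|zst) form is valid. The sketch below assumes GNU find and uses an invented scratch directory and sample filenames purely for demonstration:

    # scratch directory with one file per interesting case (names are made up)
    mkdir -p /tmp/find-regex-demo && cd /tmp/find-regex-demo
    touch a.warc.gz b.warc.zst c.warc.gz.tmp

    # default emacs-style regex syntax, as in the chunker hunk
    find . -type f -regex ".+\.warc\.\(gz\|zst\)$"

    # equivalent posix-egrep syntax, as in the upload-one hunk
    find . -type f -regextype posix-egrep -regex ".+\.warc\.(gz|zst)$"

Both commands print ./a.warc.gz and ./b.warc.zst while skipping the partial upload c.warc.gz.tmp, because -regex must match the whole path. The -printf "%f\n" in the upload-one hunk additionally strips the directory part, so the loop reads bare filenames.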