#!/bin/bash
#
# Estimate the size of a website by crawling it with `wget --spider`.
#
# Usage: <script> URL
#
# Prints the summed byte count of every "Length: N" that wget logged, as a
# single integer on stdout.
#
# Note: this will miss anything where the server doesn't advertise the size.
#
# Environment:
#   WGET_LOG  path of the wget log file to write and parse
#             (default: wget.log in the current directory; the file is kept
#             after the run so it can be inspected).

set -u

#######################################
# Print a short usage message to stderr.
#######################################
usage() {
  printf 'Usage: %s URL\n' "${0##*/}" >&2
}

#######################################
# Sum the advertised sizes found in a wget log.
#
# A line containing " http://" or " https://" names the URL being fetched;
# a line containing "Length: <digits> " carries its advertised size. wget
# may log the same URL several times in a row (e.g. an existence check
# followed by the actual retrieval), so only the last length seen for a run
# of identical consecutive URLs is counted.
#
# Arguments: $1 - path to the wget log file
# Outputs:   total number of bytes, as an integer, on stdout
# Returns:   awk's exit status
#######################################
sum_lengths() {
  awk '
    {
      size_at = match($0, /Length: [0-9]+ /)
      size_len = RLENGTH
      url_at = match($0, / https?:\/\//)

      # "Length: " is 8 characters and the match ends with one space, so the
      # digits are the RLENGTH - 9 characters starting at RSTART + 8.
      if (size_at && (!url_at || size_at < url_at)) {
        pending = substr($0, size_at + 8, size_len - 9) + 0
      }

      if (url_at) {
        # The URL is the first whitespace-delimited word after the space.
        split(substr($0, url_at + 1), word, " ")
        if (word[1] != current) {
          # A different URL starts: commit the size of the previous one.
          total += pending
          current = word[1]
          pending = 0
        }
      }
    }
    END {
      total += pending
      printf "%.0f\n", total
    }
  ' "$1"
}

#######################################
# Crawl the site in spider mode, then print the summed size.
#
# Arguments: $1 - URL to crawl
# Returns:   2 on usage error, 127 if wget is missing, 1 if the log cannot
#            be read, otherwise the status of the summing step
#######################################
main() {
  if (( $# < 1 )) || [[ -z "$1" ]]; then
    usage
    return 2
  fi

  if ! command -v wget >/dev/null; then
    printf '%s: wget not found in PATH\n' "${0##*/}" >&2
    return 127
  fi

  local url=$1
  local log_file=${WGET_LOG:-wget.log}
  local -a wget_opts=(
    --recursive
    --level inf
    --spider
    --no-directories
    --no-parent
    # Skip URLs ending in "/?C=<N|M|S|D>;O=<A|D>" -- presumably the
    # column-sort links of auto-generated directory index pages.
    --reject-regex '/\?C=[NMSD];O=[AD]$'
    "--output-file=${log_file}"
  )

  # The exit status of wget is deliberately ignored: a spider crawl can exit
  # non-zero when some pages return errors, and the log is still usable.
  # "--" keeps a URL that begins with "-" from being parsed as an option.
  wget "${wget_opts[@]}" -- "$url"

  if [[ ! -r "$log_file" ]]; then
    printf '%s: cannot read wget log: %s\n' "${0##*/}" "$log_file" >&2
    return 1
  fi

  sum_lengths "$log_file"
}

main "$@"