diff --git a/foolfuuka-search b/foolfuuka-search index 8276d4e..e326aca 100644 --- a/foolfuuka-search +++ b/foolfuuka-search @@ -1,31 +1,36 @@ #!/bin/bash # Search 4chan archives based on FoolFuuka -# Searches each board individually to get as much content as possible due to the 5000 results limit -# Output: one post per line in HTML +# Output: one post per line in HTML, prefixed with the post ID +# Note that posts can appear multiple times in the output in some cases. You're encouraged to filter based on the post ID. domain="$1" q="$2" -curl -s "https://${domain}/" | grep -Po 'href="(https?://'"$(sed 's/[]\.|$(){}?+*^]/\\&/g' <<<"${domain}")"')?/\K[^/]+(?=/")' | awk '!seen[$0]++' | while read -r board +end= +nextend=2038-01-19 +while : do - content=$(curl -s "https://${domain}/${board}/search/text/${q}/") - if grep -qP '

.*Returning only' <<<"${content}" - then - echo "Warning: only 5000 results!" >&2 - fi + end="${nextend}" + content=$(curl -s "https://${domain}/_/search/text/${q}/end/${end}/page/${page}/") declare -i page=1 while [[ ${page} -lt 201 ]] do - echo "Grabbing https://${domain}/${board}/search/text/${q}/page/${page}/" >&2 - content=$(curl -s "https://${domain}/${board}/search/text/${q}/page/${page}/") + echo "Grabbing https://${domain}/_/search/text/${q}/end/${end}/page/${page}/" >&2 + content=$(curl -s "https://${domain}/_/search/text/${q}/end/${end}/page/${page}/") + tr -d '\n' <<<"${content}" | grep -Po '
' | grep -q 'No results found' then - echo "Error on https://${domain}/${board}/search/text/${q}/page/${page}/" >&2 + echo "Error" >&2 + break + else + break 2 fi - break fi - tr -d '\n' <<<"${content}" | grep -Po '