Browse Source

Better workaround for the 5000 results limit; works for FoolFuuka 2.0.1 and up

master
JustAnotherArchivist 5 years ago
parent
commit
1748a6b607
1 changed file with 18 additions and 13 deletions
  1. +18
    -13
      foolfuuka-search

+ 18
- 13
foolfuuka-search View File

@@ -1,31 +1,36 @@
#!/bin/bash
# foolfuuka-search — search 4chan archives based on FoolFuuka (2.0.1 and up).
#
# Works around FoolFuuka's 5000-results limit by searching across all boards
# ("/_/") with a sliding end-date cursor: each pass notes the date of the
# oldest post it saw, then restarts the search ending one day before that.
#
# Usage:  foolfuuka-search DOMAIN QUERY
# Output: one post per line in HTML, prefixed with the post ID.
# Note that posts can appear multiple times in the output in some cases.
# You're encouraged to filter based on the post ID.
domain="$1"
q="$2"

end=
# Start far in the future so the first pass sees the newest posts.
nextend=2038-01-19

while :
do
  end="${nextend}"

  # FoolFuuka serves at most 200 pages per search, hence the 201 bound.
  declare -i page=1
  while [[ ${page} -lt 201 ]]
  do
    echo "Grabbing https://${domain}/_/search/text/${q}/end/${end}/page/${page}/" >&2
    content=$(curl -s "https://${domain}/_/search/text/${q}/end/${end}/page/${page}/")

    # Emit each <article> element on one line, prefixed with its numeric post ID.
    tr -d '\n' <<<"${content}" | grep -Po '<article class="post.*?</article>' | perl -pe 's,^(.*?id="(\d+)".*$),\2 \1,'

    # Get last date seen to update the end date; subtract one day because the
    # search appears to be a bit unreliable right at the boundary.
    nextend="$(date --date="@$(($(date --date="$(tr -d '\n' <<<"${content}" | grep -Po '<article class="post.*?</article>' | tail -1 | grep -Po '<time datetime="\K[^"]+')" '+%s') - 86400))" '+%Y-%m-%d')"

    if grep -qF '<div class="alert"' <<<"${content}"
    then
      if ! tr -d '\n' <<<"${content}" | grep -Po '<div class="alert".*?</div>' | grep -q 'No results found'
      then
        # Unexpected alert (e.g. a server-side error): abandon this pass and
        # resume the outer loop from the last recorded end date.
        echo "Error" >&2
        break
      else
        # "No results found": the cursor has walked past the oldest post.
        break 2
      fi
    fi
    page+=1
  done
done

Loading…
Cancel
Save