The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
- #!/bin/bash
- # Search 4chan archives based on FoolFuuka
- # Searches each board individually to get as much content as possible due to the 5000 results limit
- # Output: one post per line in HTML
- domain="$1"
- q="$2"
- curl -s "https://${domain}/" | grep -Po 'href="(https?://'"$(sed 's/[]\.|$(){}?+*^]/\\&/g' <<<"${domain}")"')?/\K[^/]+(?=/")' | awk '!seen[$0]++' | while read -r board
- do
- content=$(curl -s "https://${domain}/${board}/search/text/${q}/")
- if grep -qP '<h3 class="section_title">.*Returning only' <<<"${content}"
- then
- echo "Warning: only 5000 results!" >&2
- fi
-
- declare -i page=1
- while [[ ${page} -lt 201 ]]
- do
- echo "Grabbing https://${domain}/${board}/search/text/${q}/page/${page}/" >&2
- content=$(curl -s "https://${domain}/${board}/search/text/${q}/page/${page}/")
- if grep -qF '<div class="alert"' <<<"${content}"
- then
- if ! tr -d '\n' <<<"${content}" | grep -Po '<div class="alert".*?</div>' | grep -q 'No results found'
- then
- echo "Error on https://${domain}/${board}/search/text/${q}/page/${page}/" >&2
- fi
- break
- fi
- tr -d '\n' <<<"${content}" | grep -Po '<article class="post.*?</article>'
- page+=1
- done
- done
|