Browse Source

Add script for requeueing URLs that were skipped due to too many failed attempts on wpull crawls

master
JustAnotherArchivist 1 month ago
parent
commit
5678b586ac
1 changed files with 41 additions and 0 deletions
  1. +41
    -0
      wpull2-requeue

+ 41
- 0
wpull2-requeue View File

@@ -0,0 +1,41 @@
#!/bin/bash
# Requeue URLs in a wpull database that are marked as skipped and have more
# than 3 recorded tries.
#
# Usage: wpull2-requeue [ACTION] [FILENAME] URLPATTERN
#   ACTION      'count' (default): print the number of matching rows.
#               'print': print the matching rows (queued_urls and url_strings columns).
#               'write': set the matching rows to status 'todo' with try_count 0.
#   FILENAME    Path to the database; defaults to 'wpull.db'.
#   URLPATTERN  Uses SQLite's LIKE syntax with ESCAPE '\', i.e. % matches any
#               number of characters, _ matches exactly one character, and a
#               backslash can be used to escape these special characters.
#               Quotes are allowed; single quotes are escaped before the
#               pattern is placed into the SQL statement.
#
# With two arguments, the first one is taken as ACTION if it is one of the
# three keywords above and as FILENAME otherwise.
#
# Exit status: 2 on usage errors, 1 if FILENAME is not a regular file,
# otherwise the exit status of sqlite3.

# Print a short usage summary to stderr.
usage() {
  printf 'Usage: %s [ACTION] [FILENAME] URLPATTERN\n' "${0##*/}" >&2
  printf "ACTION can be 'count' (default), 'print', or 'write'\n" >&2
  printf "FILENAME defaults to 'wpull.db'\n" >&2
}

# Print $1 as a single-quoted SQL string literal, doubling embedded single
# quotes so the value cannot terminate the literal early.
sql_quote() {
  # Kept in a variable so the substitution below needs no nested quoting.
  local apostrophe="'"
  printf '%s' "'${1//${apostrophe}/${apostrophe}${apostrophe}}'"
}

# Parse the command line and run the selected sqlite3 statement.
# Uses 'return' instead of 'exit' throughout; since the call to main is the
# last command of the script, its return value becomes the exit status.
main() {
  local action=count
  local filename=wpull.db
  local urlpattern
  local query

  if (( $# < 1 || $# > 3 )); then
    usage
    return 2
  fi

  # Three arguments: the first is always the action. Two arguments: the first
  # is the action only if it is a known keyword; otherwise it is the filename.
  if [[ $# -eq 3 || ( $# -eq 2 && ( "$1" == 'count' || "$1" == 'print' || "$1" == 'write' ) ) ]]; then
    action="$1"
    shift
  fi

  if [[ $# -eq 2 ]]; then
    filename="$1"
    shift
  fi

  urlpattern="$1"

  # Reject unknown actions explicitly so that a typo can never fall through
  # to the destructive 'write' branch.
  case "${action}" in
    count | print | write) ;;
    *)
      printf "Error: unknown action '%s'\n" "${action}" >&2
      usage
      return 2
      ;;
  esac

  if [[ ! -f "${filename}" ]]; then
    echo "Error: ${filename} does not exist or is not a regular file" >&2
    return 1
  fi

  # String literals use single quotes (standard SQL). Double-quoted string
  # literals are a legacy SQLite extension that newer sqlite3 shells disable.
  # The '\\' below is a single backslash once the shell has processed it.
  query="FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE url LIKE $(sql_quote "${urlpattern}") ESCAPE '\\' AND status = 'skipped' AND try_count > 3"

  case "${action}" in
    count)
      sqlite3 "${filename}" "SELECT COUNT(queued_urls.id) ${query}"
      ;;
    print)
      sqlite3 "${filename}" "SELECT queued_urls.*, url_strings.* ${query}"
      ;;
    write)
      sqlite3 "${filename}" "UPDATE queued_urls SET status = 'todo', try_count = 0 WHERE id IN (SELECT queued_urls.id ${query})"
      ;;
  esac
}

main "$@"

Loading…
Cancel
Save