2 Commits

Author SHA1 Message Date
  JustAnotherArchivist 5678b586ac Add script for requeueing skipped URLs due to too many failed attempts on wpull crawls 3 years ago
  JustAnotherArchivist f05a8a79bc Clean up wpull DB commands 3 years ago
3 changed files with 87 additions and 4 deletions
Split View
  1. +16
    -3
      wpull2-extract-remaining
  2. +41
    -0
      wpull2-requeue
  3. +30
    -1
      wpull2-url-origin

+ 16
- 3
wpull2-extract-remaining View File

@@ -1,8 +1,21 @@
#!/bin/bash
# Usage: wpull2-extract-remaining [FILENAME]
# FILENAME points to a wpull 2.x SQLite DB; if not specified, defaults to wpull.db
# Prints all remaining URLs from the DB on stdout

# Optional positional argument: path to the wpull DB.
if [[ $# -eq 1 ]]
then
	filename="$1"
else
	filename=wpull.db
fi
if [[ ! -f "${filename}" ]]
then
	echo "Error: ${filename} does not exist or is not a regular file" >&2
	exit 1
fi

# "Remaining" = anything wpull has not successfully finished: URLs currently
# being fetched, still queued, or errored (retried when the crawl resumes).
for status in in_progress todo error
do
	sqlite3 "${filename}" 'SELECT url_strings.url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = "'${status}'"'
done

+ 41
- 0
wpull2-requeue View File

@@ -0,0 +1,41 @@
#!/bin/bash
# Usage: wpull2-requeue [ACTION] [FILENAME] URLPATTERN
# ACTION can be 'count' (default), 'print', or 'write'
# FILENAME defaults to 'wpull.db'
# URLPATTERN uses SQLite's LIKE syntax with ESCAPE "\", i.e. % matches any number of characters, _ matches exactly one character, and a backslash can be used to escape these special characters.
# Must not contain quotes.
#
# Targets URLs that wpull marked 'skipped' after more than 3 failed attempts:
#   count - print how many such URLs match URLPATTERN
#   print - dump the matching queued_urls/url_strings rows
#   write - reset them to status 'todo' with try_count 0 so they get refetched

# The first argument is the action only if it is one of the known keywords
# (or if all three arguments are present); otherwise default to 'count'.
if [[ $# -eq 3 || ( $# -eq 2 && ( "$1" == 'count' || "$1" == 'print' || "$1" == 'write' )) ]]
then
	action="$1"
	shift
else
	action=count
fi

# Next optional argument: the DB path.
if [[ $# -eq 2 ]]
then
	filename="$1"
	shift
else
	filename=wpull.db
fi
if [[ ! -f "${filename}" ]]
then
	echo "Error: ${filename} does not exist or is not a regular file" >&2
	exit 1
fi

urlpattern="$1"

# Shared FROM/WHERE clause; URLPATTERN is interpolated directly, hence the
# "must not contain quotes" restriction above (no parameter binding in the CLI).
query='FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE url LIKE "'"${urlpattern}"'" ESCAPE "\" AND status = "skipped" AND try_count > 3'

if [[ "${action}" == 'count' ]]
then
	sqlite3 "${filename}" "SELECT COUNT(queued_urls.id) ${query}"
elif [[ "${action}" == 'print' ]]
then
	sqlite3 "${filename}" "SELECT queued_urls.*, url_strings.* ${query}"
else
	# 'write': requeue the matching URLs.
	sqlite3 "${filename}" 'UPDATE queued_urls SET status = "todo", try_count = 0 WHERE id IN (SELECT queued_urls.id '"${query}"')'
fi

+ 30
- 1
wpull2-url-origin View File

@@ -1,3 +1,32 @@
#!/bin/bash
# Usage: wpull2-url-origin [FILENAME] URL
# FILENAME defaults to wpull.db
# Trace back where a URL was discovered, all the way back to the root

# Optional first argument: the DB path.
if [[ $# -eq 2 ]]
then
	filename="$1"
	shift
else
	filename=wpull.db
fi
if [[ ! -f "${filename}" ]]
then
	echo "Error: ${filename} does not exist or is not a regular file" >&2
	exit 1
fi
url="$1"
# Resolve the URL to its url_strings row ID; bail out if it was never seen.
curId=$(sqlite3 "${filename}" 'SELECT id FROM url_strings WHERE url = "'"${url}"'"')
if [[ -z "${curId}" ]]
then
	echo "Error: ${url} not found" >&2
	exit 1
fi
# Walk the parent chain: print the current record, then follow
# parent_url_string_id until we reach a level-0 (root/seed) URL.
while :
do
	sqlite3 "${filename}" 'SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE url_strings.id = '${curId}
	IFS='|' read -r curId level < <(sqlite3 "${filename}" 'SELECT parent_url_string_id, level FROM queued_urls WHERE url_string_id = '${curId})
	if [[ ${level} -eq 0 ]]
	then
		break
	fi
done

Loading…
Cancel
Save