Author | SHA1 | Message | Date |
---|---|---|---|
JustAnotherArchivist | 5678b586ac | Add script for requeueing skipped URLs due to too many failed attempts on wpull crawls | 3 years ago |
JustAnotherArchivist | f05a8a79bc | Clean up wpull DB commands | 3 years ago |
@@ -1,8 +1,21 @@ | |||||
#!/bin/bash | #!/bin/bash | ||||
# Usage: wpull2-extract-remaining FILENAME | |||||
# FILENAME points to a wpull 2.x SQLite DB | |||||
# Usage: wpull2-extract-remaining [FILENAME] | |||||
# FILENAME points to a wpull 2.x SQLite DB; if not specified, defaults to wpull.db | |||||
# Prints all remaining URLs from the DB on stdout | # Prints all remaining URLs from the DB on stdout | ||||
# Resolve the database path: an optional single argument, falling back to
# wpull.db in the current directory.
if [[ $# -gt 1 ]]
then
  # Surplus arguments were previously ignored, which silently switched the
  # script over to wpull.db instead of the file the caller named.
  echo "Usage: wpull2-extract-remaining [FILENAME]" >&2
  exit 1
fi
filename="${1-wpull.db}"
# Fail early with a readable message rather than letting sqlite3 report (or
# create) a missing database.
if [[ ! -f "${filename}" ]]
then
  echo "Error: ${filename} does not exist or is not a regular file" >&2
  exit 1
fi
# Print the URL of every queue entry that is not finished yet. One query is
# issued per status so the output stays grouped in this order: in_progress,
# todo, error.
for status in in_progress todo error
do
  # The status is a SQL string literal and therefore single-quoted: SQLite
  # resolves a double-quoted token as an identifier first, and sqlite3 shell
  # builds may have the double-quoted-string fallback disabled altogether.
  sqlite3 "${filename}" "SELECT url_strings.url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = '${status}'"
done
@@ -0,0 +1,41 @@ | |||||
#!/bin/bash
# Usage: wpull2-requeue [ACTION] [FILENAME] URLPATTERN
# ACTION can be 'count' (default), 'print', or 'write'
#   count: print the number of matching queue entries
#   print: print the matching queued_urls and url_strings rows
#   write: set the matching entries back to status 'todo' with try_count 0
# FILENAME defaults to 'wpull.db'
# URLPATTERN uses SQLite's LIKE syntax with ESCAPE '\', i.e. % matches any number of characters, _ matches exactly one character, and a backslash can be used to escape these special characters.
# Only entries with status 'skipped' and try_count > 3 are considered.

usage='Usage: wpull2-requeue [ACTION] [FILENAME] URLPATTERN'

if [[ $# -lt 1 || $# -gt 3 ]]
then
  echo "${usage}" >&2
  exit 1
fi

# With three arguments the first one is always the action. With two, the
# first is the action only if it is one of the known keywords; otherwise it is
# the database filename.
if [[ $# -eq 3 || ( $# -eq 2 && ( "$1" == 'count' || "$1" == 'print' || "$1" == 'write' )) ]]
then
  action="$1"
  shift
else
  action=count
fi

# Reject unknown actions explicitly. Without this check a typo in ACTION would
# fall through to the destructive 'write' branch below.
case "${action}" in
  count|print|write)
    ;;
  *)
    echo "Error: unknown action '${action}'" >&2
    echo "${usage}" >&2
    exit 1
    ;;
esac

if [[ $# -eq 2 ]]
then
  filename="$1"
  shift
else
  filename=wpull.db
fi
if [[ ! -f "${filename}" ]]
then
  echo "Error: ${filename} does not exist or is not a regular file" >&2
  exit 1
fi

urlpattern="$1"

# Embed the pattern as a single-quoted SQL string literal, doubling any single
# quote it contains, so quotes in the pattern can neither break nor extend the
# statement.
sq="'"
escaped_pattern=${urlpattern//${sq}/${sq}${sq}}

# All string literals are single-quoted: SQLite resolves double-quoted tokens
# as identifiers first, and sqlite3 shell builds may have the
# double-quoted-string fallback disabled. '\\' inside the shell double quotes
# yields a single backslash, i.e. ESCAPE '\'.
query="FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE url LIKE '${escaped_pattern}' ESCAPE '\\' AND status = 'skipped' AND try_count > 3"

if [[ "${action}" == 'count' ]]
then
  sqlite3 "${filename}" "SELECT COUNT(queued_urls.id) ${query}"
elif [[ "${action}" == 'print' ]]
then
  sqlite3 "${filename}" "SELECT queued_urls.*, url_strings.* ${query}"
else
  # action == 'write' (validated above)
  sqlite3 "${filename}" "UPDATE queued_urls SET status = 'todo', try_count = 0 WHERE id IN (SELECT queued_urls.id ${query})"
fi
@@ -1,3 +1,32 @@ | |||||
#!/bin/bash | #!/bin/bash | ||||
# Usage: wpull2-url-origin [FILENAME] URL | |||||
# FILENAME defaults to wpull.db | |||||
# Trace back where a URL was discovered, all the way back to the root | # Trace back where a URL was discovered, all the way back to the root | ||||
url="$1"; curId=$(sqlite3 wpull.db 'SELECT id FROM url_strings WHERE url = "'"${url}"'"'); while :; do sqlite3 wpull.db 'SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE url_strings.id = '$curId; IFS='|' read -r curId level < <(sqlite3 wpull.db 'SELECT parent_url_string_id, level FROM queued_urls WHERE url_string_id = '$curId); if [[ ${level} -eq 0 ]]; then break; fi done | |||||
# Argument handling: an optional database filename followed by the URL.
if [[ $# -lt 1 || $# -gt 2 ]]
then
  echo "Usage: wpull2-url-origin [FILENAME] URL" >&2
  exit 1
fi
if [[ $# -eq 2 ]]
then
  filename="$1"
  shift
else
  filename=wpull.db
fi
if [[ ! -f "${filename}" ]]
then
  echo "Error: ${filename} does not exist or is not a regular file" >&2
  exit 1
fi

url="$1"

# Look up the id of the starting URL. The URL goes into a single-quoted SQL
# string literal with embedded single quotes doubled, so URLs containing
# quotes are matched literally instead of breaking the statement.
sq="'"
escaped_url=${url//${sq}/${sq}${sq}}
curId=$(sqlite3 "${filename}" "SELECT id FROM url_strings WHERE url = '${escaped_url}'")
if [[ -z "${curId}" ]]
then
  echo "Error: ${url} not found" >&2
  exit 1
fi

# Walk up the parent chain: print the current entry, then move on to its
# parent, stopping once an entry with level 0 has been printed.
while :
do
  sqlite3 "${filename}" "SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE url_strings.id = ${curId}"
  # sqlite3 separates columns with '|' by default; only the first result line is read.
  IFS='|' read -r curId level < <(sqlite3 "${filename}" "SELECT parent_url_string_id, level FROM queued_urls WHERE url_string_id = ${curId}")
  # An empty level (no row returned) compares equal to 0 and ends the walk,
  # as does an empty parent id, which would otherwise yield a malformed query
  # on the next iteration.
  if [[ -z "${curId}" || "${level}" -eq 0 ]]
  then
    break
  fi
done