2 コミット

作成者 SHA1 メッセージ 日付
  JustAnotherArchivist 5678b586ac Add script for requeueing skipped URLs due to too many failed attempts on wpull crawls 3年前
  JustAnotherArchivist f05a8a79bc Clean up wpull DB commands 3年前
3個のファイルの変更、87行の追加、4行の削除
分割表示
  1. +16
    -3
      wpull2-extract-remaining
  2. +41
    -0
      wpull2-requeue
  3. +30
    -1
      wpull2-url-origin

+ 16
- 3
wpull2-extract-remaining ファイルの表示

@@ -1,8 +1,21 @@
#!/bin/bash
# Usage: wpull2-extract-remaining [FILENAME]
# FILENAME points to a wpull 2.x SQLite DB; if not specified, defaults to wpull.db
# Prints all remaining URLs from the DB on stdout, i.e. the URLs of all
# queued_urls rows whose status is in_progress, todo, or error (in that order).
# Exits with status 1 if the DB file does not exist or is not a regular file,
# and with sqlite3's exit status if a query fails.

if [[ $# -eq 1 ]]
then
  filename="$1"
else
  filename=wpull.db
fi
if [[ ! -f "${filename}" ]]
then
  echo "Error: ${filename} does not exist or is not a regular file" >&2
  exit 1
fi

for status in in_progress todo error
do
  # Always query the validated ${filename} (never the raw "$1", which is empty
  # when the default is in use). The status value is a single-quoted SQL string
  # literal; double quotes denote identifiers in SQL and only work as strings
  # through a SQLite compatibility fallback.
  sqlite3 "${filename}" "SELECT url_strings.url FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE status = '${status}'" || exit
done

+ 41
- 0
wpull2-requeue ファイルの表示

@@ -0,0 +1,41 @@
#!/bin/bash
# Usage: wpull2-requeue [ACTION] [FILENAME] URLPATTERN
# ACTION can be 'count' (default), 'print', or 'write'
#   count: print the number of matching queue entries
#   print: print the matching queued_urls/url_strings rows
#   write: set the matching entries back to status 'todo' with try_count 0
# FILENAME defaults to 'wpull.db'
# URLPATTERN uses SQLite's LIKE syntax with ESCAPE "\", i.e. % matches any number of characters, _ matches exactly one character, and a backslash can be used to escape these special characters.
# Single quotes in URLPATTERN are escaped before the pattern is embedded in the query.
#
# Only entries with status 'skipped' and try_count > 3 are considered.
# With two arguments, the first one is taken as ACTION if it is one of the
# three action names, and as FILENAME otherwise.

usage() {
  echo "Usage: wpull2-requeue [ACTION] [FILENAME] URLPATTERN" >&2
  echo "ACTION can be 'count' (default), 'print', or 'write'" >&2
  exit 1
}

if [[ $# -lt 1 || $# -gt 3 ]]
then
  usage
fi

if [[ $# -eq 3 || ( $# -eq 2 && ( "$1" == 'count' || "$1" == 'print' || "$1" == 'write' )) ]]
then
  action="$1"
  shift
else
  action=count
fi

# Reject unknown actions explicitly. Without this check, a mistyped ACTION in
# the three-argument form would fall through to the destructive 'write' branch.
if [[ "${action}" != 'count' && "${action}" != 'print' && "${action}" != 'write' ]]
then
  echo "Error: unknown action '${action}'" >&2
  usage
fi

if [[ $# -eq 2 ]]
then
  filename="$1"
  shift
else
  filename=wpull.db
fi
if [[ ! -f "${filename}" ]]
then
  echo "Error: ${filename} does not exist or is not a regular file" >&2
  exit 1
fi

urlpattern="$1"

# Embed the pattern as a single-quoted SQL string literal, doubling any single
# quote it contains so that it cannot terminate the literal early.
sq="'"
escaped_pattern=${urlpattern//${sq}/${sq}${sq}}

# Shared FROM/WHERE tail of all three statements. "\\" yields one backslash,
# the LIKE escape character.
query="FROM queued_urls JOIN url_strings ON url_string_id = url_strings.id WHERE url LIKE '${escaped_pattern}' ESCAPE '\\' AND status = 'skipped' AND try_count > 3"

case "${action}" in
  count)
    sqlite3 "${filename}" "SELECT COUNT(queued_urls.id) ${query}"
    ;;
  print)
    sqlite3 "${filename}" "SELECT queued_urls.*, url_strings.* ${query}"
    ;;
  write)
    sqlite3 "${filename}" "UPDATE queued_urls SET status = 'todo', try_count = 0 WHERE id IN (SELECT queued_urls.id ${query})"
    ;;
esac

+ 30
- 1
wpull2-url-origin ファイルの表示

@@ -1,3 +1,32 @@
#!/bin/bash
# Usage: wpull2-url-origin [FILENAME] URL
# FILENAME defaults to wpull.db
# Trace back where a URL was discovered, all the way back to the root
#
# For the given URL and then for each parent in turn, prints the joined
# queued_urls/url_strings row, following parent_url_string_id until an entry
# with level 0 is reached.
# Exits with status 1 on wrong usage, if the DB file does not exist, or if the
# URL is not present in url_strings.

if [[ $# -lt 1 || $# -gt 2 ]]
then
  echo "Usage: wpull2-url-origin [FILENAME] URL" >&2
  exit 1
fi

if [[ $# -eq 2 ]]
then
  filename="$1"
  shift
else
  filename=wpull.db
fi
if [[ ! -f "${filename}" ]]
then
  echo "Error: ${filename} does not exist or is not a regular file" >&2
  exit 1
fi

url="$1"

# Embed the URL as a single-quoted SQL string literal, doubling any single
# quote it contains so that it cannot terminate the literal early.
sq="'"
escaped_url=${url//${sq}/${sq}${sq}}

curId=$(sqlite3 "${filename}" "SELECT id FROM url_strings WHERE url = '${escaped_url}'")
if [[ -z "${curId}" ]]
then
  echo "Error: ${url} not found" >&2
  exit 1
fi

while :
do
  # Print the queue entry for the current URL.
  sqlite3 "${filename}" "SELECT queued_urls.*, url_strings.* FROM queued_urls JOIN url_strings ON queued_urls.url_string_id = url_strings.id WHERE url_strings.id = ${curId}"

  # Step to the parent. sqlite3 separates columns with '|' by default.
  IFS='|' read -r curId level < <(sqlite3 "${filename}" "SELECT parent_url_string_id, level FROM queued_urls WHERE url_string_id = ${curId}")

  # Stop at the root (level 0). Also stop when there is nothing left to follow:
  # no queue row was returned (level is empty) or the entry has no parent id.
  if [[ -z "${level}" || -z "${curId}" || "${level}" -eq 0 ]]
  then
    break
  fi
done

読み込み中…
キャンセル
保存