Sfoglia il codice sorgente

Normalise URLs everywhere to reduce duplicates

master
JustAnotherArchivist 4 anni fa
parent
commit
79f0bd4332
1 file modificato con 5 aggiunte e 4 eliminazioni
  1. wiki-recursive-extract-normalise (+5, -4)

wiki-recursive-extract → wiki-recursive-extract-normalise Vedi File

@@ -3,6 +3,7 @@
# Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link. # Everything that looks like a social media link (including YouTube) is run through social-media-extract-profile-link.
# Everything else is run through website-extract-social-media. # Everything else is run through website-extract-social-media.
# This is done recursively until no new links are discovered anymore. # This is done recursively until no new links are discovered anymore.
# The output is further fed through url-normalise before, during, and after processing to avoid equivalent but slightly different duplicates.


verbose= verbose=
while [[ $# -gt 0 ]] while [[ $# -gt 0 ]]
@@ -31,7 +32,7 @@ function stderr_annotate {


scriptpath="$(cd "$(dirname "$0")"; pwd -P)" scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
declare -A sectionUrls declare -A sectionUrls
while read -r line
stderr_annotate "${scriptpath}/url-normalise" ${verbose} | while read -r line
do do
echo "${line}" echo "${line}"
if [[ "${line}" == '=='* ]] if [[ "${line}" == '=='* ]]
@@ -57,9 +58,9 @@ do


if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}" if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
then then
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:))
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose})
else else
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:))
mapfile -t outUrls < <(stderr_annotate "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate "${scriptpath}/url-normalise" ${verbose})
fi fi


for outUrl in "${outUrls[@]}" for outUrl in "${outUrls[@]}"
@@ -77,4 +78,4 @@ do
done done
done done
fi fi
done
done | stderr_annotate "${scriptpath}/url-normalise" ${verbose}

Caricamento…
Annulla
Salva