From 7f0809270bb998721779c2e2ad9953b951dc65cf Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 6 Jun 2023 20:54:08 +0000 Subject: [PATCH] Catch urljoin exceptions (e.g. invalid IPv6) --- html-extract-stupid | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html-extract-stupid b/html-extract-stupid index 725fdaf..e435b34 100755 --- a/html-extract-stupid +++ b/html-extract-stupid @@ -42,5 +42,5 @@ # Combine base and values to get absolute URLs # If multiple base tags are present, they all get respected. This violates the HTML specs. - python3 -c 'import os, sys, urllib.parse; base = None'$'\n''try:'$'\n'' for l in map(str.strip, sys.stdin):'$'\n'' tag, value = l.split(" ", 1)'$'\n'' tag = tag.lower()'$'\n'' if base:'$'\n'' value = urllib.parse.urljoin(base, value)'$'\n'' if tag == "base":'$'\n'' base = value'$'\n'' continue'$'\n'' print(f"{tag} {value}")'$'\n''except BrokenPipeError:'$'\n'' os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()); sys.exit(1)' + python3 -c 'import os, sys, urllib.parse; base = None'$'\n''try:'$'\n'' for l in map(str.strip, sys.stdin):'$'\n'' tag, value = l.split(" ", 1)'$'\n'' tag = tag.lower()'$'\n'' if base:'$'\n'' try:'$'\n'' value = urllib.parse.urljoin(base, value)'$'\n'' except ValueError as e:'$'\n'' print(f"Could not merge {base = }, {l = }: {type(e)} {e}", file = sys.stderr)'$'\n'' continue'$'\n'' if tag == "base":'$'\n'' base = value'$'\n'' continue'$'\n'' print(f"{tag} {value}")'$'\n''except BrokenPipeError:'$'\n'' os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()); sys.exit(1)' }