The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

50 lignes
3.3 KiB

  1. #!/bin/bash
  2. # Reads HTML from stdin, extracts links and page requisites with simple string splitting and regexes.
  3. # Usage cannot be recommended against enough.
  4. # Produces lines of 'TAG URL', e.g. 'a https://example.org/'.
  5. {
  6. # Reformat so each line is one tag
  7. # Yes, this may break attribute values if they contain CR, LF, or <, but that's rare enough.
  8. tr '\r\n' ' ' | tr '<' '\n' |
  9. # Extract tags of interest
  10. grep -ai '^\(a\|base\|img\|link\|script\)\s' |
  11. # Fix scripty backslash nonsense
  12. perl -pe 's,\\,,g' |
  13. # Split img tags with src and srcset
  14. perl -pe "s,^img(?=\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?src\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(?:\s|>))\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?srcset\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\s|>).*,img src=\1\nimg srcset=\2,i" |
  15. # Extract interesting tags/attributes
  16. perl -pe "s,^(a|base)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?href\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
  17. s,^(img|script)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?src\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
  18. s,^(link)\s(?=(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?rel\s*=\s*(?:stylesheet|\"(?:[^\"]*\s)?stylesheet(?:\s[^\"]*)?\"|'(?:[^']*\s)?stylesheet(?:\s[^']*)?')(?:\$|\s|>))(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?href\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
  19. s,^(img)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?(srcset)\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1\2 \3,i;
  20. # Ensure that there's a LF at the end of each line since \s might match it.
  21. s,\s*$,\n,;
  22. " |
  23. # Filter out unprocessed lines
  24. grep -a '^+' | sed 's,^+,,' |
  25. # Remove quotes from attribute values
  26. perl -pe "s,^([a-zA-Z]+) (['\"])(.*)\2$,\1 \3," |
  27. # Filter out lines without an attribute value
  28. grep -Pva '^[a-zA-Z]+ $' |
  29. # Remove lines with invalid UTF-8
  30. LANG=C.UTF-8 grep -a '^.*$' |
  31. # img srcset splitting
  32. python3 -c 'import os, re, sys'$'\n''try:'$'\n'' for l in map(str.strip, sys.stdin):'$'\n'' tag, value = l.split(" ", 1)'$'\n'' tag = tag.lower()'$'\n'' if tag != "imgsrcset":'$'\n'' print(l); continue'$'\n'' for url in re.split(r"\s+\d+[wx]\s*(?:,\s*|$)|,\s+", value.strip()):'$'\n'' if url: print(f"img {url}")'$'\n''except BrokenPipeError:'$'\n'' os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()); sys.exit(1)' |
  33. # Decode HTML references
  34. python3 -c 'import html, os, sys'$'\n''try:'$'\n'' for l in sys.stdin:'$'\n'' print(html.unescape(l.strip()).split("\r", 1)[0].split("\n", 1)[0])'$'\n''except BrokenPipeError:'$'\n'' os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()); sys.exit(1)' |
  35. # Combine base and values to get absolute URLs
  36. # If multiple base tags are present, they all get respected. This violates the HTML specs.
  37. python3 -c 'import os, sys, urllib.parse; base = None'$'\n''try:'$'\n'' for l in map(str.strip, sys.stdin):'$'\n'' tag, value = l.split(" ", 1)'$'\n'' tag = tag.lower()'$'\n'' if base:'$'\n'' try:'$'\n'' value = urllib.parse.urljoin(base, value)'$'\n'' except ValueError as e:'$'\n'' print(f"Could not merge {base = }, {l = }: {type(e)} {e}", file = sys.stderr)'$'\n'' continue'$'\n'' if tag == "base":'$'\n'' base = value'$'\n'' continue'$'\n'' print(f"{tag} {value}")'$'\n''except BrokenPipeError:'$'\n'' os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()); sys.exit(1)'
  38. }