The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

47 lines
2.8 KiB

  1. #!/bin/bash
  2. # Reads HTML from stdin, extracts links and page requisites with simple string splitting and regexes.
  3. # Usage cannot be recommended against enough.
  4. # Produces lines of 'TAG URL', e.g. 'a https://example.org/'.
  5. {
  6. # Reformat so each line is one tag
  7. # Yes, this may break attribute values if they contain CR, LF, or <, but that's rare enough.
  8. tr '\r\n' ' ' | tr '<' '\n' |
  9. # Extract tags of interest
  10. grep -i '^\(a\|base\|img\|link\|script\)\s' |
  11. # Fix scripty backslash nonsense
  12. perl -pe 's,\\,,g' |
  13. # Split img tags with src and srcset
  14. perl -pe "s,^img(?=\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?src\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(?:\s|>))\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?srcset\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\s|>).*,img src=\1\nimg srcset=\2,i" |
  15. # Extract interesting tags/attributes
  16. perl -pe "s,^(a|base)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?href\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
  17. s,^(img|script)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?src\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
  18. s,^(link)\s(?=(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?rel\s*=\s*(?:stylesheet|\"(?:[^\"]*\s)?stylesheet(?:\s[^\"]*)?\"|'(?:[^']*\s)?stylesheet(?:\s[^']*)?')(?:\$|\s|>))(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?href\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
  19. s,^(img)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?(srcset)\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1\2 \3,i;
  20. # Ensure that there's a LF at the end of each line since \s might match it.
  21. s,\s*$,\n,;
  22. " |
  23. # Filter out unprocessed lines
  24. grep '^+' | sed 's,^+,,' |
  25. # Remove quotes from attribute values
  26. perl -pe "s,^([a-zA-Z]+) (['\"])(.*)\2$,\1 \3," |
  27. # Filter out lines without an attribute value
  28. grep -Pv '^[a-zA-Z]+ $' |
  29. # img srcset splitting
  30. python3 -c 'import re, sys'$'\n''for l in map(str.strip, sys.stdin):'$'\n'' try:'$'\n'' tag, value = l.split(" ", 1)'$'\n'' tag = tag.lower()'$'\n'' if tag != "imgsrcset":'$'\n'' print(l); continue'$'\n'' for url in re.split(r"\s+\d+[wx]\s*(?:,\s*|$)|,\s+", value.strip()):'$'\n'' if url: print(f"img {url}")'$'\n'' except BrokenPipeError: break' |
  31. # Decode HTML references
  32. python3 -c 'import html, sys'$'\n''for l in sys.stdin:'$'\n'' try: print(html.unescape(l.strip()))'$'\n'' except BrokenPipeError: break' |
  33. # Combine base and values to get absolute URLs
  34. # If multiple base tags are present, they all get respected. This violates the HTML specs.
  35. python3 -c 'import sys, urllib.parse; base = None'$'\n''for l in map(str.strip, sys.stdin):'$'\n'' tag, value = l.split(" ", 1)'$'\n'' tag = tag.lower()'$'\n'' if base:'$'\n'' value = urllib.parse.urljoin(base, value)'$'\n'' if tag == "base":'$'\n'' base = value'$'\n'' continue'$'\n'' try: print(f"{tag} {value}")'$'\n'' except BrokenPipeError: break'
  36. }