JustAnotherArchivist
/
little-things


			
				
					
						
						
							
							#!/bin/bash
# Reads HTML from stdin, extracts links and page requisites with simple string splitting and regexes.
# Usage cannot be recommended against enough.
# Produces lines of 'TAG URL', e.g. 'a https://example.org/'.
{
	# Reformat so each line is one tag
	# Yes, this may break attribute values if they contain CR, LF, or <, but that's rare enough.
	tr '\r\n' '  ' | tr '<' '\n' |

	# Extract tags of interest
	grep -ai '^\(a\|base\|img\|link\|script\)\s' |

	# Fix scripty backslash nonsense
	perl -pe 's,\\,,g' |

	# Split img tags with src and srcset
	perl -pe "s,^img(?=\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?src\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(?:\s|>))\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?srcset\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\s|>).*,img src=\1\nimg srcset=\2,i" |

	# Extract interesting tags/attributes
	perl -pe "s,^(a|base)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?href\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
	          s,^(img|script)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?src\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
	          s,^(link)\s(?=(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?rel\s*=\s*(?:stylesheet|\"(?:[^\"]*\s)?stylesheet(?:\s[^\"]*)?\"|'(?:[^']*\s)?stylesheet(?:\s[^']*)?')(?:\$|\s|>))(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?href\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1 \2,i;
	          s,^(img)\s(?:(?:[^>'\"]|\"[^\"]*\"|'[^']*')*\s)?(srcset)\s*=\s*([^>'\"\s]*|\"[^\"]*\"|'[^']*')(\$|\s|>).*,+\1\2 \3,i;
	          # Ensure that there's a LF at the end of each line since \s might match it.
	          s,\s*$,\n,;
	         " |

	# Filter out unprocessed lines
	grep -a '^+' | sed 's,^+,,' |

	# Remove quotes from attribute values
	perl -pe "s,^([a-zA-Z]+) (['\"])(.*)\2$,\1 \3," |

	# Filter out lines without an attribute value
	grep -Pva '^[a-zA-Z]+ $' |

	# Remove lines with invalid UTF-8
	LANG=C.UTF-8 grep -a '^.*$' |

	# img srcset splitting
	python3 -c 'import os, re, sys'$'\n''try:'$'\n'' for l in map(str.strip, sys.stdin):'$'\n''  tag, value = l.split(" ", 1)'$'\n''  tag = tag.lower()'$'\n''  if tag != "imgsrcset":'$'\n''   print(l); continue'$'\n''  for url in re.split(r"\s+\d+[wx]\s*(?:,\s*|$)|,\s+", value.strip()):'$'\n''   if url: print(f"img {url}")'$'\n''except BrokenPipeError:'$'\n'' os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()); sys.exit(1)' |

	# Decode HTML references
	python3 -c 'import html, os, sys'$'\n''try:'$'\n'' for l in sys.stdin:'$'\n''  print(html.unescape(l.strip()).split("\r", 1)[0].split("\n", 1)[0])'$'\n''except BrokenPipeError:'$'\n'' os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()); sys.exit(1)' |

	# Combine base and values to get absolute URLs
	# If multiple base tags are present, they all get respected. This violates the HTML specs.
	python3 -c 'import os, sys, urllib.parse; base = None'$'\n''try:'$'\n'' for l in map(str.strip, sys.stdin):'$'\n''  tag, value = l.split(" ", 1)'$'\n''  tag = tag.lower()'$'\n''  if base:'$'\n''   try:'$'\n''    value = urllib.parse.urljoin(base, value)'$'\n''   except ValueError as e:'$'\n''    print(f"Could not merge {base = }, {l = }: {type(e)} {e}", file = sys.stderr)'$'\n''    continue'$'\n''  if tag == "base":'$'\n''   base = value'$'\n''   continue'$'\n''  print(f"{tag} {value}")'$'\n''except BrokenPipeError:'$'\n'' os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()); sys.exit(1)'
}