You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

385 lines
10 KiB

  1. // Package sanitize provides functions for sanitizing text.
  2. package sanitize
  3. import (
  4. "bytes"
  5. "html"
  6. "html/template"
  7. "io"
  8. "path"
  9. "regexp"
  10. "strings"
  11. parser "golang.org/x/net/html"
  12. )
var (
	// ignoreTags lists tags whose entire content (including any nested markup)
	// is dropped by HTMLAllowing, rather than merely having the tag removed.
	ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}

	// defaultTags are the tags HTMLAllowing keeps when no allow-list is passed.
	defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote"}

	// defaultAttributes are the attributes HTMLAllowing keeps on allowed tags
	// when no attribute allow-list is passed.
	defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
)
  18. // HTMLAllowing sanitizes html, allowing some tags.
  19. // Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.
  20. func HTMLAllowing(s string, args ...[]string) (string, error) {
  21. allowedTags := defaultTags
  22. if len(args) > 0 {
  23. allowedTags = args[0]
  24. }
  25. allowedAttributes := defaultAttributes
  26. if len(args) > 1 {
  27. allowedAttributes = args[1]
  28. }
  29. // Parse the html
  30. tokenizer := parser.NewTokenizer(strings.NewReader(s))
  31. buffer := bytes.NewBufferString("")
  32. ignore := ""
  33. for {
  34. tokenType := tokenizer.Next()
  35. token := tokenizer.Token()
  36. switch tokenType {
  37. case parser.ErrorToken:
  38. err := tokenizer.Err()
  39. if err == io.EOF {
  40. return buffer.String(), nil
  41. }
  42. return "", err
  43. case parser.StartTagToken:
  44. if len(ignore) == 0 && includes(allowedTags, token.Data) {
  45. token.Attr = cleanAttributes(token.Attr, allowedAttributes)
  46. buffer.WriteString(token.String())
  47. } else if includes(ignoreTags, token.Data) {
  48. ignore = token.Data
  49. }
  50. case parser.SelfClosingTagToken:
  51. if len(ignore) == 0 && includes(allowedTags, token.Data) {
  52. token.Attr = cleanAttributes(token.Attr, allowedAttributes)
  53. buffer.WriteString(token.String())
  54. } else if token.Data == ignore {
  55. ignore = ""
  56. }
  57. case parser.EndTagToken:
  58. if len(ignore) == 0 && includes(allowedTags, token.Data) {
  59. token.Attr = []parser.Attribute{}
  60. buffer.WriteString(token.String())
  61. } else if token.Data == ignore {
  62. ignore = ""
  63. }
  64. case parser.TextToken:
  65. // We allow text content through, unless ignoring this entire tag and its contents (including other tags)
  66. if ignore == "" {
  67. buffer.WriteString(token.String())
  68. }
  69. case parser.CommentToken:
  70. // We ignore comments by default
  71. case parser.DoctypeToken:
  72. // We ignore doctypes by default - html5 does not require them and this is intended for sanitizing snippets of text
  73. default:
  74. // We ignore unknown token types by default
  75. }
  76. }
  77. }
  78. // HTML strips html tags, replace common entities, and escapes <>&;'" in the result.
  79. // Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
  80. func HTML(s string) string {
  81. output := ""
  82. // Shortcut strings with no tags in them
  83. if !strings.ContainsAny(s, "<>") {
  84. output = s
  85. } else {
  86. // First remove line breaks etc as these have no meaning outside html tags (except pre)
  87. // this means pre sections will lose formatting... but will result in less uninentional paras.
  88. s = strings.Replace(s, "\n", "", -1)
  89. // Then replace line breaks with newlines, to preserve that formatting
  90. s = strings.Replace(s, "</p>", "\n", -1)
  91. s = strings.Replace(s, "<br>", "\n", -1)
  92. s = strings.Replace(s, "</br>", "\n", -1)
  93. s = strings.Replace(s, "<br/>", "\n", -1)
  94. // Walk through the string removing all tags
  95. b := bytes.NewBufferString("")
  96. inTag := false
  97. for _, r := range s {
  98. switch r {
  99. case '<':
  100. inTag = true
  101. case '>':
  102. inTag = false
  103. default:
  104. if !inTag {
  105. b.WriteRune(r)
  106. }
  107. }
  108. }
  109. output = b.String()
  110. }
  111. // Remove a few common harmless entities, to arrive at something more like plain text
  112. output = strings.Replace(output, "&#8216;", "'", -1)
  113. output = strings.Replace(output, "&#8217;", "'", -1)
  114. output = strings.Replace(output, "&#8220;", "\"", -1)
  115. output = strings.Replace(output, "&#8221;", "\"", -1)
  116. output = strings.Replace(output, "&nbsp;", " ", -1)
  117. output = strings.Replace(output, "&quot;", "\"", -1)
  118. output = strings.Replace(output, "&apos;", "'", -1)
  119. // Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
  120. output = html.UnescapeString(output)
  121. // In case we have missed any tags above, escape the text - removes <, >, &, ' and ".
  122. output = template.HTMLEscapeString(output)
  123. // After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
  124. output = strings.Replace(output, "&#34;", "\"", -1)
  125. output = strings.Replace(output, "&#39;", "'", -1)
  126. output = strings.Replace(output, "&amp; ", "& ", -1) // NB space after
  127. output = strings.Replace(output, "&amp;amp; ", "& ", -1) // NB space after
  128. return output
  129. }
  130. // We are very restrictive as this is intended for ascii url slugs
  131. var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
  132. // Path makes a string safe to use as an url path.
  133. func Path(s string) string {
  134. // Start with lowercase string
  135. filePath := strings.ToLower(s)
  136. filePath = strings.Replace(filePath, "..", "", -1)
  137. filePath = path.Clean(filePath)
  138. // Remove illegal characters for paths, flattening accents and replacing some common separators with -
  139. filePath = cleanString(filePath, illegalPath)
  140. // NB this may be of length 0, caller must check
  141. return filePath
  142. }
  143. // Remove all other unrecognised characters apart from
  144. var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)
  145. // Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
  146. func Name(s string) string {
  147. // Start with lowercase string
  148. fileName := strings.ToLower(s)
  149. fileName = path.Clean(path.Base(fileName))
  150. // Remove illegal characters for names, replacing some common separators with -
  151. fileName = cleanString(fileName, illegalName)
  152. // NB this may be of length 0, caller must check
  153. return fileName
  154. }
  155. // Replace these separators with -
  156. var baseNameSeparators = regexp.MustCompile(`[./]`)
  157. // BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.
  158. // No attempt is made to normalise a path or normalise case.
  159. func BaseName(s string) string {
  160. // Replace certain joining characters with a dash
  161. baseName := baseNameSeparators.ReplaceAllString(s, "-")
  162. // Remove illegal characters for names, replacing some common separators with -
  163. baseName = cleanString(baseName, illegalName)
  164. // NB this may be of length 0, caller must check
  165. return baseName
  166. }
// A very limited list of transliterations to catch common european names translated to urls.
// This set could be expanded with at least caps and many more characters.
// NOTE(review): coverage is asymmetric - e.g. 'ū', 'ō', 'ś', 'ż', 'ń' have no
// uppercase counterparts here, and 'Ú'/'Ü'/'Û' are present while 'Ū' is not.
var transliterations = map[rune]string{
	'À': "A",
	'Á': "A",
	'Â': "A",
	'Ã': "A",
	'Ä': "A",
	'Å': "AA",
	'Æ': "AE",
	'Ç': "C",
	'È': "E",
	'É': "E",
	'Ê': "E",
	'Ë': "E",
	'Ì': "I",
	'Í': "I",
	'Î': "I",
	'Ï': "I",
	'Ð': "D",
	'Ł': "L",
	'Ñ': "N",
	'Ò': "O",
	'Ó': "O",
	'Ô': "O",
	'Õ': "O",
	'Ö': "O",
	'Ø': "OE",
	'Ù': "U",
	'Ú': "U",
	'Ü': "U",
	'Û': "U",
	'Ý': "Y",
	'Þ': "Th",
	'ß': "ss",
	'à': "a",
	'á': "a",
	'â': "a",
	'ã': "a",
	'ä': "a",
	'å': "aa",
	'æ': "ae",
	'ç': "c",
	'è': "e",
	'é': "e",
	'ê': "e",
	'ë': "e",
	'ì': "i",
	'í': "i",
	'î': "i",
	'ï': "i",
	'ð': "d",
	'ł': "l",
	'ñ': "n",
	'ń': "n",
	'ò': "o",
	'ó': "o",
	'ô': "o",
	'õ': "o",
	'ō': "o",
	'ö': "o",
	'ø': "oe",
	'ś': "s",
	'ù': "u",
	'ú': "u",
	'û': "u",
	'ū': "u",
	'ü': "u",
	'ý': "y",
	'þ': "th",
	'ÿ': "y",
	'ż': "z",
	'Œ': "OE",
	'œ': "oe",
}
  242. // Accents replaces a set of accented characters with ascii equivalents.
  243. func Accents(s string) string {
  244. // Replace some common accent characters
  245. b := bytes.NewBufferString("")
  246. for _, c := range s {
  247. // Check transliterations first
  248. if val, ok := transliterations[c]; ok {
  249. b.WriteString(val)
  250. } else {
  251. b.WriteRune(c)
  252. }
  253. }
  254. return b.String()
  255. }
  256. var (
  257. // If the attribute contains data: or javascript: anywhere, ignore it
  258. // we don't allow this in attributes as it is so frequently used for xss
  259. // NB we allow spaces in the value, and lowercase.
  260. illegalAttr = regexp.MustCompile(`(d\s*a\s*t\s*a|j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*)\s*:`)
  261. // We are far more restrictive with href attributes.
  262. legalHrefAttr = regexp.MustCompile(`\A[/#][^/\\]?|mailto://|http://|https://`)
  263. )
  264. // cleanAttributes returns an array of attributes after removing malicious ones.
  265. func cleanAttributes(a []parser.Attribute, allowed []string) []parser.Attribute {
  266. if len(a) == 0 {
  267. return a
  268. }
  269. var cleaned []parser.Attribute
  270. for _, attr := range a {
  271. if includes(allowed, attr.Key) {
  272. val := strings.ToLower(attr.Val)
  273. // Check for illegal attribute values
  274. if illegalAttr.FindString(val) != "" {
  275. attr.Val = ""
  276. }
  277. // Check for legal href values - / mailto:// http:// or https://
  278. if attr.Key == "href" {
  279. if legalHrefAttr.FindString(val) == "" {
  280. attr.Val = ""
  281. }
  282. }
  283. // If we still have an attribute, append it to the array
  284. if attr.Val != "" {
  285. cleaned = append(cleaned, attr)
  286. }
  287. }
  288. }
  289. return cleaned
  290. }
  291. // A list of characters we consider separators in normal strings and replace with our canonical separator - rather than removing.
  292. var (
  293. separators = regexp.MustCompile(`[ &_=+:]`)
  294. dashes = regexp.MustCompile(`[\-]+`)
  295. )
  296. // cleanString replaces separators with - and removes characters listed in the regexp provided from string.
  297. // Accents, spaces, and all characters not in A-Za-z0-9 are replaced.
  298. func cleanString(s string, r *regexp.Regexp) string {
  299. // Remove any trailing space to avoid ending on -
  300. s = strings.Trim(s, " ")
  301. // Flatten accents first so that if we remove non-ascii we still get a legible name
  302. s = Accents(s)
  303. // Replace certain joining characters with a dash
  304. s = separators.ReplaceAllString(s, "-")
  305. // Remove all other unrecognised characters - NB we do allow any printable characters
  306. s = r.ReplaceAllString(s, "")
  307. // Remove any multiple dashes caused by replacements above
  308. s = dashes.ReplaceAllString(s, "-")
  309. return s
  310. }
  311. // includes checks for inclusion of a string in a []string.
  312. func includes(a []string, s string) bool {
  313. for _, as := range a {
  314. if as == s {
  315. return true
  316. }
  317. }
  318. return false
  319. }