Non puoi selezionare più di 25 argomenti. Gli argomenti devono iniziare con una lettera o un numero, possono includere trattini ('-') e possono essere lunghi fino a 35 caratteri.
 
 
 

2325 righe
54 KiB

  1. // Copyright 2010 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package html
  5. import (
  6. "errors"
  7. "fmt"
  8. "io"
  9. "strings"
  10. a "golang.org/x/net/html/atom"
  11. )
  12. // A parser implements the HTML5 parsing algorithm:
  13. // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
  14. type parser struct {
  15. // tokenizer provides the tokens for the parser.
  16. tokenizer *Tokenizer
  17. // tok is the most recently read token.
  18. tok Token
  19. // Self-closing tags like <hr/> are treated as start tags, except that
  20. // hasSelfClosingToken is set while they are being processed.
  21. hasSelfClosingToken bool
  22. // doc is the document root element.
  23. doc *Node
  24. // The stack of open elements (section 12.2.4.2) and active formatting
  25. // elements (section 12.2.4.3).
  26. oe, afe nodeStack
  27. // Element pointers (section 12.2.4.4).
  28. head, form *Node
  29. // Other parsing state flags (section 12.2.4.5).
  30. scripting, framesetOK bool
  31. // The stack of template insertion modes
  32. templateStack insertionModeStack
  33. // im is the current insertion mode.
  34. im insertionMode
  35. // originalIM is the insertion mode to go back to after completing a text
  36. // or inTableText insertion mode.
  37. originalIM insertionMode
  38. // fosterParenting is whether new elements should be inserted according to
  39. // the foster parenting rules (section 12.2.6.1).
  40. fosterParenting bool
  41. // quirks is whether the parser is operating in "quirks mode."
  42. quirks bool
  43. // fragment is whether the parser is parsing an HTML fragment.
  44. fragment bool
  45. // context is the context element when parsing an HTML fragment
  46. // (section 12.4).
  47. context *Node
  48. }
  49. func (p *parser) top() *Node {
  50. if n := p.oe.top(); n != nil {
  51. return n
  52. }
  53. return p.doc
  54. }
  55. // Stop tags for use in popUntil. These come from section 12.2.4.2.
  56. var (
  57. defaultScopeStopTags = map[string][]a.Atom{
  58. "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
  59. "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
  60. "svg": {a.Desc, a.ForeignObject, a.Title},
  61. }
  62. )
// scope selects the set of stop tags that bounds a search or a clearing of
// the stack of open elements.
type scope int

const (
	// defaultScope stops only at the per-namespace default stop tags.
	defaultScope scope = iota
	// listItemScope additionally stops at ol and ul.
	listItemScope
	// buttonScope additionally stops at button.
	buttonScope
	// tableScope stops at html, table and template.
	tableScope
	// tableRowScope is accepted by clearStackToContext only; it stops at
	// html, tr and template.
	tableRowScope
	// tableBodyScope is accepted by clearStackToContext only; it stops at
	// html, tbody, tfoot, thead and template.
	tableBodyScope
	// selectScope stops at anything other than optgroup and option.
	selectScope
)
  73. // popUntil pops the stack of open elements at the highest element whose tag
  74. // is in matchTags, provided there is no higher element in the scope's stop
  75. // tags (as defined in section 12.2.4.2). It returns whether or not there was
  76. // such an element. If there was not, popUntil leaves the stack unchanged.
  77. //
  78. // For example, the set of stop tags for table scope is: "html", "table". If
  79. // the stack was:
  80. // ["html", "body", "font", "table", "b", "i", "u"]
  81. // then popUntil(tableScope, "font") would return false, but
  82. // popUntil(tableScope, "i") would return true and the stack would become:
  83. // ["html", "body", "font", "table", "b"]
  84. //
  85. // If an element's tag is in both the stop tags and matchTags, then the stack
  86. // will be popped and the function returns true (provided, of course, there was
  87. // no higher element in the stack that was also in the stop tags). For example,
  88. // popUntil(tableScope, "table") returns true and leaves:
  89. // ["html", "body", "font"]
  90. func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
  91. if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
  92. p.oe = p.oe[:i]
  93. return true
  94. }
  95. return false
  96. }
  97. // indexOfElementInScope returns the index in p.oe of the highest element whose
  98. // tag is in matchTags that is in scope. If no matching element is in scope, it
  99. // returns -1.
  100. func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
  101. for i := len(p.oe) - 1; i >= 0; i-- {
  102. tagAtom := p.oe[i].DataAtom
  103. if p.oe[i].Namespace == "" {
  104. for _, t := range matchTags {
  105. if t == tagAtom {
  106. return i
  107. }
  108. }
  109. switch s {
  110. case defaultScope:
  111. // No-op.
  112. case listItemScope:
  113. if tagAtom == a.Ol || tagAtom == a.Ul {
  114. return -1
  115. }
  116. case buttonScope:
  117. if tagAtom == a.Button {
  118. return -1
  119. }
  120. case tableScope:
  121. if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
  122. return -1
  123. }
  124. case selectScope:
  125. if tagAtom != a.Optgroup && tagAtom != a.Option {
  126. return -1
  127. }
  128. default:
  129. panic("unreachable")
  130. }
  131. }
  132. switch s {
  133. case defaultScope, listItemScope, buttonScope:
  134. for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
  135. if t == tagAtom {
  136. return -1
  137. }
  138. }
  139. }
  140. }
  141. return -1
  142. }
  143. // elementInScope is like popUntil, except that it doesn't modify the stack of
  144. // open elements.
  145. func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
  146. return p.indexOfElementInScope(s, matchTags...) != -1
  147. }
  148. // clearStackToContext pops elements off the stack of open elements until a
  149. // scope-defined element is found.
  150. func (p *parser) clearStackToContext(s scope) {
  151. for i := len(p.oe) - 1; i >= 0; i-- {
  152. tagAtom := p.oe[i].DataAtom
  153. switch s {
  154. case tableScope:
  155. if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
  156. p.oe = p.oe[:i+1]
  157. return
  158. }
  159. case tableRowScope:
  160. if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {
  161. p.oe = p.oe[:i+1]
  162. return
  163. }
  164. case tableBodyScope:
  165. if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {
  166. p.oe = p.oe[:i+1]
  167. return
  168. }
  169. default:
  170. panic("unreachable")
  171. }
  172. }
  173. }
  174. // generateImpliedEndTags pops nodes off the stack of open elements as long as
  175. // the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
  176. // If exceptions are specified, nodes with that name will not be popped off.
  177. func (p *parser) generateImpliedEndTags(exceptions ...string) {
  178. var i int
  179. loop:
  180. for i = len(p.oe) - 1; i >= 0; i-- {
  181. n := p.oe[i]
  182. if n.Type == ElementNode {
  183. switch n.DataAtom {
  184. case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:
  185. for _, except := range exceptions {
  186. if n.Data == except {
  187. break loop
  188. }
  189. }
  190. continue
  191. }
  192. }
  193. break
  194. }
  195. p.oe = p.oe[:i+1]
  196. }
  197. // addChild adds a child node n to the top element, and pushes n onto the stack
  198. // of open elements if it is an element node.
  199. func (p *parser) addChild(n *Node) {
  200. if p.shouldFosterParent() {
  201. p.fosterParent(n)
  202. } else {
  203. p.top().AppendChild(n)
  204. }
  205. if n.Type == ElementNode {
  206. p.oe = append(p.oe, n)
  207. }
  208. }
  209. // shouldFosterParent returns whether the next node to be added should be
  210. // foster parented.
  211. func (p *parser) shouldFosterParent() bool {
  212. if p.fosterParenting {
  213. switch p.top().DataAtom {
  214. case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
  215. return true
  216. }
  217. }
  218. return false
  219. }
  220. // fosterParent adds a child node according to the foster parenting rules.
  221. // Section 12.2.6.1, "foster parenting".
  222. func (p *parser) fosterParent(n *Node) {
  223. var table, parent, prev, template *Node
  224. var i int
  225. for i = len(p.oe) - 1; i >= 0; i-- {
  226. if p.oe[i].DataAtom == a.Table {
  227. table = p.oe[i]
  228. break
  229. }
  230. }
  231. var j int
  232. for j = len(p.oe) - 1; j >= 0; j-- {
  233. if p.oe[j].DataAtom == a.Template {
  234. template = p.oe[j]
  235. break
  236. }
  237. }
  238. if template != nil && (table == nil || j > i) {
  239. template.AppendChild(n)
  240. return
  241. }
  242. if table == nil {
  243. // The foster parent is the html element.
  244. parent = p.oe[0]
  245. } else {
  246. parent = table.Parent
  247. }
  248. if parent == nil {
  249. parent = p.oe[i-1]
  250. }
  251. if table != nil {
  252. prev = table.PrevSibling
  253. } else {
  254. prev = parent.LastChild
  255. }
  256. if prev != nil && prev.Type == TextNode && n.Type == TextNode {
  257. prev.Data += n.Data
  258. return
  259. }
  260. parent.InsertBefore(n, table)
  261. }
  262. // addText adds text to the preceding node if it is a text node, or else it
  263. // calls addChild with a new text node.
  264. func (p *parser) addText(text string) {
  265. if text == "" {
  266. return
  267. }
  268. if p.shouldFosterParent() {
  269. p.fosterParent(&Node{
  270. Type: TextNode,
  271. Data: text,
  272. })
  273. return
  274. }
  275. t := p.top()
  276. if n := t.LastChild; n != nil && n.Type == TextNode {
  277. n.Data += text
  278. return
  279. }
  280. p.addChild(&Node{
  281. Type: TextNode,
  282. Data: text,
  283. })
  284. }
  285. // addElement adds a child element based on the current token.
  286. func (p *parser) addElement() {
  287. p.addChild(&Node{
  288. Type: ElementNode,
  289. DataAtom: p.tok.DataAtom,
  290. Data: p.tok.Data,
  291. Attr: p.tok.Attr,
  292. })
  293. }
  294. // Section 12.2.4.3.
  295. func (p *parser) addFormattingElement() {
  296. tagAtom, attr := p.tok.DataAtom, p.tok.Attr
  297. p.addElement()
  298. // Implement the Noah's Ark clause, but with three per family instead of two.
  299. identicalElements := 0
  300. findIdenticalElements:
  301. for i := len(p.afe) - 1; i >= 0; i-- {
  302. n := p.afe[i]
  303. if n.Type == scopeMarkerNode {
  304. break
  305. }
  306. if n.Type != ElementNode {
  307. continue
  308. }
  309. if n.Namespace != "" {
  310. continue
  311. }
  312. if n.DataAtom != tagAtom {
  313. continue
  314. }
  315. if len(n.Attr) != len(attr) {
  316. continue
  317. }
  318. compareAttributes:
  319. for _, t0 := range n.Attr {
  320. for _, t1 := range attr {
  321. if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
  322. // Found a match for this attribute, continue with the next attribute.
  323. continue compareAttributes
  324. }
  325. }
  326. // If we get here, there is no attribute that matches a.
  327. // Therefore the element is not identical to the new one.
  328. continue findIdenticalElements
  329. }
  330. identicalElements++
  331. if identicalElements >= 3 {
  332. p.afe.remove(n)
  333. }
  334. }
  335. p.afe = append(p.afe, p.top())
  336. }
  337. // Section 12.2.4.3.
  338. func (p *parser) clearActiveFormattingElements() {
  339. for {
  340. n := p.afe.pop()
  341. if len(p.afe) == 0 || n.Type == scopeMarkerNode {
  342. return
  343. }
  344. }
  345. }
  346. // Section 12.2.4.3.
  347. func (p *parser) reconstructActiveFormattingElements() {
  348. n := p.afe.top()
  349. if n == nil {
  350. return
  351. }
  352. if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
  353. return
  354. }
  355. i := len(p.afe) - 1
  356. for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
  357. if i == 0 {
  358. i = -1
  359. break
  360. }
  361. i--
  362. n = p.afe[i]
  363. }
  364. for {
  365. i++
  366. clone := p.afe[i].clone()
  367. p.addChild(clone)
  368. p.afe[i] = clone
  369. if i == len(p.afe)-1 {
  370. break
  371. }
  372. }
  373. }
// acknowledgeSelfClosingTag clears the flag that marks the current token as
// self-closing. Section 12.2.5.
func (p *parser) acknowledgeSelfClosingTag() {
	p.hasSelfClosingToken = false
}
// An insertion mode (section 12.2.4.1) is the state transition function from
// a particular state in the HTML5 parser's state machine. It updates the
// parser's fields depending on parser.tok (where ErrorToken means EOF).
// It returns whether the token was consumed; a false result means the token
// still has to be processed, typically by the insertion mode that was just
// installed in parser.im.
type insertionMode func(*parser) bool
  383. // setOriginalIM sets the insertion mode to return to after completing a text or
  384. // inTableText insertion mode.
  385. // Section 12.2.4.1, "using the rules for".
  386. func (p *parser) setOriginalIM() {
  387. if p.originalIM != nil {
  388. panic("html: bad parser state: originalIM was set twice")
  389. }
  390. p.originalIM = p.im
  391. }
  392. // Section 12.2.4.1, "reset the insertion mode".
  393. func (p *parser) resetInsertionMode() {
  394. for i := len(p.oe) - 1; i >= 0; i-- {
  395. n := p.oe[i]
  396. last := i == 0
  397. if last && p.context != nil {
  398. n = p.context
  399. }
  400. switch n.DataAtom {
  401. case a.Select:
  402. if !last {
  403. for ancestor, first := n, p.oe[0]; ancestor != first; {
  404. ancestor = p.oe[p.oe.index(ancestor)-1]
  405. switch ancestor.DataAtom {
  406. case a.Template:
  407. p.im = inSelectIM
  408. return
  409. case a.Table:
  410. p.im = inSelectInTableIM
  411. return
  412. }
  413. }
  414. }
  415. p.im = inSelectIM
  416. case a.Td, a.Th:
  417. // TODO: remove this divergence from the HTML5 spec.
  418. //
  419. // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
  420. p.im = inCellIM
  421. case a.Tr:
  422. p.im = inRowIM
  423. case a.Tbody, a.Thead, a.Tfoot:
  424. p.im = inTableBodyIM
  425. case a.Caption:
  426. p.im = inCaptionIM
  427. case a.Colgroup:
  428. p.im = inColumnGroupIM
  429. case a.Table:
  430. p.im = inTableIM
  431. case a.Template:
  432. // TODO: remove this divergence from the HTML5 spec.
  433. if n.Namespace != "" {
  434. continue
  435. }
  436. p.im = p.templateStack.top()
  437. case a.Head:
  438. // TODO: remove this divergence from the HTML5 spec.
  439. //
  440. // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
  441. p.im = inHeadIM
  442. case a.Body:
  443. p.im = inBodyIM
  444. case a.Frameset:
  445. p.im = inFramesetIM
  446. case a.Html:
  447. if p.head == nil {
  448. p.im = beforeHeadIM
  449. } else {
  450. p.im = afterHeadIM
  451. }
  452. default:
  453. if last {
  454. p.im = inBodyIM
  455. return
  456. }
  457. continue
  458. }
  459. return
  460. }
  461. }
// whitespace is the cutset passed to strings.TrimLeft by the insertion modes
// to strip leading whitespace from text tokens: space, tab, carriage return,
// line feed and form feed.
const whitespace = " \t\r\n\f"
  463. // Section 12.2.6.4.1.
  464. func initialIM(p *parser) bool {
  465. switch p.tok.Type {
  466. case TextToken:
  467. p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
  468. if len(p.tok.Data) == 0 {
  469. // It was all whitespace, so ignore it.
  470. return true
  471. }
  472. case CommentToken:
  473. p.doc.AppendChild(&Node{
  474. Type: CommentNode,
  475. Data: p.tok.Data,
  476. })
  477. return true
  478. case DoctypeToken:
  479. n, quirks := parseDoctype(p.tok.Data)
  480. p.doc.AppendChild(n)
  481. p.quirks = quirks
  482. p.im = beforeHTMLIM
  483. return true
  484. }
  485. p.quirks = true
  486. p.im = beforeHTMLIM
  487. return false
  488. }
  489. // Section 12.2.6.4.2.
  490. func beforeHTMLIM(p *parser) bool {
  491. switch p.tok.Type {
  492. case DoctypeToken:
  493. // Ignore the token.
  494. return true
  495. case TextToken:
  496. p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
  497. if len(p.tok.Data) == 0 {
  498. // It was all whitespace, so ignore it.
  499. return true
  500. }
  501. case StartTagToken:
  502. if p.tok.DataAtom == a.Html {
  503. p.addElement()
  504. p.im = beforeHeadIM
  505. return true
  506. }
  507. case EndTagToken:
  508. switch p.tok.DataAtom {
  509. case a.Head, a.Body, a.Html, a.Br:
  510. p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
  511. return false
  512. default:
  513. // Ignore the token.
  514. return true
  515. }
  516. case CommentToken:
  517. p.doc.AppendChild(&Node{
  518. Type: CommentNode,
  519. Data: p.tok.Data,
  520. })
  521. return true
  522. }
  523. p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
  524. return false
  525. }
  526. // Section 12.2.6.4.3.
  527. func beforeHeadIM(p *parser) bool {
  528. switch p.tok.Type {
  529. case TextToken:
  530. p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
  531. if len(p.tok.Data) == 0 {
  532. // It was all whitespace, so ignore it.
  533. return true
  534. }
  535. case StartTagToken:
  536. switch p.tok.DataAtom {
  537. case a.Head:
  538. p.addElement()
  539. p.head = p.top()
  540. p.im = inHeadIM
  541. return true
  542. case a.Html:
  543. return inBodyIM(p)
  544. }
  545. case EndTagToken:
  546. switch p.tok.DataAtom {
  547. case a.Head, a.Body, a.Html, a.Br:
  548. p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
  549. return false
  550. default:
  551. // Ignore the token.
  552. return true
  553. }
  554. case CommentToken:
  555. p.addChild(&Node{
  556. Type: CommentNode,
  557. Data: p.tok.Data,
  558. })
  559. return true
  560. case DoctypeToken:
  561. // Ignore the token.
  562. return true
  563. }
  564. p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
  565. return false
  566. }
  567. // Section 12.2.6.4.4.
  568. func inHeadIM(p *parser) bool {
  569. switch p.tok.Type {
  570. case TextToken:
  571. s := strings.TrimLeft(p.tok.Data, whitespace)
  572. if len(s) < len(p.tok.Data) {
  573. // Add the initial whitespace to the current node.
  574. p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
  575. if s == "" {
  576. return true
  577. }
  578. p.tok.Data = s
  579. }
  580. case StartTagToken:
  581. switch p.tok.DataAtom {
  582. case a.Html:
  583. return inBodyIM(p)
  584. case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta:
  585. p.addElement()
  586. p.oe.pop()
  587. p.acknowledgeSelfClosingTag()
  588. return true
  589. case a.Script, a.Title, a.Noscript, a.Noframes, a.Style:
  590. p.addElement()
  591. p.setOriginalIM()
  592. p.im = textIM
  593. return true
  594. case a.Head:
  595. // Ignore the token.
  596. return true
  597. case a.Template:
  598. p.addElement()
  599. p.afe = append(p.afe, &scopeMarker)
  600. p.framesetOK = false
  601. p.im = inTemplateIM
  602. p.templateStack = append(p.templateStack, inTemplateIM)
  603. return true
  604. }
  605. case EndTagToken:
  606. switch p.tok.DataAtom {
  607. case a.Head:
  608. p.oe.pop()
  609. p.im = afterHeadIM
  610. return true
  611. case a.Body, a.Html, a.Br:
  612. p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
  613. return false
  614. case a.Template:
  615. if !p.oe.contains(a.Template) {
  616. return true
  617. }
  618. // TODO: remove this divergence from the HTML5 spec.
  619. //
  620. // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
  621. p.generateImpliedEndTags()
  622. for i := len(p.oe) - 1; i >= 0; i-- {
  623. if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
  624. p.oe = p.oe[:i]
  625. break
  626. }
  627. }
  628. p.clearActiveFormattingElements()
  629. p.templateStack.pop()
  630. p.resetInsertionMode()
  631. return true
  632. default:
  633. // Ignore the token.
  634. return true
  635. }
  636. case CommentToken:
  637. p.addChild(&Node{
  638. Type: CommentNode,
  639. Data: p.tok.Data,
  640. })
  641. return true
  642. case DoctypeToken:
  643. // Ignore the token.
  644. return true
  645. }
  646. p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
  647. return false
  648. }
  649. // Section 12.2.6.4.6.
  650. func afterHeadIM(p *parser) bool {
  651. switch p.tok.Type {
  652. case TextToken:
  653. s := strings.TrimLeft(p.tok.Data, whitespace)
  654. if len(s) < len(p.tok.Data) {
  655. // Add the initial whitespace to the current node.
  656. p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
  657. if s == "" {
  658. return true
  659. }
  660. p.tok.Data = s
  661. }
  662. case StartTagToken:
  663. switch p.tok.DataAtom {
  664. case a.Html:
  665. return inBodyIM(p)
  666. case a.Body:
  667. p.addElement()
  668. p.framesetOK = false
  669. p.im = inBodyIM
  670. return true
  671. case a.Frameset:
  672. p.addElement()
  673. p.im = inFramesetIM
  674. return true
  675. case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
  676. p.oe = append(p.oe, p.head)
  677. defer p.oe.remove(p.head)
  678. return inHeadIM(p)
  679. case a.Head:
  680. // Ignore the token.
  681. return true
  682. }
  683. case EndTagToken:
  684. switch p.tok.DataAtom {
  685. case a.Body, a.Html, a.Br:
  686. // Drop down to creating an implied <body> tag.
  687. case a.Template:
  688. return inHeadIM(p)
  689. default:
  690. // Ignore the token.
  691. return true
  692. }
  693. case CommentToken:
  694. p.addChild(&Node{
  695. Type: CommentNode,
  696. Data: p.tok.Data,
  697. })
  698. return true
  699. case DoctypeToken:
  700. // Ignore the token.
  701. return true
  702. }
  703. p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
  704. p.framesetOK = true
  705. return false
  706. }
  707. // copyAttributes copies attributes of src not found on dst to dst.
  708. func copyAttributes(dst *Node, src Token) {
  709. if len(src.Attr) == 0 {
  710. return
  711. }
  712. attr := map[string]string{}
  713. for _, t := range dst.Attr {
  714. attr[t.Key] = t.Val
  715. }
  716. for _, t := range src.Attr {
  717. if _, ok := attr[t.Key]; !ok {
  718. dst.Attr = append(dst.Attr, t)
  719. attr[t.Key] = t.Val
  720. }
  721. }
  722. }
  723. // Section 12.2.6.4.7.
  724. func inBodyIM(p *parser) bool {
  725. switch p.tok.Type {
  726. case TextToken:
  727. d := p.tok.Data
  728. switch n := p.oe.top(); n.DataAtom {
  729. case a.Pre, a.Listing:
  730. if n.FirstChild == nil {
  731. // Ignore a newline at the start of a <pre> block.
  732. if d != "" && d[0] == '\r' {
  733. d = d[1:]
  734. }
  735. if d != "" && d[0] == '\n' {
  736. d = d[1:]
  737. }
  738. }
  739. }
  740. d = strings.Replace(d, "\x00", "", -1)
  741. if d == "" {
  742. return true
  743. }
  744. p.reconstructActiveFormattingElements()
  745. p.addText(d)
  746. if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
  747. // There were non-whitespace characters inserted.
  748. p.framesetOK = false
  749. }
  750. case StartTagToken:
  751. switch p.tok.DataAtom {
  752. case a.Html:
  753. if p.oe.contains(a.Template) {
  754. return true
  755. }
  756. copyAttributes(p.oe[0], p.tok)
  757. case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
  758. return inHeadIM(p)
  759. case a.Body:
  760. if p.oe.contains(a.Template) {
  761. return true
  762. }
  763. if len(p.oe) >= 2 {
  764. body := p.oe[1]
  765. if body.Type == ElementNode && body.DataAtom == a.Body {
  766. p.framesetOK = false
  767. copyAttributes(body, p.tok)
  768. }
  769. }
  770. case a.Frameset:
  771. if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
  772. // Ignore the token.
  773. return true
  774. }
  775. body := p.oe[1]
  776. if body.Parent != nil {
  777. body.Parent.RemoveChild(body)
  778. }
  779. p.oe = p.oe[:1]
  780. p.addElement()
  781. p.im = inFramesetIM
  782. return true
  783. case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
  784. p.popUntil(buttonScope, a.P)
  785. p.addElement()
  786. case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
  787. p.popUntil(buttonScope, a.P)
  788. switch n := p.top(); n.DataAtom {
  789. case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
  790. p.oe.pop()
  791. }
  792. p.addElement()
  793. case a.Pre, a.Listing:
  794. p.popUntil(buttonScope, a.P)
  795. p.addElement()
  796. // The newline, if any, will be dealt with by the TextToken case.
  797. p.framesetOK = false
  798. case a.Form:
  799. if p.form != nil && !p.oe.contains(a.Template) {
  800. // Ignore the token
  801. return true
  802. }
  803. p.popUntil(buttonScope, a.P)
  804. p.addElement()
  805. if !p.oe.contains(a.Template) {
  806. p.form = p.top()
  807. }
  808. case a.Li:
  809. p.framesetOK = false
  810. for i := len(p.oe) - 1; i >= 0; i-- {
  811. node := p.oe[i]
  812. switch node.DataAtom {
  813. case a.Li:
  814. p.oe = p.oe[:i]
  815. case a.Address, a.Div, a.P:
  816. continue
  817. default:
  818. if !isSpecialElement(node) {
  819. continue
  820. }
  821. }
  822. break
  823. }
  824. p.popUntil(buttonScope, a.P)
  825. p.addElement()
  826. case a.Dd, a.Dt:
  827. p.framesetOK = false
  828. for i := len(p.oe) - 1; i >= 0; i-- {
  829. node := p.oe[i]
  830. switch node.DataAtom {
  831. case a.Dd, a.Dt:
  832. p.oe = p.oe[:i]
  833. case a.Address, a.Div, a.P:
  834. continue
  835. default:
  836. if !isSpecialElement(node) {
  837. continue
  838. }
  839. }
  840. break
  841. }
  842. p.popUntil(buttonScope, a.P)
  843. p.addElement()
  844. case a.Plaintext:
  845. p.popUntil(buttonScope, a.P)
  846. p.addElement()
  847. case a.Button:
  848. p.popUntil(defaultScope, a.Button)
  849. p.reconstructActiveFormattingElements()
  850. p.addElement()
  851. p.framesetOK = false
  852. case a.A:
  853. for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
  854. if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
  855. p.inBodyEndTagFormatting(a.A)
  856. p.oe.remove(n)
  857. p.afe.remove(n)
  858. break
  859. }
  860. }
  861. p.reconstructActiveFormattingElements()
  862. p.addFormattingElement()
  863. case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
  864. p.reconstructActiveFormattingElements()
  865. p.addFormattingElement()
  866. case a.Nobr:
  867. p.reconstructActiveFormattingElements()
  868. if p.elementInScope(defaultScope, a.Nobr) {
  869. p.inBodyEndTagFormatting(a.Nobr)
  870. p.reconstructActiveFormattingElements()
  871. }
  872. p.addFormattingElement()
  873. case a.Applet, a.Marquee, a.Object:
  874. p.reconstructActiveFormattingElements()
  875. p.addElement()
  876. p.afe = append(p.afe, &scopeMarker)
  877. p.framesetOK = false
  878. case a.Table:
  879. if !p.quirks {
  880. p.popUntil(buttonScope, a.P)
  881. }
  882. p.addElement()
  883. p.framesetOK = false
  884. p.im = inTableIM
  885. return true
  886. case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
  887. p.reconstructActiveFormattingElements()
  888. p.addElement()
  889. p.oe.pop()
  890. p.acknowledgeSelfClosingTag()
  891. if p.tok.DataAtom == a.Input {
  892. for _, t := range p.tok.Attr {
  893. if t.Key == "type" {
  894. if strings.ToLower(t.Val) == "hidden" {
  895. // Skip setting framesetOK = false
  896. return true
  897. }
  898. }
  899. }
  900. }
  901. p.framesetOK = false
  902. case a.Param, a.Source, a.Track:
  903. p.addElement()
  904. p.oe.pop()
  905. p.acknowledgeSelfClosingTag()
  906. case a.Hr:
  907. p.popUntil(buttonScope, a.P)
  908. p.addElement()
  909. p.oe.pop()
  910. p.acknowledgeSelfClosingTag()
  911. p.framesetOK = false
  912. case a.Image:
  913. p.tok.DataAtom = a.Img
  914. p.tok.Data = a.Img.String()
  915. return false
  916. case a.Isindex:
  917. if p.form != nil {
  918. // Ignore the token.
  919. return true
  920. }
  921. action := ""
  922. prompt := "This is a searchable index. Enter search keywords: "
  923. attr := []Attribute{{Key: "name", Val: "isindex"}}
  924. for _, t := range p.tok.Attr {
  925. switch t.Key {
  926. case "action":
  927. action = t.Val
  928. case "name":
  929. // Ignore the attribute.
  930. case "prompt":
  931. prompt = t.Val
  932. default:
  933. attr = append(attr, t)
  934. }
  935. }
  936. p.acknowledgeSelfClosingTag()
  937. p.popUntil(buttonScope, a.P)
  938. p.parseImpliedToken(StartTagToken, a.Form, a.Form.String())
  939. if p.form == nil {
  940. // NOTE: The 'isindex' element has been removed,
  941. // and the 'template' element has not been designed to be
  942. // collaborative with the index element.
  943. //
  944. // Ignore the token.
  945. return true
  946. }
  947. if action != "" {
  948. p.form.Attr = []Attribute{{Key: "action", Val: action}}
  949. }
  950. p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
  951. p.parseImpliedToken(StartTagToken, a.Label, a.Label.String())
  952. p.addText(prompt)
  953. p.addChild(&Node{
  954. Type: ElementNode,
  955. DataAtom: a.Input,
  956. Data: a.Input.String(),
  957. Attr: attr,
  958. })
  959. p.oe.pop()
  960. p.parseImpliedToken(EndTagToken, a.Label, a.Label.String())
  961. p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
  962. p.parseImpliedToken(EndTagToken, a.Form, a.Form.String())
  963. case a.Textarea:
  964. p.addElement()
  965. p.setOriginalIM()
  966. p.framesetOK = false
  967. p.im = textIM
  968. case a.Xmp:
  969. p.popUntil(buttonScope, a.P)
  970. p.reconstructActiveFormattingElements()
  971. p.framesetOK = false
  972. p.addElement()
  973. p.setOriginalIM()
  974. p.im = textIM
  975. case a.Iframe:
  976. p.framesetOK = false
  977. p.addElement()
  978. p.setOriginalIM()
  979. p.im = textIM
  980. case a.Noembed, a.Noscript:
  981. p.addElement()
  982. p.setOriginalIM()
  983. p.im = textIM
  984. case a.Select:
  985. p.reconstructActiveFormattingElements()
  986. p.addElement()
  987. p.framesetOK = false
  988. p.im = inSelectIM
  989. return true
  990. case a.Optgroup, a.Option:
  991. if p.top().DataAtom == a.Option {
  992. p.oe.pop()
  993. }
  994. p.reconstructActiveFormattingElements()
  995. p.addElement()
  996. case a.Rb, a.Rtc:
  997. if p.elementInScope(defaultScope, a.Ruby) {
  998. p.generateImpliedEndTags()
  999. }
  1000. p.addElement()
  1001. case a.Rp, a.Rt:
  1002. if p.elementInScope(defaultScope, a.Ruby) {
  1003. p.generateImpliedEndTags("rtc")
  1004. }
  1005. p.addElement()
  1006. case a.Math, a.Svg:
  1007. p.reconstructActiveFormattingElements()
  1008. if p.tok.DataAtom == a.Math {
  1009. adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
  1010. } else {
  1011. adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
  1012. }
  1013. adjustForeignAttributes(p.tok.Attr)
  1014. p.addElement()
  1015. p.top().Namespace = p.tok.Data
  1016. if p.hasSelfClosingToken {
  1017. p.oe.pop()
  1018. p.acknowledgeSelfClosingTag()
  1019. }
  1020. return true
  1021. case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
  1022. // Ignore the token.
  1023. default:
  1024. p.reconstructActiveFormattingElements()
  1025. p.addElement()
  1026. }
  1027. case EndTagToken:
  1028. switch p.tok.DataAtom {
  1029. case a.Body:
  1030. if p.elementInScope(defaultScope, a.Body) {
  1031. p.im = afterBodyIM
  1032. }
  1033. case a.Html:
  1034. if p.elementInScope(defaultScope, a.Body) {
  1035. p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
  1036. return false
  1037. }
  1038. return true
  1039. case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
  1040. p.popUntil(defaultScope, p.tok.DataAtom)
  1041. case a.Form:
  1042. if p.oe.contains(a.Template) {
  1043. i := p.indexOfElementInScope(defaultScope, a.Form)
  1044. if i == -1 {
  1045. // Ignore the token.
  1046. return true
  1047. }
  1048. p.generateImpliedEndTags()
  1049. if p.oe[i].DataAtom != a.Form {
  1050. // Ignore the token.
  1051. return true
  1052. }
  1053. p.popUntil(defaultScope, a.Form)
  1054. } else {
  1055. node := p.form
  1056. p.form = nil
  1057. i := p.indexOfElementInScope(defaultScope, a.Form)
  1058. if node == nil || i == -1 || p.oe[i] != node {
  1059. // Ignore the token.
  1060. return true
  1061. }
  1062. p.generateImpliedEndTags()
  1063. p.oe.remove(node)
  1064. }
  1065. case a.P:
  1066. if !p.elementInScope(buttonScope, a.P) {
  1067. p.parseImpliedToken(StartTagToken, a.P, a.P.String())
  1068. }
  1069. p.popUntil(buttonScope, a.P)
  1070. case a.Li:
  1071. p.popUntil(listItemScope, a.Li)
  1072. case a.Dd, a.Dt:
  1073. p.popUntil(defaultScope, p.tok.DataAtom)
  1074. case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
  1075. p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
  1076. case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
  1077. p.inBodyEndTagFormatting(p.tok.DataAtom)
  1078. case a.Applet, a.Marquee, a.Object:
  1079. if p.popUntil(defaultScope, p.tok.DataAtom) {
  1080. p.clearActiveFormattingElements()
  1081. }
  1082. case a.Br:
  1083. p.tok.Type = StartTagToken
  1084. return false
  1085. case a.Template:
  1086. return inHeadIM(p)
  1087. default:
  1088. p.inBodyEndTagOther(p.tok.DataAtom)
  1089. }
  1090. case CommentToken:
  1091. p.addChild(&Node{
  1092. Type: CommentNode,
  1093. Data: p.tok.Data,
  1094. })
  1095. case ErrorToken:
  1096. // TODO: remove this divergence from the HTML5 spec.
  1097. if len(p.templateStack) > 0 {
  1098. p.im = inTemplateIM
  1099. return false
  1100. } else {
  1101. for _, e := range p.oe {
  1102. switch e.DataAtom {
  1103. case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
  1104. a.Thead, a.Tr, a.Body, a.Html:
  1105. default:
  1106. return true
  1107. }
  1108. }
  1109. }
  1110. }
  1111. return true
  1112. }
// inBodyEndTagFormatting handles an end tag for a formatting element, identified
// by tagAtom, by running the "adoption agency" algorithm. It rearranges the
// stack of open elements (p.oe), the list of active formatting elements
// (p.afe) and the node tree so that mis-nested formatting markup still yields
// a well-formed tree.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
	// This is the "adoption agency" algorithm, described at
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-4. The outer loop. It runs at most eight times; most exits are
	// the early returns below.
	for i := 0; i < 8; i++ {
		// Step 5. Find the formatting element: the last entry in p.afe with a
		// matching atom, searching no further back than the nearest scope marker.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].DataAtom == tagAtom {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			// No such formatting element: treat the tag as any other end tag.
			p.inBodyEndTagOther(tagAtom)
			return
		}
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			// The element is no longer open; just drop it from p.afe.
			p.afe.remove(formattingElement)
			return
		}
		if !p.elementInScope(defaultScope, tagAtom) {
			// Ignore the tag.
			return
		}

		// Steps 9-10. Find the furthest block: the first special element at or
		// above feIndex on the stack of open elements.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// Simple case: pop everything up to and including the formatting
			// element, then remove it from p.afe.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 11-12. Find the common ancestor and bookmark node.
		// NOTE(review): this indexes p.oe[feIndex-1], so it assumes the
		// formatting element is never at the bottom of the stack — confirm.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 13. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		x := p.oe.index(node)
		// Steps 13.1-13.2
		// NOTE(review): the loop gives up after three iterations even when
		// formattingElement has not been reached — confirm against the
		// current text of the specification.
		for j := 0; j < 3; j++ {
			// Step 13.3. Move one entry down the stack of open elements.
			x--
			node = p.oe[x]
			// Step 13.4 - 13.5. Nodes that are not active formatting elements
			// are removed from the stack of open elements.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 13.6.
			if node == formattingElement {
				break
			}
			// Step 13.7. Replace node with a clone in both p.afe and p.oe.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 13.8. On the first pass, move the bookmark to just after
			// the new node.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 13.9. Move lastNode under node.
			if lastNode.Parent != nil {
				lastNode.Parent.RemoveChild(lastNode)
			}
			node.AppendChild(lastNode)
			// Step 13.10.
			lastNode = node
		}

		// Step 14. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.RemoveChild(lastNode)
		}
		switch commonAncestor.DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			p.fosterParent(lastNode)
		default:
			commonAncestor.AppendChild(lastNode)
		}

		// Steps 15-17. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.AppendChild(clone)

		// Step 18. Fix up the list of active formatting elements.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 19. Fix up the stack of open elements: the clone goes directly
		// above the furthest block.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}
  1227. // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
  1228. // "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
  1229. // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
  1230. func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
  1231. for i := len(p.oe) - 1; i >= 0; i-- {
  1232. if p.oe[i].DataAtom == tagAtom {
  1233. p.oe = p.oe[:i]
  1234. break
  1235. }
  1236. if isSpecialElement(p.oe[i]) {
  1237. break
  1238. }
  1239. }
  1240. }
  1241. // Section 12.2.6.4.8.
  1242. func textIM(p *parser) bool {
  1243. switch p.tok.Type {
  1244. case ErrorToken:
  1245. p.oe.pop()
  1246. case TextToken:
  1247. d := p.tok.Data
  1248. if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
  1249. // Ignore a newline at the start of a <textarea> block.
  1250. if d != "" && d[0] == '\r' {
  1251. d = d[1:]
  1252. }
  1253. if d != "" && d[0] == '\n' {
  1254. d = d[1:]
  1255. }
  1256. }
  1257. if d == "" {
  1258. return true
  1259. }
  1260. p.addText(d)
  1261. return true
  1262. case EndTagToken:
  1263. p.oe.pop()
  1264. }
  1265. p.im = p.originalIM
  1266. p.originalIM = nil
  1267. return p.tok.Type == EndTagToken
  1268. }
// Section 12.2.6.4.9.
//
// inTableIM is the "in table" insertion mode. It returns true when the current
// token has been consumed and false when it must be reprocessed (possibly in a
// different insertion mode). Any token that is not fully handled by one of the
// cases below falls out of the switch and is processed by inBodyIM with foster
// parenting enabled.
func inTableIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		// NUL characters are dropped from the text.
		p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
		switch p.oe.top().DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			// Whitespace-only text directly inside table structure is kept
			// in place; anything else falls through to the foster-parented
			// inBodyIM call at the bottom.
			if strings.Trim(p.tok.Data, whitespace) == "" {
				p.addText(p.tok.Data)
				return true
			}
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Caption:
			p.clearStackToContext(tableScope)
			// A scope marker goes onto the active formatting elements.
			p.afe = append(p.afe, &scopeMarker)
			p.addElement()
			p.im = inCaptionIM
			return true
		case a.Colgroup:
			p.clearStackToContext(tableScope)
			p.addElement()
			p.im = inColumnGroupIM
			return true
		case a.Col:
			// Act as if a <colgroup> start tag had been seen, then reprocess.
			p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
			return false
		case a.Tbody, a.Tfoot, a.Thead:
			p.clearStackToContext(tableScope)
			p.addElement()
			p.im = inTableBodyIM
			return true
		case a.Td, a.Th, a.Tr:
			// Act as if a <tbody> start tag had been seen, then reprocess.
			p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
			return false
		case a.Table:
			// A nested <table> start tag closes the open table (if any) and is
			// then reprocessed.
			if p.popUntil(tableScope, a.Table) {
				p.resetInsertionMode()
				return false
			}
			// Ignore the token.
			return true
		case a.Style, a.Script, a.Template:
			return inHeadIM(p)
		case a.Input:
			// <input type=hidden> is inserted and immediately popped.
			for _, t := range p.tok.Attr {
				if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
					p.addElement()
					p.oe.pop()
					return true
				}
			}
			// Otherwise drop down to the default action.
		case a.Form:
			if p.oe.contains(a.Template) || p.form != nil {
				// Ignore the token.
				return true
			}
			p.addElement()
			p.form = p.oe.pop()
			// NOTE(review): there is no return here, so the token also reaches
			// the inBodyIM call at the bottom; presumably it is ignored there
			// because p.form is now set — confirm.
		case a.Select:
			p.reconstructActiveFormattingElements()
			// Foster-parent the <select> when the current node is table
			// structure.
			switch p.top().DataAtom {
			case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
				p.fosterParenting = true
			}
			p.addElement()
			p.fosterParenting = false
			p.framesetOK = false
			p.im = inSelectInTableIM
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Table:
			if p.popUntil(tableScope, a.Table) {
				p.resetInsertionMode()
				return true
			}
			// Ignore the token.
			return true
		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
			return true
		case a.Template:
			return inHeadIM(p)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	case ErrorToken:
		return inBodyIM(p)
	}

	// Anything else: process the token using the "in body" rules, with foster
	// parenting switched on for the duration of that call only.
	p.fosterParenting = true
	defer func() { p.fosterParenting = false }()

	return inBodyIM(p)
}
  1373. // Section 12.2.6.4.11.
  1374. func inCaptionIM(p *parser) bool {
  1375. switch p.tok.Type {
  1376. case StartTagToken:
  1377. switch p.tok.DataAtom {
  1378. case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
  1379. if p.popUntil(tableScope, a.Caption) {
  1380. p.clearActiveFormattingElements()
  1381. p.im = inTableIM
  1382. return false
  1383. } else {
  1384. // Ignore the token.
  1385. return true
  1386. }
  1387. case a.Select:
  1388. p.reconstructActiveFormattingElements()
  1389. p.addElement()
  1390. p.framesetOK = false
  1391. p.im = inSelectInTableIM
  1392. return true
  1393. }
  1394. case EndTagToken:
  1395. switch p.tok.DataAtom {
  1396. case a.Caption:
  1397. if p.popUntil(tableScope, a.Caption) {
  1398. p.clearActiveFormattingElements()
  1399. p.im = inTableIM
  1400. }
  1401. return true
  1402. case a.Table:
  1403. if p.popUntil(tableScope, a.Caption) {
  1404. p.clearActiveFormattingElements()
  1405. p.im = inTableIM
  1406. return false
  1407. } else {
  1408. // Ignore the token.
  1409. return true
  1410. }
  1411. case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
  1412. // Ignore the token.
  1413. return true
  1414. }
  1415. }
  1416. return inBodyIM(p)
  1417. }
  1418. // Section 12.2.6.4.12.
  1419. func inColumnGroupIM(p *parser) bool {
  1420. switch p.tok.Type {
  1421. case TextToken:
  1422. s := strings.TrimLeft(p.tok.Data, whitespace)
  1423. if len(s) < len(p.tok.Data) {
  1424. // Add the initial whitespace to the current node.
  1425. p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
  1426. if s == "" {
  1427. return true
  1428. }
  1429. p.tok.Data = s
  1430. }
  1431. case CommentToken:
  1432. p.addChild(&Node{
  1433. Type: CommentNode,
  1434. Data: p.tok.Data,
  1435. })
  1436. return true
  1437. case DoctypeToken:
  1438. // Ignore the token.
  1439. return true
  1440. case StartTagToken:
  1441. switch p.tok.DataAtom {
  1442. case a.Html:
  1443. return inBodyIM(p)
  1444. case a.Col:
  1445. p.addElement()
  1446. p.oe.pop()
  1447. p.acknowledgeSelfClosingTag()
  1448. return true
  1449. case a.Template:
  1450. return inHeadIM(p)
  1451. }
  1452. case EndTagToken:
  1453. switch p.tok.DataAtom {
  1454. case a.Colgroup:
  1455. if p.oe.top().DataAtom == a.Colgroup {
  1456. p.oe.pop()
  1457. p.im = inTableIM
  1458. }
  1459. return true
  1460. case a.Col:
  1461. // Ignore the token.
  1462. return true
  1463. case a.Template:
  1464. return inHeadIM(p)
  1465. }
  1466. case ErrorToken:
  1467. return inBodyIM(p)
  1468. }
  1469. if p.oe.top().DataAtom != a.Colgroup {
  1470. return true
  1471. }
  1472. p.oe.pop()
  1473. p.im = inTableIM
  1474. return false
  1475. }
  1476. // Section 12.2.6.4.13.
  1477. func inTableBodyIM(p *parser) bool {
  1478. switch p.tok.Type {
  1479. case StartTagToken:
  1480. switch p.tok.DataAtom {
  1481. case a.Tr:
  1482. p.clearStackToContext(tableBodyScope)
  1483. p.addElement()
  1484. p.im = inRowIM
  1485. return true
  1486. case a.Td, a.Th:
  1487. p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())
  1488. return false
  1489. case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
  1490. if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
  1491. p.im = inTableIM
  1492. return false
  1493. }
  1494. // Ignore the token.
  1495. return true
  1496. }
  1497. case EndTagToken:
  1498. switch p.tok.DataAtom {
  1499. case a.Tbody, a.Tfoot, a.Thead:
  1500. if p.elementInScope(tableScope, p.tok.DataAtom) {
  1501. p.clearStackToContext(tableBodyScope)
  1502. p.oe.pop()
  1503. p.im = inTableIM
  1504. }
  1505. return true
  1506. case a.Table:
  1507. if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
  1508. p.im = inTableIM
  1509. return false
  1510. }
  1511. // Ignore the token.
  1512. return true
  1513. case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:
  1514. // Ignore the token.
  1515. return true
  1516. }
  1517. case CommentToken:
  1518. p.addChild(&Node{
  1519. Type: CommentNode,
  1520. Data: p.tok.Data,
  1521. })
  1522. return true
  1523. }
  1524. return inTableIM(p)
  1525. }
  1526. // Section 12.2.6.4.14.
  1527. func inRowIM(p *parser) bool {
  1528. switch p.tok.Type {
  1529. case StartTagToken:
  1530. switch p.tok.DataAtom {
  1531. case a.Td, a.Th:
  1532. p.clearStackToContext(tableRowScope)
  1533. p.addElement()
  1534. p.afe = append(p.afe, &scopeMarker)
  1535. p.im = inCellIM
  1536. return true
  1537. case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:
  1538. if p.popUntil(tableScope, a.Tr) {
  1539. p.im = inTableBodyIM
  1540. return false
  1541. }
  1542. // Ignore the token.
  1543. return true
  1544. }
  1545. case EndTagToken:
  1546. switch p.tok.DataAtom {
  1547. case a.Tr:
  1548. if p.popUntil(tableScope, a.Tr) {
  1549. p.im = inTableBodyIM
  1550. return true
  1551. }
  1552. // Ignore the token.
  1553. return true
  1554. case a.Table:
  1555. if p.popUntil(tableScope, a.Tr) {
  1556. p.im = inTableBodyIM
  1557. return false
  1558. }
  1559. // Ignore the token.
  1560. return true
  1561. case a.Tbody, a.Tfoot, a.Thead:
  1562. if p.elementInScope(tableScope, p.tok.DataAtom) {
  1563. p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String())
  1564. return false
  1565. }
  1566. // Ignore the token.
  1567. return true
  1568. case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:
  1569. // Ignore the token.
  1570. return true
  1571. }
  1572. }
  1573. return inTableIM(p)
  1574. }
  1575. // Section 12.2.6.4.15.
  1576. func inCellIM(p *parser) bool {
  1577. switch p.tok.Type {
  1578. case StartTagToken:
  1579. switch p.tok.DataAtom {
  1580. case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
  1581. if p.popUntil(tableScope, a.Td, a.Th) {
  1582. // Close the cell and reprocess.
  1583. p.clearActiveFormattingElements()
  1584. p.im = inRowIM
  1585. return false
  1586. }
  1587. // Ignore the token.
  1588. return true
  1589. case a.Select:
  1590. p.reconstructActiveFormattingElements()
  1591. p.addElement()
  1592. p.framesetOK = false
  1593. p.im = inSelectInTableIM
  1594. return true
  1595. }
  1596. case EndTagToken:
  1597. switch p.tok.DataAtom {
  1598. case a.Td, a.Th:
  1599. if !p.popUntil(tableScope, p.tok.DataAtom) {
  1600. // Ignore the token.
  1601. return true
  1602. }
  1603. p.clearActiveFormattingElements()
  1604. p.im = inRowIM
  1605. return true
  1606. case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:
  1607. // Ignore the token.
  1608. return true
  1609. case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
  1610. if !p.elementInScope(tableScope, p.tok.DataAtom) {
  1611. // Ignore the token.
  1612. return true
  1613. }
  1614. // Close the cell and reprocess.
  1615. p.popUntil(tableScope, a.Td, a.Th)
  1616. p.clearActiveFormattingElements()
  1617. p.im = inRowIM
  1618. return false
  1619. }
  1620. }
  1621. return inBodyIM(p)
  1622. }
  1623. // Section 12.2.6.4.16.
  1624. func inSelectIM(p *parser) bool {
  1625. switch p.tok.Type {
  1626. case TextToken:
  1627. p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
  1628. case StartTagToken:
  1629. switch p.tok.DataAtom {
  1630. case a.Html:
  1631. return inBodyIM(p)
  1632. case a.Option:
  1633. if p.top().DataAtom == a.Option {
  1634. p.oe.pop()
  1635. }
  1636. p.addElement()
  1637. case a.Optgroup:
  1638. if p.top().DataAtom == a.Option {
  1639. p.oe.pop()
  1640. }
  1641. if p.top().DataAtom == a.Optgroup {
  1642. p.oe.pop()
  1643. }
  1644. p.addElement()
  1645. case a.Select:
  1646. if p.popUntil(selectScope, a.Select) {
  1647. p.resetInsertionMode()
  1648. } else {
  1649. // Ignore the token.
  1650. return true
  1651. }
  1652. case a.Input, a.Keygen, a.Textarea:
  1653. if p.elementInScope(selectScope, a.Select) {
  1654. p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
  1655. return false
  1656. }
  1657. // In order to properly ignore <textarea>, we need to change the tokenizer mode.
  1658. p.tokenizer.NextIsNotRawText()
  1659. // Ignore the token.
  1660. return true
  1661. case a.Script, a.Template:
  1662. return inHeadIM(p)
  1663. }
  1664. case EndTagToken:
  1665. switch p.tok.DataAtom {
  1666. case a.Option:
  1667. if p.top().DataAtom == a.Option {
  1668. p.oe.pop()
  1669. }
  1670. case a.Optgroup:
  1671. i := len(p.oe) - 1
  1672. if p.oe[i].DataAtom == a.Option {
  1673. i--
  1674. }
  1675. if p.oe[i].DataAtom == a.Optgroup {
  1676. p.oe = p.oe[:i]
  1677. }
  1678. case a.Select:
  1679. if p.popUntil(selectScope, a.Select) {
  1680. p.resetInsertionMode()
  1681. } else {
  1682. // Ignore the token.
  1683. return true
  1684. }
  1685. case a.Template:
  1686. return inHeadIM(p)
  1687. }
  1688. case CommentToken:
  1689. p.addChild(&Node{
  1690. Type: CommentNode,
  1691. Data: p.tok.Data,
  1692. })
  1693. case DoctypeToken:
  1694. // Ignore the token.
  1695. return true
  1696. case ErrorToken:
  1697. return inBodyIM(p)
  1698. }
  1699. return true
  1700. }
  1701. // Section 12.2.6.4.17.
  1702. func inSelectInTableIM(p *parser) bool {
  1703. switch p.tok.Type {
  1704. case StartTagToken, EndTagToken:
  1705. switch p.tok.DataAtom {
  1706. case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
  1707. if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {
  1708. // Ignore the token.
  1709. return true
  1710. }
  1711. // This is like p.popUntil(selectScope, a.Select), but it also
  1712. // matches <math select>, not just <select>. Matching the MathML
  1713. // tag is arguably incorrect (conceptually), but it mimics what
  1714. // Chromium does.
  1715. for i := len(p.oe) - 1; i >= 0; i-- {
  1716. if n := p.oe[i]; n.DataAtom == a.Select {
  1717. p.oe = p.oe[:i]
  1718. break
  1719. }
  1720. }
  1721. p.resetInsertionMode()
  1722. return false
  1723. }
  1724. }
  1725. return inSelectIM(p)
  1726. }
  1727. // Section 12.2.6.4.18.
  1728. func inTemplateIM(p *parser) bool {
  1729. switch p.tok.Type {
  1730. case TextToken, CommentToken, DoctypeToken:
  1731. return inBodyIM(p)
  1732. case StartTagToken:
  1733. switch p.tok.DataAtom {
  1734. case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
  1735. return inHeadIM(p)
  1736. case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
  1737. p.templateStack.pop()
  1738. p.templateStack = append(p.templateStack, inTableIM)
  1739. p.im = inTableIM
  1740. return false
  1741. case a.Col:
  1742. p.templateStack.pop()
  1743. p.templateStack = append(p.templateStack, inColumnGroupIM)
  1744. p.im = inColumnGroupIM
  1745. return false
  1746. case a.Tr:
  1747. p.templateStack.pop()
  1748. p.templateStack = append(p.templateStack, inTableBodyIM)
  1749. p.im = inTableBodyIM
  1750. return false
  1751. case a.Td, a.Th:
  1752. p.templateStack.pop()
  1753. p.templateStack = append(p.templateStack, inRowIM)
  1754. p.im = inRowIM
  1755. return false
  1756. default:
  1757. p.templateStack.pop()
  1758. p.templateStack = append(p.templateStack, inBodyIM)
  1759. p.im = inBodyIM
  1760. return false
  1761. }
  1762. case EndTagToken:
  1763. switch p.tok.DataAtom {
  1764. case a.Template:
  1765. return inHeadIM(p)
  1766. default:
  1767. // Ignore the token.
  1768. return true
  1769. }
  1770. case ErrorToken:
  1771. if !p.oe.contains(a.Template) {
  1772. // Ignore the token.
  1773. return true
  1774. }
  1775. // TODO: remove this divergence from the HTML5 spec.
  1776. //
  1777. // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
  1778. p.generateImpliedEndTags()
  1779. for i := len(p.oe) - 1; i >= 0; i-- {
  1780. if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
  1781. p.oe = p.oe[:i]
  1782. break
  1783. }
  1784. }
  1785. p.clearActiveFormattingElements()
  1786. p.templateStack.pop()
  1787. p.resetInsertionMode()
  1788. return false
  1789. }
  1790. return false
  1791. }
  1792. // Section 12.2.6.4.19.
  1793. func afterBodyIM(p *parser) bool {
  1794. switch p.tok.Type {
  1795. case ErrorToken:
  1796. // Stop parsing.
  1797. return true
  1798. case TextToken:
  1799. s := strings.TrimLeft(p.tok.Data, whitespace)
  1800. if len(s) == 0 {
  1801. // It was all whitespace.
  1802. return inBodyIM(p)
  1803. }
  1804. case StartTagToken:
  1805. if p.tok.DataAtom == a.Html {
  1806. return inBodyIM(p)
  1807. }
  1808. case EndTagToken:
  1809. if p.tok.DataAtom == a.Html {
  1810. if !p.fragment {
  1811. p.im = afterAfterBodyIM
  1812. }
  1813. return true
  1814. }
  1815. case CommentToken:
  1816. // The comment is attached to the <html> element.
  1817. if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {
  1818. panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
  1819. }
  1820. p.oe[0].AppendChild(&Node{
  1821. Type: CommentNode,
  1822. Data: p.tok.Data,
  1823. })
  1824. return true
  1825. }
  1826. p.im = inBodyIM
  1827. return false
  1828. }
  1829. // Section 12.2.6.4.20.
  1830. func inFramesetIM(p *parser) bool {
  1831. switch p.tok.Type {
  1832. case CommentToken:
  1833. p.addChild(&Node{
  1834. Type: CommentNode,
  1835. Data: p.tok.Data,
  1836. })
  1837. case TextToken:
  1838. // Ignore all text but whitespace.
  1839. s := strings.Map(func(c rune) rune {
  1840. switch c {
  1841. case ' ', '\t', '\n', '\f', '\r':
  1842. return c
  1843. }
  1844. return -1
  1845. }, p.tok.Data)
  1846. if s != "" {
  1847. p.addText(s)
  1848. }
  1849. case StartTagToken:
  1850. switch p.tok.DataAtom {
  1851. case a.Html:
  1852. return inBodyIM(p)
  1853. case a.Frameset:
  1854. p.addElement()
  1855. case a.Frame:
  1856. p.addElement()
  1857. p.oe.pop()
  1858. p.acknowledgeSelfClosingTag()
  1859. case a.Noframes:
  1860. return inHeadIM(p)
  1861. }
  1862. case EndTagToken:
  1863. switch p.tok.DataAtom {
  1864. case a.Frameset:
  1865. if p.oe.top().DataAtom != a.Html {
  1866. p.oe.pop()
  1867. if p.oe.top().DataAtom != a.Frameset {
  1868. p.im = afterFramesetIM
  1869. return true
  1870. }
  1871. }
  1872. }
  1873. default:
  1874. // Ignore the token.
  1875. }
  1876. return true
  1877. }
  1878. // Section 12.2.6.4.21.
  1879. func afterFramesetIM(p *parser) bool {
  1880. switch p.tok.Type {
  1881. case CommentToken:
  1882. p.addChild(&Node{
  1883. Type: CommentNode,
  1884. Data: p.tok.Data,
  1885. })
  1886. case TextToken:
  1887. // Ignore all text but whitespace.
  1888. s := strings.Map(func(c rune) rune {
  1889. switch c {
  1890. case ' ', '\t', '\n', '\f', '\r':
  1891. return c
  1892. }
  1893. return -1
  1894. }, p.tok.Data)
  1895. if s != "" {
  1896. p.addText(s)
  1897. }
  1898. case StartTagToken:
  1899. switch p.tok.DataAtom {
  1900. case a.Html:
  1901. return inBodyIM(p)
  1902. case a.Noframes:
  1903. return inHeadIM(p)
  1904. }
  1905. case EndTagToken:
  1906. switch p.tok.DataAtom {
  1907. case a.Html:
  1908. p.im = afterAfterFramesetIM
  1909. return true
  1910. }
  1911. default:
  1912. // Ignore the token.
  1913. }
  1914. return true
  1915. }
  1916. // Section 12.2.6.4.22.
  1917. func afterAfterBodyIM(p *parser) bool {
  1918. switch p.tok.Type {
  1919. case ErrorToken:
  1920. // Stop parsing.
  1921. return true
  1922. case TextToken:
  1923. s := strings.TrimLeft(p.tok.Data, whitespace)
  1924. if len(s) == 0 {
  1925. // It was all whitespace.
  1926. return inBodyIM(p)
  1927. }
  1928. case StartTagToken:
  1929. if p.tok.DataAtom == a.Html {
  1930. return inBodyIM(p)
  1931. }
  1932. case CommentToken:
  1933. p.doc.AppendChild(&Node{
  1934. Type: CommentNode,
  1935. Data: p.tok.Data,
  1936. })
  1937. return true
  1938. case DoctypeToken:
  1939. return inBodyIM(p)
  1940. }
  1941. p.im = inBodyIM
  1942. return false
  1943. }
  1944. // Section 12.2.6.4.23.
  1945. func afterAfterFramesetIM(p *parser) bool {
  1946. switch p.tok.Type {
  1947. case CommentToken:
  1948. p.doc.AppendChild(&Node{
  1949. Type: CommentNode,
  1950. Data: p.tok.Data,
  1951. })
  1952. case TextToken:
  1953. // Ignore all text but whitespace.
  1954. s := strings.Map(func(c rune) rune {
  1955. switch c {
  1956. case ' ', '\t', '\n', '\f', '\r':
  1957. return c
  1958. }
  1959. return -1
  1960. }, p.tok.Data)
  1961. if s != "" {
  1962. p.tok.Data = s
  1963. return inBodyIM(p)
  1964. }
  1965. case StartTagToken:
  1966. switch p.tok.DataAtom {
  1967. case a.Html:
  1968. return inBodyIM(p)
  1969. case a.Noframes:
  1970. return inHeadIM(p)
  1971. }
  1972. case DoctypeToken:
  1973. return inBodyIM(p)
  1974. default:
  1975. // Ignore the token.
  1976. }
  1977. return true
  1978. }
// whitespaceOrNUL is the whitespace cutset extended with the NUL character.
const whitespaceOrNUL = whitespace + "\x00"
// Section 12.2.6.5
//
// parseForeignContent processes the current token with the rules for foreign
// (MathML or SVG) content. It returns true when the token has been consumed
// and false when it must be reprocessed.
func parseForeignContent(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		// framesetOK survives only while the text seen is whitespace or NUL.
		if p.framesetOK {
			p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
		}
		// NUL characters become U+FFFD replacement characters.
		p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
		p.addText(p.tok.Data)
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	case StartTagToken:
		// b reports whether this start tag breaks out of foreign content:
		// either it is listed in the breakout table, or it is a <font> tag
		// with a color, face or size attribute.
		b := breakout[p.tok.Data]
		if p.tok.DataAtom == a.Font {
		loop:
			for _, attr := range p.tok.Attr {
				switch attr.Key {
				case "color", "face", "size":
					b = true
					break loop
				}
			}
		}
		if b {
			// Pop foreign elements until the current node is in the HTML
			// namespace or is an integration point, then reprocess the token.
			for i := len(p.oe) - 1; i >= 0; i-- {
				n := p.oe[i]
				if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
					p.oe = p.oe[:i+1]
					break
				}
			}
			return false
		}
		switch p.top().Namespace {
		case "math":
			adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
		case "svg":
			// Adjust SVG tag names. The tokenizer lower-cases tag names, but
			// SVG wants e.g. "foreignObject" with a capital second "O".
			if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
				p.tok.DataAtom = a.Lookup([]byte(x))
				p.tok.Data = x
			}
			adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
		default:
			panic("html: bad parser state: unexpected namespace")
		}
		adjustForeignAttributes(p.tok.Attr)
		// The new element takes the namespace of the node that is current
		// before it is added.
		namespace := p.top().Namespace
		p.addElement()
		p.top().Namespace = namespace
		// NOTE(review): the switch above panics unless the namespace is "math"
		// or "svg", so this condition looks always true at this point.
		if namespace != "" {
			// Don't let the tokenizer go into raw text mode in foreign content
			// (e.g. in an SVG <title> tag).
			p.tokenizer.NextIsNotRawText()
		}
		if p.hasSelfClosingToken {
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		}
	case EndTagToken:
		// Walk down the stack of open elements. A foreign element whose name
		// matches case-insensitively is popped with everything above it. On
		// reaching an HTML-namespace element first, the token is handled by
		// the current insertion mode instead.
		for i := len(p.oe) - 1; i >= 0; i-- {
			if p.oe[i].Namespace == "" {
				return p.im(p)
			}
			if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
				p.oe = p.oe[:i]
				break
			}
		}
		return true
	default:
		// Ignore the token.
	}
	return true
}
  2059. // Section 12.2.6.
  2060. func (p *parser) inForeignContent() bool {
  2061. if len(p.oe) == 0 {
  2062. return false
  2063. }
  2064. n := p.oe[len(p.oe)-1]
  2065. if n.Namespace == "" {
  2066. return false
  2067. }
  2068. if mathMLTextIntegrationPoint(n) {
  2069. if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {
  2070. return false
  2071. }
  2072. if p.tok.Type == TextToken {
  2073. return false
  2074. }
  2075. }
  2076. if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {
  2077. return false
  2078. }
  2079. if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {
  2080. return false
  2081. }
  2082. if p.tok.Type == ErrorToken {
  2083. return false
  2084. }
  2085. return true
  2086. }
  2087. // parseImpliedToken parses a token as though it had appeared in the parser's
  2088. // input.
  2089. func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {
  2090. realToken, selfClosing := p.tok, p.hasSelfClosingToken
  2091. p.tok = Token{
  2092. Type: t,
  2093. DataAtom: dataAtom,
  2094. Data: data,
  2095. }
  2096. p.hasSelfClosingToken = false
  2097. p.parseCurrentToken()
  2098. p.tok, p.hasSelfClosingToken = realToken, selfClosing
  2099. }
  2100. // parseCurrentToken runs the current token through the parsing routines
  2101. // until it is consumed.
  2102. func (p *parser) parseCurrentToken() {
  2103. if p.tok.Type == SelfClosingTagToken {
  2104. p.hasSelfClosingToken = true
  2105. p.tok.Type = StartTagToken
  2106. }
  2107. consumed := false
  2108. for !consumed {
  2109. if p.inForeignContent() {
  2110. consumed = parseForeignContent(p)
  2111. } else {
  2112. consumed = p.im(p)
  2113. }
  2114. }
  2115. if p.hasSelfClosingToken {
  2116. // This is a parse error, but ignore it.
  2117. p.hasSelfClosingToken = false
  2118. }
  2119. }
  2120. func (p *parser) parse() error {
  2121. // Iterate until EOF. Any other error will cause an early return.
  2122. var err error
  2123. for err != io.EOF {
  2124. // CDATA sections are allowed only in foreign content.
  2125. n := p.oe.top()
  2126. p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
  2127. // Read and parse the next token.
  2128. p.tokenizer.Next()
  2129. p.tok = p.tokenizer.Token()
  2130. if p.tok.Type == ErrorToken {
  2131. err = p.tokenizer.Err()
  2132. if err != nil && err != io.EOF {
  2133. return err
  2134. }
  2135. }
  2136. p.parseCurrentToken()
  2137. }
  2138. return nil
  2139. }
  2140. // Parse returns the parse tree for the HTML from the given Reader.
  2141. //
  2142. // It implements the HTML5 parsing algorithm
  2143. // (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
  2144. // which is very complicated. The resultant tree can contain implicitly created
  2145. // nodes that have no explicit <tag> listed in r's data, and nodes' parents can
  2146. // differ from the nesting implied by a naive processing of start and end
  2147. // <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
  2148. // with no corresponding node in the resulting tree.
  2149. //
  2150. // The input is assumed to be UTF-8 encoded.
  2151. func Parse(r io.Reader) (*Node, error) {
  2152. p := &parser{
  2153. tokenizer: NewTokenizer(r),
  2154. doc: &Node{
  2155. Type: DocumentNode,
  2156. },
  2157. scripting: true,
  2158. framesetOK: true,
  2159. im: initialIM,
  2160. }
  2161. err := p.parse()
  2162. if err != nil {
  2163. return nil, err
  2164. }
  2165. return p.doc, nil
  2166. }
  2167. // ParseFragment parses a fragment of HTML and returns the nodes that were
  2168. // found. If the fragment is the InnerHTML for an existing element, pass that
  2169. // element in context.
  2170. //
  2171. // It has the same intricacies as Parse.
  2172. func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
  2173. contextTag := ""
  2174. if context != nil {
  2175. if context.Type != ElementNode {
  2176. return nil, errors.New("html: ParseFragment of non-element Node")
  2177. }
  2178. // The next check isn't just context.DataAtom.String() == context.Data because
  2179. // it is valid to pass an element whose tag isn't a known atom. For example,
  2180. // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
  2181. if context.DataAtom != a.Lookup([]byte(context.Data)) {
  2182. return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
  2183. }
  2184. contextTag = context.DataAtom.String()
  2185. }
  2186. p := &parser{
  2187. tokenizer: NewTokenizerFragment(r, contextTag),
  2188. doc: &Node{
  2189. Type: DocumentNode,
  2190. },
  2191. scripting: true,
  2192. fragment: true,
  2193. context: context,
  2194. }
  2195. root := &Node{
  2196. Type: ElementNode,
  2197. DataAtom: a.Html,
  2198. Data: a.Html.String(),
  2199. }
  2200. p.doc.AppendChild(root)
  2201. p.oe = nodeStack{root}
  2202. if context != nil && context.DataAtom == a.Template {
  2203. p.templateStack = append(p.templateStack, inTemplateIM)
  2204. }
  2205. p.resetInsertionMode()
  2206. for n := context; n != nil; n = n.Parent {
  2207. if n.Type == ElementNode && n.DataAtom == a.Form {
  2208. p.form = n
  2209. break
  2210. }
  2211. }
  2212. err := p.parse()
  2213. if err != nil {
  2214. return nil, err
  2215. }
  2216. parent := p.doc
  2217. if context != nil {
  2218. parent = root
  2219. }
  2220. var result []*Node
  2221. for c := parent.FirstChild; c != nil; {
  2222. next := c.NextSibling
  2223. parent.RemoveChild(c)
  2224. result = append(result, c)
  2225. c = next
  2226. }
  2227. return result, nil
  2228. }