// Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package html import ( "errors" "fmt" "io" "strings" a "golang.org/x/net/html/atom" ) // A parser implements the HTML5 parsing algorithm: // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction type parser struct { // tokenizer provides the tokens for the parser. tokenizer *Tokenizer // tok is the most recently read token. tok Token // Self-closing tags like
block.
if d != "" && d[0] == '\r' {
d = d[1:]
}
if d != "" && d[0] == '\n' {
d = d[1:]
}
}
}
d = strings.Replace(d, "\x00", "", -1)
if d == "" {
return true
}
p.reconstructActiveFormattingElements()
p.addText(d)
if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
// There were non-whitespace characters inserted.
p.framesetOK = false
}
case StartTagToken:
switch p.tok.DataAtom {
case a.Html:
if p.oe.contains(a.Template) {
return true
}
copyAttributes(p.oe[0], p.tok)
case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
return inHeadIM(p)
case a.Body:
if p.oe.contains(a.Template) {
return true
}
if len(p.oe) >= 2 {
body := p.oe[1]
if body.Type == ElementNode && body.DataAtom == a.Body {
p.framesetOK = false
copyAttributes(body, p.tok)
}
}
case a.Frameset:
if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
// Ignore the token.
return true
}
body := p.oe[1]
if body.Parent != nil {
body.Parent.RemoveChild(body)
}
p.oe = p.oe[:1]
p.addElement()
p.im = inFramesetIM
return true
case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
p.popUntil(buttonScope, a.P)
p.addElement()
case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
p.popUntil(buttonScope, a.P)
switch n := p.top(); n.DataAtom {
case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
p.oe.pop()
}
p.addElement()
case a.Pre, a.Listing:
p.popUntil(buttonScope, a.P)
p.addElement()
// The newline, if any, will be dealt with by the TextToken case.
p.framesetOK = false
case a.Form:
if p.form != nil && !p.oe.contains(a.Template) {
// Ignore the token
return true
}
p.popUntil(buttonScope, a.P)
p.addElement()
if !p.oe.contains(a.Template) {
p.form = p.top()
}
case a.Li:
p.framesetOK = false
for i := len(p.oe) - 1; i >= 0; i-- {
node := p.oe[i]
switch node.DataAtom {
case a.Li:
p.oe = p.oe[:i]
case a.Address, a.Div, a.P:
continue
default:
if !isSpecialElement(node) {
continue
}
}
break
}
p.popUntil(buttonScope, a.P)
p.addElement()
case a.Dd, a.Dt:
p.framesetOK = false
for i := len(p.oe) - 1; i >= 0; i-- {
node := p.oe[i]
switch node.DataAtom {
case a.Dd, a.Dt:
p.oe = p.oe[:i]
case a.Address, a.Div, a.P:
continue
default:
if !isSpecialElement(node) {
continue
}
}
break
}
p.popUntil(buttonScope, a.P)
p.addElement()
case a.Plaintext:
p.popUntil(buttonScope, a.P)
p.addElement()
case a.Button:
p.popUntil(defaultScope, a.Button)
p.reconstructActiveFormattingElements()
p.addElement()
p.framesetOK = false
case a.A:
for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
p.inBodyEndTagFormatting(a.A, "a")
p.oe.remove(n)
p.afe.remove(n)
break
}
}
p.reconstructActiveFormattingElements()
p.addFormattingElement()
case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
p.reconstructActiveFormattingElements()
p.addFormattingElement()
case a.Nobr:
p.reconstructActiveFormattingElements()
if p.elementInScope(defaultScope, a.Nobr) {
p.inBodyEndTagFormatting(a.Nobr, "nobr")
p.reconstructActiveFormattingElements()
}
p.addFormattingElement()
case a.Applet, a.Marquee, a.Object:
p.reconstructActiveFormattingElements()
p.addElement()
p.afe = append(p.afe, &scopeMarker)
p.framesetOK = false
case a.Table:
if !p.quirks {
p.popUntil(buttonScope, a.P)
}
p.addElement()
p.framesetOK = false
p.im = inTableIM
return true
case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
p.reconstructActiveFormattingElements()
p.addElement()
p.oe.pop()
p.acknowledgeSelfClosingTag()
if p.tok.DataAtom == a.Input {
for _, t := range p.tok.Attr {
if t.Key == "type" {
if strings.ToLower(t.Val) == "hidden" {
// Skip setting framesetOK = false
return true
}
}
}
}
p.framesetOK = false
case a.Param, a.Source, a.Track:
p.addElement()
p.oe.pop()
p.acknowledgeSelfClosingTag()
case a.Hr:
p.popUntil(buttonScope, a.P)
p.addElement()
p.oe.pop()
p.acknowledgeSelfClosingTag()
p.framesetOK = false
case a.Image:
p.tok.DataAtom = a.Img
p.tok.Data = a.Img.String()
return false
case a.Isindex:
if p.form != nil {
// Ignore the token.
return true
}
action := ""
prompt := "This is a searchable index. Enter search keywords: "
attr := []Attribute{{Key: "name", Val: "isindex"}}
for _, t := range p.tok.Attr {
switch t.Key {
case "action":
action = t.Val
case "name":
// Ignore the attribute.
case "prompt":
prompt = t.Val
default:
attr = append(attr, t)
}
}
p.acknowledgeSelfClosingTag()
p.popUntil(buttonScope, a.P)
p.parseImpliedToken(StartTagToken, a.Form, a.Form.String())
if p.form == nil {
// NOTE: The 'isindex' element has been removed,
// and the 'template' element has not been designed to be
// collaborative with the index element.
//
// Ignore the token.
return true
}
if action != "" {
p.form.Attr = []Attribute{{Key: "action", Val: action}}
}
p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
p.parseImpliedToken(StartTagToken, a.Label, a.Label.String())
p.addText(prompt)
p.addChild(&Node{
Type: ElementNode,
DataAtom: a.Input,
Data: a.Input.String(),
Attr: attr,
})
p.oe.pop()
p.parseImpliedToken(EndTagToken, a.Label, a.Label.String())
p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
p.parseImpliedToken(EndTagToken, a.Form, a.Form.String())
case a.Textarea:
p.addElement()
p.setOriginalIM()
p.framesetOK = false
p.im = textIM
case a.Xmp:
p.popUntil(buttonScope, a.P)
p.reconstructActiveFormattingElements()
p.framesetOK = false
p.addElement()
p.setOriginalIM()
p.im = textIM
case a.Iframe:
p.framesetOK = false
p.addElement()
p.setOriginalIM()
p.im = textIM
case a.Noembed, a.Noscript:
p.addElement()
p.setOriginalIM()
p.im = textIM
case a.Select:
p.reconstructActiveFormattingElements()
p.addElement()
p.framesetOK = false
p.im = inSelectIM
return true
case a.Optgroup, a.Option:
if p.top().DataAtom == a.Option {
p.oe.pop()
}
p.reconstructActiveFormattingElements()
p.addElement()
case a.Rb, a.Rtc:
if p.elementInScope(defaultScope, a.Ruby) {
p.generateImpliedEndTags()
}
p.addElement()
case a.Rp, a.Rt:
if p.elementInScope(defaultScope, a.Ruby) {
p.generateImpliedEndTags("rtc")
}
p.addElement()
case a.Math, a.Svg:
p.reconstructActiveFormattingElements()
if p.tok.DataAtom == a.Math {
adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
} else {
adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
}
adjustForeignAttributes(p.tok.Attr)
p.addElement()
p.top().Namespace = p.tok.Data
if p.hasSelfClosingToken {
p.oe.pop()
p.acknowledgeSelfClosingTag()
}
return true
case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
// Ignore the token.
default:
p.reconstructActiveFormattingElements()
p.addElement()
}
case EndTagToken:
switch p.tok.DataAtom {
case a.Body:
if p.elementInScope(defaultScope, a.Body) {
p.im = afterBodyIM
}
case a.Html:
if p.elementInScope(defaultScope, a.Body) {
p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
return false
}
return true
case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
p.popUntil(defaultScope, p.tok.DataAtom)
case a.Form:
if p.oe.contains(a.Template) {
i := p.indexOfElementInScope(defaultScope, a.Form)
if i == -1 {
// Ignore the token.
return true
}
p.generateImpliedEndTags()
if p.oe[i].DataAtom != a.Form {
// Ignore the token.
return true
}
p.popUntil(defaultScope, a.Form)
} else {
node := p.form
p.form = nil
i := p.indexOfElementInScope(defaultScope, a.Form)
if node == nil || i == -1 || p.oe[i] != node {
// Ignore the token.
return true
}
p.generateImpliedEndTags()
p.oe.remove(node)
}
case a.P:
if !p.elementInScope(buttonScope, a.P) {
p.parseImpliedToken(StartTagToken, a.P, a.P.String())
}
p.popUntil(buttonScope, a.P)
case a.Li:
p.popUntil(listItemScope, a.Li)
case a.Dd, a.Dt:
p.popUntil(defaultScope, p.tok.DataAtom)
case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
case a.Applet, a.Marquee, a.Object:
if p.popUntil(defaultScope, p.tok.DataAtom) {
p.clearActiveFormattingElements()
}
case a.Br:
p.tok.Type = StartTagToken
return false
case a.Template:
return inHeadIM(p)
default:
p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
}
case CommentToken:
p.addChild(&Node{
Type: CommentNode,
Data: p.tok.Data,
})
case ErrorToken:
// TODO: remove this divergence from the HTML5 spec.
if len(p.templateStack) > 0 {
p.im = inTemplateIM
return false
} else {
for _, e := range p.oe {
switch e.DataAtom {
case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
a.Thead, a.Tr, a.Body, a.Html:
default:
return true
}
}
}
}
return true
}
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
// This is the "adoption agency" algorithm, described at
// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
// TODO: this is a fairly literal line-by-line translation of that algorithm.
// Once the code successfully parses the comprehensive test suite, we should
// refactor this code to be more idiomatic.
// Steps 1-4. The outer loop.
for i := 0; i < 8; i++ {
// Step 5. Find the formatting element.
var formattingElement *Node
for j := len(p.afe) - 1; j >= 0; j-- {
if p.afe[j].Type == scopeMarkerNode {
break
}
if p.afe[j].DataAtom == tagAtom {
formattingElement = p.afe[j]
break
}
}
if formattingElement == nil {
p.inBodyEndTagOther(tagAtom, tagName)
return
}
feIndex := p.oe.index(formattingElement)
if feIndex == -1 {
p.afe.remove(formattingElement)
return
}
if !p.elementInScope(defaultScope, tagAtom) {
// Ignore the tag.
return
}
// Steps 9-10. Find the furthest block.
var furthestBlock *Node
for _, e := range p.oe[feIndex:] {
if isSpecialElement(e) {
furthestBlock = e
break
}
}
if furthestBlock == nil {
e := p.oe.pop()
for e != formattingElement {
e = p.oe.pop()
}
p.afe.remove(e)
return
}
// Steps 11-12. Find the common ancestor and bookmark node.
commonAncestor := p.oe[feIndex-1]
bookmark := p.afe.index(formattingElement)
// Step 13. The inner loop. Find the lastNode to reparent.
lastNode := furthestBlock
node := furthestBlock
x := p.oe.index(node)
// Steps 13.1-13.2
for j := 0; j < 3; j++ {
// Step 13.3.
x--
node = p.oe[x]
// Step 13.4 - 13.5.
if p.afe.index(node) == -1 {
p.oe.remove(node)
continue
}
// Step 13.6.
if node == formattingElement {
break
}
// Step 13.7.
clone := node.clone()
p.afe[p.afe.index(node)] = clone
p.oe[p.oe.index(node)] = clone
node = clone
// Step 13.8.
if lastNode == furthestBlock {
bookmark = p.afe.index(node) + 1
}
// Step 13.9.
if lastNode.Parent != nil {
lastNode.Parent.RemoveChild(lastNode)
}
node.AppendChild(lastNode)
// Step 13.10.
lastNode = node
}
// Step 14. Reparent lastNode to the common ancestor,
// or for misnested table nodes, to the foster parent.
if lastNode.Parent != nil {
lastNode.Parent.RemoveChild(lastNode)
}
switch commonAncestor.DataAtom {
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
p.fosterParent(lastNode)
default:
commonAncestor.AppendChild(lastNode)
}
// Steps 15-17. Reparent nodes from the furthest block's children
// to a clone of the formatting element.
clone := formattingElement.clone()
reparentChildren(clone, furthestBlock)
furthestBlock.AppendChild(clone)
// Step 18. Fix up the list of active formatting elements.
if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
// Move the bookmark with the rest of the list.
bookmark--
}
p.afe.remove(formattingElement)
p.afe.insert(bookmark, clone)
// Step 19. Fix up the stack of open elements.
p.oe.remove(formattingElement)
p.oe.insert(p.oe.index(furthestBlock)+1, clone)
}
}
// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
for i := len(p.oe) - 1; i >= 0; i-- {
// Two element nodes have the same tag if they have the same Data (a
// string-typed field). As an optimization, for common HTML tags, each
// Data string is assigned a unique, non-zero DataAtom (a uint32-typed
// field), since integer comparison is faster than string comparison.
// Uncommon (custom) tags get a zero DataAtom.
//
// The if condition here is equivalent to (p.oe[i].Data == tagName).
if (p.oe[i].DataAtom == tagAtom) &&
((tagAtom != 0) || (p.oe[i].Data == tagName)) {
p.oe = p.oe[:i]
break
}
if isSpecialElement(p.oe[i]) {
break
}
}
}
// Section 12.2.6.4.8.
func textIM(p *parser) bool {
switch p.tok.Type {
case ErrorToken:
p.oe.pop()
case TextToken:
d := p.tok.Data
if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
// Ignore a newline at the start of a