diff --git a/cf_logo.png b/assets/cf_logo.png
similarity index 100%
rename from cf_logo.png
rename to assets/cf_logo.png
diff --git a/assets/index.gohtml b/assets/index.gohtml
new file mode 100644
index 0000000..1f3f986
--- /dev/null
+++ b/assets/index.gohtml
@@ -0,0 +1,19 @@
+{{- /*gotype: github.com/gerg/http2_tile_demo.TemplateArgs*/ -}}
+
+
+
block. + if d != "" && d[0] == '\r' { + d = d[1:] + } + if d != "" && d[0] == '\n' { + d = d[1:] + } + } + } + d = strings.Replace(d, "\x00", "", -1) + if d == "" { + return true + } + p.reconstructActiveFormattingElements() + p.addText(d) + if p.framesetOK && strings.TrimLeft(d, whitespace) != "" { + // There were non-whitespace characters inserted. + p.framesetOK = false + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + if p.oe.contains(a.Template) { + return true + } + copyAttributes(p.oe[0], p.tok) + case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: + return inHeadIM(p) + case a.Body: + if p.oe.contains(a.Template) { + return true + } + if len(p.oe) >= 2 { + body := p.oe[1] + if body.Type == ElementNode && body.DataAtom == a.Body { + p.framesetOK = false + copyAttributes(body, p.tok) + } + } + case a.Frameset: + if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body { + // Ignore the token. + return true + } + body := p.oe[1] + if body.Parent != nil { + body.Parent.RemoveChild(body) + } + p.oe = p.oe[:1] + p.addElement() + p.im = inFramesetIM + return true + case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul: + p.popUntil(buttonScope, a.P) + p.addElement() + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.popUntil(buttonScope, a.P) + switch n := p.top(); n.DataAtom { + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.oe.pop() + } + p.addElement() + case a.Pre, a.Listing: + p.popUntil(buttonScope, a.P) + p.addElement() + // The newline, if any, will be dealt with by the TextToken case. + p.framesetOK = false + case a.Form: + if p.form != nil && !p.oe.contains(a.Template) { + // Ignore the token + return true + } + p.popUntil(buttonScope, a.P) + p.addElement() + if !p.oe.contains(a.Template) { + p.form = p.top() + } + case a.Li: + p.framesetOK = false + for i := len(p.oe) - 1; i >= 0; i-- { + node := p.oe[i] + switch node.DataAtom { + case a.Li: + p.oe = p.oe[:i] + case a.Address, a.Div, a.P: + continue + default: + if !isSpecialElement(node) { + continue + } + } + break + } + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Dd, a.Dt: + p.framesetOK = false + for i := len(p.oe) - 1; i >= 0; i-- { + node := p.oe[i] + switch node.DataAtom { + case a.Dd, a.Dt: + p.oe = p.oe[:i] + case a.Address, a.Div, a.P: + continue + default: + if !isSpecialElement(node) { + continue + } + } + break + } + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Plaintext: + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Button: + p.popUntil(defaultScope, a.Button) + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + case a.A: + for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { + if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A { + p.inBodyEndTagFormatting(a.A, "a") + p.oe.remove(n) + p.afe.remove(n) + break + } + } + p.reconstructActiveFormattingElements() + p.addFormattingElement() + case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: + p.reconstructActiveFormattingElements() + p.addFormattingElement() + case a.Nobr: + p.reconstructActiveFormattingElements() + if p.elementInScope(defaultScope, a.Nobr) { + p.inBodyEndTagFormatting(a.Nobr, "nobr") + p.reconstructActiveFormattingElements() + } + p.addFormattingElement() + case a.Applet, 
a.Marquee, a.Object: + p.reconstructActiveFormattingElements() + p.addElement() + p.afe = append(p.afe, &scopeMarker) + p.framesetOK = false + case a.Table: + if !p.quirks { + p.popUntil(buttonScope, a.P) + } + p.addElement() + p.framesetOK = false + p.im = inTableIM + return true + case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr: + p.reconstructActiveFormattingElements() + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + if p.tok.DataAtom == a.Input { + for _, t := range p.tok.Attr { + if t.Key == "type" { + if strings.ToLower(t.Val) == "hidden" { + // Skip setting framesetOK = false + return true + } + } + } + } + p.framesetOK = false + case a.Param, a.Source, a.Track: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + case a.Hr: + p.popUntil(buttonScope, a.P) + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + p.framesetOK = false + case a.Image: + p.tok.DataAtom = a.Img + p.tok.Data = a.Img.String() + return false + case a.Textarea: + p.addElement() + p.setOriginalIM() + p.framesetOK = false + p.im = textIM + case a.Xmp: + p.popUntil(buttonScope, a.P) + p.reconstructActiveFormattingElements() + p.framesetOK = false + p.parseGenericRawTextElement() + case a.Iframe: + p.framesetOK = false + p.parseGenericRawTextElement() + case a.Noembed: + p.parseGenericRawTextElement() + case a.Noscript: + if p.scripting { + p.parseGenericRawTextElement() + return true + } + p.reconstructActiveFormattingElements() + p.addElement() + // Don't let the tokenizer go into raw text mode when scripting is disabled. + p.tokenizer.NextIsNotRawText() + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectIM + return true + case a.Optgroup, a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + p.reconstructActiveFormattingElements() + p.addElement() + case a.Rb, a.Rtc: + if p.elementInScope(defaultScope, a.Ruby) { + p.generateImpliedEndTags() + } + p.addElement() + case a.Rp, a.Rt: + if p.elementInScope(defaultScope, a.Ruby) { + p.generateImpliedEndTags("rtc") + } + p.addElement() + case a.Math, a.Svg: + p.reconstructActiveFormattingElements() + if p.tok.DataAtom == a.Math { + adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) + } else { + adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) + } + adjustForeignAttributes(p.tok.Attr) + p.addElement() + p.top().Namespace = p.tok.Data + if p.hasSelfClosingToken { + p.oe.pop() + p.acknowledgeSelfClosingTag() + } + return true + case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + default: + p.reconstructActiveFormattingElements() + p.addElement() + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Body: + if p.elementInScope(defaultScope, a.Body) { + p.im = afterBodyIM + } + case a.Html: + if p.elementInScope(defaultScope, a.Body) { + p.parseImpliedToken(EndTagToken, a.Body, a.Body.String()) + return false + } + return true + case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul: + p.popUntil(defaultScope, p.tok.DataAtom) + case a.Form: + if p.oe.contains(a.Template) { + i := p.indexOfElementInScope(defaultScope, a.Form) + if i == -1 { + // Ignore the token. + return true + } + p.generateImpliedEndTags() + if p.oe[i].DataAtom != a.Form { + // Ignore the token. 
+ return true + } + p.popUntil(defaultScope, a.Form) + } else { + node := p.form + p.form = nil + i := p.indexOfElementInScope(defaultScope, a.Form) + if node == nil || i == -1 || p.oe[i] != node { + // Ignore the token. + return true + } + p.generateImpliedEndTags() + p.oe.remove(node) + } + case a.P: + if !p.elementInScope(buttonScope, a.P) { + p.parseImpliedToken(StartTagToken, a.P, a.P.String()) + } + p.popUntil(buttonScope, a.P) + case a.Li: + p.popUntil(listItemScope, a.Li) + case a.Dd, a.Dt: + p.popUntil(defaultScope, p.tok.DataAtom) + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6) + case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: + p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data) + case a.Applet, a.Marquee, a.Object: + if p.popUntil(defaultScope, p.tok.DataAtom) { + p.clearActiveFormattingElements() + } + case a.Br: + p.tok.Type = StartTagToken + return false + case a.Template: + return inHeadIM(p) + default: + p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case ErrorToken: + // TODO: remove this divergence from the HTML5 spec. + if len(p.templateStack) > 0 { + p.im = inTemplateIM + return false + } + for _, e := range p.oe { + switch e.DataAtom { + case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th, + a.Thead, a.Tr, a.Body, a.Html: + default: + return true + } + } + } + + return true +} + +func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) { + // This is the "adoption agency" algorithm, described at + // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency + + // TODO: this is a fairly literal line-by-line translation of that algorithm. + // Once the code successfully parses the comprehensive test suite, we should + // refactor this code to be more idiomatic. + + // Steps 1-2 + if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 { + p.oe.pop() + return + } + + // Steps 3-5. The outer loop. + for i := 0; i < 8; i++ { + // Step 6. Find the formatting element. + var formattingElement *Node + for j := len(p.afe) - 1; j >= 0; j-- { + if p.afe[j].Type == scopeMarkerNode { + break + } + if p.afe[j].DataAtom == tagAtom { + formattingElement = p.afe[j] + break + } + } + if formattingElement == nil { + p.inBodyEndTagOther(tagAtom, tagName) + return + } + + // Step 7. Ignore the tag if formatting element is not in the stack of open elements. + feIndex := p.oe.index(formattingElement) + if feIndex == -1 { + p.afe.remove(formattingElement) + return + } + // Step 8. Ignore the tag if formatting element is not in the scope. + if !p.elementInScope(defaultScope, tagAtom) { + // Ignore the tag. + return + } + + // Step 9. This step is omitted because it's just a parse error but no need to return. + + // Steps 10-11. Find the furthest block. + var furthestBlock *Node + for _, e := range p.oe[feIndex:] { + if isSpecialElement(e) { + furthestBlock = e + break + } + } + if furthestBlock == nil { + e := p.oe.pop() + for e != formattingElement { + e = p.oe.pop() + } + p.afe.remove(e) + return + } + + // Steps 12-13. Find the common ancestor and bookmark node. + commonAncestor := p.oe[feIndex-1] + bookmark := p.afe.index(formattingElement) + + // Step 14. The inner loop. Find the lastNode to reparent. 
+ lastNode := furthestBlock + node := furthestBlock + x := p.oe.index(node) + // Step 14.1. + j := 0 + for { + // Step 14.2. + j++ + // Step. 14.3. + x-- + node = p.oe[x] + // Step 14.4. Go to the next step if node is formatting element. + if node == formattingElement { + break + } + // Step 14.5. Remove node from the list of active formatting elements if + // inner loop counter is greater than three and node is in the list of + // active formatting elements. + if ni := p.afe.index(node); j > 3 && ni > -1 { + p.afe.remove(node) + // If any element of the list of active formatting elements is removed, + // we need to take care whether bookmark should be decremented or not. + // This is because the value of bookmark may exceed the size of the + // list by removing elements from the list. + if ni <= bookmark { + bookmark-- + } + continue + } + // Step 14.6. Continue the next inner loop if node is not in the list of + // active formatting elements. + if p.afe.index(node) == -1 { + p.oe.remove(node) + continue + } + // Step 14.7. + clone := node.clone() + p.afe[p.afe.index(node)] = clone + p.oe[p.oe.index(node)] = clone + node = clone + // Step 14.8. + if lastNode == furthestBlock { + bookmark = p.afe.index(node) + 1 + } + // Step 14.9. + if lastNode.Parent != nil { + lastNode.Parent.RemoveChild(lastNode) + } + node.AppendChild(lastNode) + // Step 14.10. + lastNode = node + } + + // Step 15. Reparent lastNode to the common ancestor, + // or for misnested table nodes, to the foster parent. + if lastNode.Parent != nil { + lastNode.Parent.RemoveChild(lastNode) + } + switch commonAncestor.DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + p.fosterParent(lastNode) + default: + commonAncestor.AppendChild(lastNode) + } + + // Steps 16-18. Reparent nodes from the furthest block's children + // to a clone of the formatting element. + clone := formattingElement.clone() + reparentChildren(clone, furthestBlock) + furthestBlock.AppendChild(clone) + + // Step 19. Fix up the list of active formatting elements. + if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark { + // Move the bookmark with the rest of the list. + bookmark-- + } + p.afe.remove(formattingElement) + p.afe.insert(bookmark, clone) + + // Step 20. Fix up the stack of open elements. + p.oe.remove(formattingElement) + p.oe.insert(p.oe.index(furthestBlock)+1, clone) + } +} + +// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. +// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content +// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign +func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) { + for i := len(p.oe) - 1; i >= 0; i-- { + // Two element nodes have the same tag if they have the same Data (a + // string-typed field). As an optimization, for common HTML tags, each + // Data string is assigned a unique, non-zero DataAtom (a uint32-typed + // field), since integer comparison is faster than string comparison. + // Uncommon (custom) tags get a zero DataAtom. + // + // The if condition here is equivalent to (p.oe[i].Data == tagName). + if (p.oe[i].DataAtom == tagAtom) && + ((tagAtom != 0) || (p.oe[i].Data == tagName)) { + p.oe = p.oe[:i] + break + } + if isSpecialElement(p.oe[i]) { + break + } + } +} + +// Section 12.2.6.4.8. 
+func textIM(p *parser) bool { + switch p.tok.Type { + case ErrorToken: + p.oe.pop() + case TextToken: + d := p.tok.Data + if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil { + // Ignore a newline at the start of a block. + if d != "" && d[0] == '\r' { + d = d[1:] + } + if d != "" && d[0] == '\n' { + d = d[1:] + } + } + if d == "" { + return true + } + p.addText(d) + return true + case EndTagToken: + p.oe.pop() + } + p.im = p.originalIM + p.originalIM = nil + return p.tok.Type == EndTagToken +} + +// Section 12.2.6.4.9. +func inTableIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) + switch p.oe.top().DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + if strings.Trim(p.tok.Data, whitespace) == "" { + p.addText(p.tok.Data) + return true + } + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Caption: + p.clearStackToContext(tableScope) + p.afe = append(p.afe, &scopeMarker) + p.addElement() + p.im = inCaptionIM + return true + case a.Colgroup: + p.clearStackToContext(tableScope) + p.addElement() + p.im = inColumnGroupIM + return true + case a.Col: + p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) + return false + case a.Tbody, a.Tfoot, a.Thead: + p.clearStackToContext(tableScope) + p.addElement() + p.im = inTableBodyIM + return true + case a.Td, a.Th, a.Tr: + p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) + return false + case a.Table: + if p.popUntil(tableScope, a.Table) { + p.resetInsertionMode() + return false + } + // Ignore the token. + return true + case a.Style, a.Script, a.Template: + return inHeadIM(p) + case a.Input: + for _, t := range p.tok.Attr { + if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { + p.addElement() + p.oe.pop() + return true + } + } + // Otherwise drop down to the default action. + case a.Form: + if p.oe.contains(a.Template) || p.form != nil { + // Ignore the token. + return true + } + p.addElement() + p.form = p.oe.pop() + case a.Select: + p.reconstructActiveFormattingElements() + switch p.top().DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + p.fosterParenting = true + } + p.addElement() + p.fosterParenting = false + p.framesetOK = false + p.im = inSelectInTableIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Table: + if p.popUntil(tableScope, a.Table) { + p.resetInsertionMode() + return true + } + // Ignore the token. + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + return true + case a.Template: + return inHeadIM(p) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + // Ignore the token. + return true + case ErrorToken: + return inBodyIM(p) + } + + p.fosterParenting = true + defer func() { p.fosterParenting = false }() + + return inBodyIM(p) +} + +// Section 12.2.6.4.11. +func inCaptionIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: + if !p.popUntil(tableScope, a.Caption) { + // Ignore the token. 
+ return true + } + p.clearActiveFormattingElements() + p.im = inTableIM + return false + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectInTableIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Caption: + if p.popUntil(tableScope, a.Caption) { + p.clearActiveFormattingElements() + p.im = inTableIM + } + return true + case a.Table: + if !p.popUntil(tableScope, a.Caption) { + // Ignore the token. + return true + } + p.clearActiveFormattingElements() + p.im = inTableIM + return false + case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + return true + } + } + return inBodyIM(p) +} + +// Section 12.2.6.4.12. +func inColumnGroupIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) < len(p.tok.Data) { + // Add the initial whitespace to the current node. + p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) + if s == "" { + return true + } + p.tok.Data = s + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + // Ignore the token. + return true + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Col: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + return true + case a.Template: + return inHeadIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Colgroup: + if p.oe.top().DataAtom == a.Colgroup { + p.oe.pop() + p.im = inTableIM + } + return true + case a.Col: + // Ignore the token. + return true + case a.Template: + return inHeadIM(p) + } + case ErrorToken: + return inBodyIM(p) + } + if p.oe.top().DataAtom != a.Colgroup { + return true + } + p.oe.pop() + p.im = inTableIM + return false +} + +// Section 12.2.6.4.13. +func inTableBodyIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Tr: + p.clearStackToContext(tableBodyScope) + p.addElement() + p.im = inRowIM + return true + case a.Td, a.Th: + p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) + return false + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: + if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { + p.im = inTableIM + return false + } + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Tbody, a.Tfoot, a.Thead: + if p.elementInScope(tableScope, p.tok.DataAtom) { + p.clearStackToContext(tableBodyScope) + p.oe.pop() + p.im = inTableIM + } + return true + case a.Table: + if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { + p.im = inTableIM + return false + } + // Ignore the token. + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: + // Ignore the token. + return true + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + } + + return inTableIM(p) +} + +// Section 12.2.6.4.14. +func inRowIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Td, a.Th: + p.clearStackToContext(tableRowScope) + p.addElement() + p.afe = append(p.afe, &scopeMarker) + p.im = inCellIM + return true + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: + if p.popUntil(tableScope, a.Tr) { + p.im = inTableBodyIM + return false + } + // Ignore the token. 
+ return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Tr: + if p.popUntil(tableScope, a.Tr) { + p.im = inTableBodyIM + return true + } + // Ignore the token. + return true + case a.Table: + if p.popUntil(tableScope, a.Tr) { + p.im = inTableBodyIM + return false + } + // Ignore the token. + return true + case a.Tbody, a.Tfoot, a.Thead: + if p.elementInScope(tableScope, p.tok.DataAtom) { + p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) + return false + } + // Ignore the token. + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: + // Ignore the token. + return true + } + } + + return inTableIM(p) +} + +// Section 12.2.6.4.15. +func inCellIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + if p.popUntil(tableScope, a.Td, a.Th) { + // Close the cell and reprocess. + p.clearActiveFormattingElements() + p.im = inRowIM + return false + } + // Ignore the token. + return true + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectInTableIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Td, a.Th: + if !p.popUntil(tableScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + p.clearActiveFormattingElements() + p.im = inRowIM + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: + // Ignore the token. + return true + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + if !p.elementInScope(tableScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + // Close the cell and reprocess. + if p.popUntil(tableScope, a.Td, a.Th) { + p.clearActiveFormattingElements() + } + p.im = inRowIM + return false + } + } + return inBodyIM(p) +} + +// Section 12.2.6.4.16. +func inSelectIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + p.addElement() + case a.Optgroup: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + if p.top().DataAtom == a.Optgroup { + p.oe.pop() + } + p.addElement() + case a.Select: + if !p.popUntil(selectScope, a.Select) { + // Ignore the token. + return true + } + p.resetInsertionMode() + case a.Input, a.Keygen, a.Textarea: + if p.elementInScope(selectScope, a.Select) { + p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) + return false + } + // In order to properly ignore , we need to change the tokenizer mode. + p.tokenizer.NextIsNotRawText() + // Ignore the token. + return true + case a.Script, a.Template: + return inHeadIM(p) + case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp: + // Don't let the tokenizer go into raw text mode when there are raw tags + // to be ignored. These tags should be ignored from the tokenizer + // properly. + p.tokenizer.NextIsNotRawText() + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + case a.Optgroup: + i := len(p.oe) - 1 + if p.oe[i].DataAtom == a.Option { + i-- + } + if p.oe[i].DataAtom == a.Optgroup { + p.oe = p.oe[:i] + } + case a.Select: + if !p.popUntil(selectScope, a.Select) { + // Ignore the token. 
+ return true + } + p.resetInsertionMode() + case a.Template: + return inHeadIM(p) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case DoctypeToken: + // Ignore the token. + return true + case ErrorToken: + return inBodyIM(p) + } + + return true +} + +// Section 12.2.6.4.17. +func inSelectInTableIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken, EndTagToken: + switch p.tok.DataAtom { + case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: + if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + // This is like p.popUntil(selectScope, a.Select), but it also + // matches , not just . Matching the MathML + // tag is arguably incorrect (conceptually), but it mimics what + // Chromium does. + for i := len(p.oe) - 1; i >= 0; i-- { + if n := p.oe[i]; n.DataAtom == a.Select { + p.oe = p.oe[:i] + break + } + } + p.resetInsertionMode() + return false + } + } + return inSelectIM(p) +} + +// Section 12.2.6.4.18. +func inTemplateIM(p *parser) bool { + switch p.tok.Type { + case TextToken, CommentToken, DoctypeToken: + return inBodyIM(p) + case StartTagToken: + switch p.tok.DataAtom { + case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: + return inHeadIM(p) + case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inTableIM) + p.im = inTableIM + return false + case a.Col: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inColumnGroupIM) + p.im = inColumnGroupIM + return false + case a.Tr: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inTableBodyIM) + p.im = inTableBodyIM + return false + case a.Td, a.Th: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inRowIM) + p.im = inRowIM + return false + default: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inBodyIM) + p.im = inBodyIM + return false + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Template: + return inHeadIM(p) + default: + // Ignore the token. + return true + } + case ErrorToken: + if !p.oe.contains(a.Template) { + // Ignore the token. + return true + } + // TODO: remove this divergence from the HTML5 spec. + // + // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 + p.generateImpliedEndTags() + for i := len(p.oe) - 1; i >= 0; i-- { + if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { + p.oe = p.oe[:i] + break + } + } + p.clearActiveFormattingElements() + p.templateStack.pop() + p.resetInsertionMode() + return false + } + return false +} + +// Section 12.2.6.4.19. +func afterBodyIM(p *parser) bool { + switch p.tok.Type { + case ErrorToken: + // Stop parsing. + return true + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) == 0 { + // It was all whitespace. + return inBodyIM(p) + } + case StartTagToken: + if p.tok.DataAtom == a.Html { + return inBodyIM(p) + } + case EndTagToken: + if p.tok.DataAtom == a.Html { + if !p.fragment { + p.im = afterAfterBodyIM + } + return true + } + case CommentToken: + // The comment is attached to the element. + if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { + panic("html: bad parser state: element not found, in the after-body insertion mode") + } + p.oe[0].AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + } + p.im = inBodyIM + return false +} + +// Section 12.2.6.4.20. 
+func inFramesetIM(p *parser) bool { + switch p.tok.Type { + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case TextToken: + // Ignore all text but whitespace. + s := strings.Map(func(c rune) rune { + switch c { + case ' ', '\t', '\n', '\f', '\r': + return c + } + return -1 + }, p.tok.Data) + if s != "" { + p.addText(s) + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Frameset: + p.addElement() + case a.Frame: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + case a.Noframes: + return inHeadIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Frameset: + if p.oe.top().DataAtom != a.Html { + p.oe.pop() + if p.oe.top().DataAtom != a.Frameset { + p.im = afterFramesetIM + return true + } + } + } + default: + // Ignore the token. + } + return true +} + +// Section 12.2.6.4.21. +func afterFramesetIM(p *parser) bool { + switch p.tok.Type { + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case TextToken: + // Ignore all text but whitespace. + s := strings.Map(func(c rune) rune { + switch c { + case ' ', '\t', '\n', '\f', '\r': + return c + } + return -1 + }, p.tok.Data) + if s != "" { + p.addText(s) + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Noframes: + return inHeadIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Html: + p.im = afterAfterFramesetIM + return true + } + default: + // Ignore the token. + } + return true +} + +// Section 12.2.6.4.22. +func afterAfterBodyIM(p *parser) bool { + switch p.tok.Type { + case ErrorToken: + // Stop parsing. + return true + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) == 0 { + // It was all whitespace. + return inBodyIM(p) + } + case StartTagToken: + if p.tok.DataAtom == a.Html { + return inBodyIM(p) + } + case CommentToken: + p.doc.AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + return inBodyIM(p) + } + p.im = inBodyIM + return false +} + +// Section 12.2.6.4.23. +func afterAfterFramesetIM(p *parser) bool { + switch p.tok.Type { + case CommentToken: + p.doc.AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case TextToken: + // Ignore all text but whitespace. + s := strings.Map(func(c rune) rune { + switch c { + case ' ', '\t', '\n', '\f', '\r': + return c + } + return -1 + }, p.tok.Data) + if s != "" { + p.tok.Data = s + return inBodyIM(p) + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Noframes: + return inHeadIM(p) + } + case DoctypeToken: + return inBodyIM(p) + default: + // Ignore the token. 
+ } + return true +} + +func ignoreTheRemainingTokens(p *parser) bool { + return true +} + +const whitespaceOrNUL = whitespace + "\x00" + +// Section 12.2.6.5 +func parseForeignContent(p *parser) bool { + switch p.tok.Type { + case TextToken: + if p.framesetOK { + p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" + } + p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) + p.addText(p.tok.Data) + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case StartTagToken: + if !p.fragment { + b := breakout[p.tok.Data] + if p.tok.DataAtom == a.Font { + loop: + for _, attr := range p.tok.Attr { + switch attr.Key { + case "color", "face", "size": + b = true + break loop + } + } + } + if b { + for i := len(p.oe) - 1; i >= 0; i-- { + n := p.oe[i] + if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { + p.oe = p.oe[:i+1] + break + } + } + return false + } + } + current := p.adjustedCurrentNode() + switch current.Namespace { + case "math": + adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) + case "svg": + // Adjust SVG tag names. The tokenizer lower-cases tag names, but + // SVG wants e.g. "foreignObject" with a capital second "O". + if x := svgTagNameAdjustments[p.tok.Data]; x != "" { + p.tok.DataAtom = a.Lookup([]byte(x)) + p.tok.Data = x + } + adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) + default: + panic("html: bad parser state: unexpected namespace") + } + adjustForeignAttributes(p.tok.Attr) + namespace := current.Namespace + p.addElement() + p.top().Namespace = namespace + if namespace != "" { + // Don't let the tokenizer go into raw text mode in foreign content + // (e.g. in an SVG tag). + p.tokenizer.NextIsNotRawText() + } + if p.hasSelfClosingToken { + p.oe.pop() + p.acknowledgeSelfClosingTag() + } + case EndTagToken: + for i := len(p.oe) - 1; i >= 0; i-- { + if p.oe[i].Namespace == "" { + return p.im(p) + } + if strings.EqualFold(p.oe[i].Data, p.tok.Data) { + p.oe = p.oe[:i] + break + } + } + return true + default: + // Ignore the token. + } + return true +} + +// Section 12.2.4.2. +func (p *parser) adjustedCurrentNode() *Node { + if len(p.oe) == 1 && p.fragment && p.context != nil { + return p.context + } + return p.oe.top() +} + +// Section 12.2.6. +func (p *parser) inForeignContent() bool { + if len(p.oe) == 0 { + return false + } + n := p.adjustedCurrentNode() + if n.Namespace == "" { + return false + } + if mathMLTextIntegrationPoint(n) { + if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { + return false + } + if p.tok.Type == TextToken { + return false + } + } + if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { + return false + } + if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { + return false + } + if p.tok.Type == ErrorToken { + return false + } + return true +} + +// parseImpliedToken parses a token as though it had appeared in the parser's +// input. +func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { + realToken, selfClosing := p.tok, p.hasSelfClosingToken + p.tok = Token{ + Type: t, + DataAtom: dataAtom, + Data: data, + } + p.hasSelfClosingToken = false + p.parseCurrentToken() + p.tok, p.hasSelfClosingToken = realToken, selfClosing +} + +// parseCurrentToken runs the current token through the parsing routines +// until it is consumed. 
+func (p *parser) parseCurrentToken() {
+	if p.tok.Type == SelfClosingTagToken {
+		p.hasSelfClosingToken = true
+		p.tok.Type = StartTagToken
+	}
+
+	consumed := false
+	for !consumed {
+		if p.inForeignContent() {
+			consumed = parseForeignContent(p)
+		} else {
+			consumed = p.im(p)
+		}
+	}
+
+	if p.hasSelfClosingToken {
+		// This is a parse error, but ignore it.
+		p.hasSelfClosingToken = false
+	}
+}
+
+func (p *parser) parse() error {
+	// Iterate until EOF. Any other error will cause an early return.
+	var err error
+	for err != io.EOF {
+		// CDATA sections are allowed only in foreign content.
+		n := p.oe.top()
+		p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
+		// Read and parse the next token.
+		p.tokenizer.Next()
+		p.tok = p.tokenizer.Token()
+		if p.tok.Type == ErrorToken {
+			err = p.tokenizer.Err()
+			if err != nil && err != io.EOF {
+				return err
+			}
+		}
+		p.parseCurrentToken()
+	}
+	return nil
+}
+
+// Parse returns the parse tree for the HTML from the given Reader.
+//
+// It implements the HTML5 parsing algorithm
+// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
+// which is very complicated. The resultant tree can contain implicitly created
+// nodes that have no explicit <tag> listed in r's data, and nodes' parents can
+// differ from the nesting implied by a naive processing of start and end
+// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
+// with no corresponding node in the resulting tree.
+//
+// The input is assumed to be UTF-8 encoded.
+func Parse(r io.Reader) (*Node, error) {
+	return ParseWithOptions(r)
+}
+
+// ParseFragment parses a fragment of HTML and returns the nodes that were
+// found. If the fragment is the InnerHTML for an existing element, pass that
+// element in context.
+//
+// It has the same intricacies as Parse.
+func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
+	return ParseFragmentWithOptions(r, context)
+}
+
+// ParseOption configures a parser.
+type ParseOption func(p *parser)
+
+// ParseOptionEnableScripting configures the scripting flag.
+// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting
+//
+// By default, scripting is enabled.
+func ParseOptionEnableScripting(enable bool) ParseOption {
+	return func(p *parser) {
+		p.scripting = enable
+	}
+}
+
+// ParseWithOptions is like Parse, with options.
+func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
+	p := &parser{
+		tokenizer: NewTokenizer(r),
+		doc: &Node{
+			Type: DocumentNode,
+		},
+		scripting:  true,
+		framesetOK: true,
+		im:         initialIM,
+	}
+
+	for _, f := range opts {
+		f(p)
+	}
+
+	if err := p.parse(); err != nil {
+		return nil, err
+	}
+	return p.doc, nil
+}
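A usage sketch for the Parse API vendored above, assuming only the exported names this diff adds (Parse, Node, ElementNode): parse a document, then walk the tree recursively through FirstChild/NextSibling.

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	doc, err := html.Parse(strings.NewReader(`<p><a href="/x">link</a></p>`))
	if err != nil {
		panic(err)
	}
	// Depth-first walk: print the href of every anchor element.
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					fmt.Println(attr.Val) // "/x"
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
}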
+// ParseFragmentWithOptions is like ParseFragment, with options.
+func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {
+	contextTag := ""
+	if context != nil {
+		if context.Type != ElementNode {
+			return nil, errors.New("html: ParseFragment of non-element Node")
+		}
+		// The next check isn't just context.DataAtom.String() == context.Data because
+		// it is valid to pass an element whose tag isn't a known atom. For example,
+		// DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
+		if context.DataAtom != a.Lookup([]byte(context.Data)) {
+			return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
+		}
+		contextTag = context.DataAtom.String()
+	}
+	p := &parser{
+		doc: &Node{
+			Type: DocumentNode,
+		},
+		scripting: true,
+		fragment:  true,
+		context:   context,
+	}
+	if context != nil && context.Namespace != "" {
+		p.tokenizer = NewTokenizer(r)
+	} else {
+		p.tokenizer = NewTokenizerFragment(r, contextTag)
+	}
+
+	for _, f := range opts {
+		f(p)
+	}
+
+	root := &Node{
+		Type:     ElementNode,
+		DataAtom: a.Html,
+		Data:     a.Html.String(),
+	}
+	p.doc.AppendChild(root)
+	p.oe = nodeStack{root}
+	if context != nil && context.DataAtom == a.Template {
+		p.templateStack = append(p.templateStack, inTemplateIM)
+	}
+	p.resetInsertionMode()
+
+	for n := context; n != nil; n = n.Parent {
+		if n.Type == ElementNode && n.DataAtom == a.Form {
+			p.form = n
+			break
+		}
+	}
+
+	if err := p.parse(); err != nil {
+		return nil, err
+	}
+
+	parent := p.doc
+	if context != nil {
+		parent = root
+	}
+
+	var result []*Node
+	for c := parent.FirstChild; c != nil; {
+		next := c.NextSibling
+		parent.RemoveChild(c)
+		result = append(result, c)
+		c = next
+	}
+	return result, nil
+}
diff --git a/vendor/golang.org/x/net/html/render.go b/vendor/golang.org/x/net/html/render.go
new file mode 100644
index 0000000..b46d81c
--- /dev/null
+++ b/vendor/golang.org/x/net/html/render.go
@@ -0,0 +1,273 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+)
+
+type writer interface {
+	io.Writer
+	io.ByteWriter
+	WriteString(string) (int, error)
+}
+
+// Render renders the parse tree n to the given writer.
+//
+// Rendering is done on a 'best effort' basis: calling Parse on the output of
+// Render will always result in something similar to the original tree, but it
+// is not necessarily an exact clone unless the original tree was 'well-formed'.
+// 'Well-formed' is not easily specified; the HTML5 specification is
+// complicated.
+//
+// Calling Parse on arbitrary input typically results in a 'well-formed' parse
+// tree. However, it is possible for Parse to yield a 'badly-formed' parse tree.
+// For example, in a 'well-formed' parse tree, no <a> element is a child of
+// another <a> element: parsing "<a><a>" results in two sibling elements.
+// Similarly, in a 'well-formed' parse tree, no <a> element is a child of a
+// <table> element: parsing "<p><table><a>" results in a <p> with two sibling
+// children; the <a> is reparented to the <table>'s parent. However, calling
+// Parse on "<a><table><a>" does not return an error, but the result has an <a>
+// element with an <a> child, and is therefore not 'well-formed'.
+//
+// Programmatically constructed trees are typically also 'well-formed', but it
+// is possible to construct a tree that looks innocuous but, when rendered and
+// re-parsed, results in a different tree. A simple example is that a solitary
+// text node would become a tree containing <html>, <head> and <body> elements.
+// Another example is that the programmatic equivalent of "a<head>b</head>c"
+// becomes "<html><head><head/><body>abc</body></html>".
+func Render(w io.Writer, n *Node) error {
+	if x, ok := w.(writer); ok {
+		return render(x, n)
+	}
+	buf := bufio.NewWriter(w)
+	if err := render(buf, n); err != nil {
+		return err
+	}
+	return buf.Flush()
+}
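The round-trip caveat in the Render comment above pairs with the adoption agency algorithm vendored earlier in parse.go. A minimal sketch, assuming the public Parse and Render added by this diff, shows misnested formatting tags being restructured, so Parse followed by Render is not the identity on badly-formed input:

package main

import (
	"bytes"
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	// The <i> is misnested: it opens inside <b> but closes outside it.
	doc, err := html.Parse(strings.NewReader("<p><b>1<i>2</b>3</i></p>"))
	if err != nil {
		panic(err)
	}
	var buf bytes.Buffer
	if err := html.Render(&buf, doc); err != nil {
		panic(err)
	}
	// The adoption agency algorithm splits the <i> so every element nests,
	// printing roughly:
	// <html><head></head><body><p><b>1<i>2</i></b><i>3</i></p></body></html>
	fmt.Println(buf.String())
}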
+// plaintextAbort is returned from render1 when a <plaintext> element
+// has been rendered. No more end tags should be rendered after that.
+var plaintextAbort = errors.New("html: internal error (plaintext abort)")
+
+func render(w writer, n *Node) error {
+	err := render1(w, n)
+	if err == plaintextAbort {
+		err = nil
+	}
+	return err
+}
+
+func render1(w writer, n *Node) error {
+	// Render non-element nodes; these are the easy cases.
+	switch n.Type {
+	case ErrorNode:
+		return errors.New("html: cannot render an ErrorNode node")
+	case TextNode:
+		return escape(w, n.Data)
+	case DocumentNode:
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			if err := render1(w, c); err != nil {
+				return err
+			}
+		}
+		return nil
+	case ElementNode:
+		// No-op.
+	case CommentNode:
+		if _, err := w.WriteString("<!--"); err != nil {
+			return err
+		}
+		if _, err := w.WriteString(n.Data); err != nil {
+			return err
+		}
+		if _, err := w.WriteString("-->"); err != nil {
+			return err
+		}
+		return nil
+	case DoctypeNode:
+		if _, err := w.WriteString("<!DOCTYPE "); err != nil {
+			return err
+		}
+		if _, err := w.WriteString(n.Data); err != nil {
+			return err
+		}
+		if n.Attr != nil {
+			var p, s string
+			for _, a := range n.Attr {
+				switch a.Key {
+				case "public":
+					p = a.Val
+				case "system":
+					s = a.Val
+				}
+			}
+			if p != "" {
+				if _, err := w.WriteString(" PUBLIC "); err != nil {
+					return err
+				}
+				if err := writeQuoted(w, p); err != nil {
+					return err
+				}
+				if s != "" {
+					if err := w.WriteByte(' '); err != nil {
+						return err
+					}
+					if err := writeQuoted(w, s); err != nil {
+						return err
+					}
+				}
+			} else if s != "" {
+				if _, err := w.WriteString(" SYSTEM "); err != nil {
+					return err
+				}
+				if err := writeQuoted(w, s); err != nil {
+					return err
+				}
+			}
+		}
+		return w.WriteByte('>')
+	case RawNode:
+		_, err := w.WriteString(n.Data)
+		return err
+	default:
+		return errors.New("html: unknown node type")
+	}
+
+	// Render the opening tag.
+	if err := w.WriteByte('<'); err != nil {
+		return err
+	}
+	if _, err := w.WriteString(n.Data); err != nil {
+		return err
+	}
+	for _, a := range n.Attr {
+		if err := w.WriteByte(' '); err != nil {
+			return err
+		}
+		if a.Namespace != "" {
+			if _, err := w.WriteString(a.Namespace); err != nil {
+				return err
+			}
+			if err := w.WriteByte(':'); err != nil {
+				return err
+			}
+		}
+		if _, err := w.WriteString(a.Key); err != nil {
+			return err
+		}
+		if _, err := w.WriteString(`="`); err != nil {
+			return err
+		}
+		if err := escape(w, a.Val); err != nil {
+			return err
+		}
+		if err := w.WriteByte('"'); err != nil {
+			return err
+		}
+	}
+	if voidElements[n.Data] {
+		if n.FirstChild != nil {
+			return fmt.Errorf("html: void element <%s> has child nodes", n.Data)
+		}
+		_, err := w.WriteString("/>")
+		return err
+	}
+	if err := w.WriteByte('>'); err != nil {
+		return err
+	}
+
+	// Add initial newline where there is danger of a newline being ignored.
+	if c := n.FirstChild; c != nil && c.Type == TextNode && strings.HasPrefix(c.Data, "\n") {
+		switch n.Data {
+		case "pre", "listing", "textarea":
+			if err := w.WriteByte('\n'); err != nil {
+				return err
+			}
+		}
+	}
+
+	// Render any child nodes.
+	switch n.Data {
+	case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "xmp":
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			if c.Type == TextNode {
+				if _, err := w.WriteString(c.Data); err != nil {
+					return err
+				}
+			} else {
+				if err := render1(w, c); err != nil {
+					return err
+				}
+			}
+		}
+		if n.Data == "plaintext" {
+			// Don't render anything else. <plaintext> must be the
+			// last element in the file, with no closing tag.
+			return plaintextAbort
+		}
+	default:
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			if err := render1(w, c); err != nil {
+				return err
+			}
+		}
+	}
+
+	// Render the closing tag.
+	if _, err := w.WriteString("</"); err != nil {
+		return err
+	}
+	if _, err := w.WriteString(n.Data); err != nil {
+		return err
+	}
+	return w.WriteByte('>')
+}
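A small sketch of the void-element check in render1 above, using only Node and Render from this diff; the expected error string is taken directly from the fmt.Errorf call in render1:

package main

import (
	"fmt"
	"io/ioutil"

	"golang.org/x/net/html"
)

func main() {
	// A <br> may not have children; Render rejects this tree.
	br := &html.Node{Type: html.ElementNode, Data: "br"}
	br.AppendChild(&html.Node{Type: html.TextNode, Data: "oops"})
	err := html.Render(ioutil.Discard, br)
	fmt.Println(err) // html: void element <br> has child nodes
}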
+// writeQuoted writes s to w surrounded by quotes. Normally it will use double
+// quotes, but if s contains a double quote, it will use single quotes.
+// It is used for writing the identifiers in a doctype declaration.
+// In valid HTML, they can't contain both types of quotes.
+func writeQuoted(w writer, s string) error {
+	var q byte = '"'
+	if strings.Contains(s, `"`) {
+		q = '\''
+	}
+	if err := w.WriteByte(q); err != nil {
+		return err
+	}
+	if _, err := w.WriteString(s); err != nil {
+		return err
+	}
+	if err := w.WriteByte(q); err != nil {
+		return err
+	}
+	return nil
+}
+
+// Section 12.1.2, "Elements", gives this list of void elements. Void elements
+// are those that can't have any contents.
+var voidElements = map[string]bool{
+	"area":   true,
+	"base":   true,
+	"br":     true,
+	"col":    true,
+	"embed":  true,
+	"hr":     true,
+	"img":    true,
+	"input":  true,
+	"keygen": true, // "keygen" has been removed from the spec, but is kept here for backwards compatibility.
+	"link":   true,
+	"meta":   true,
+	"param":  true,
+	"source": true,
+	"track":  true,
+	"wbr":    true,
+}
diff --git a/vendor/golang.org/x/net/html/token.go b/vendor/golang.org/x/net/html/token.go
new file mode 100644
index 0000000..877709f
--- /dev/null
+++ b/vendor/golang.org/x/net/html/token.go
@@ -0,0 +1,1224 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"strconv"
+	"strings"
+
+	"golang.org/x/net/html/atom"
+)
+
+// A TokenType is the type of a Token.
+type TokenType uint32
+
+const (
+	// ErrorToken means that an error occurred during tokenization.
+	ErrorToken TokenType = iota
+	// TextToken means a text node.
+	TextToken
+	// A StartTagToken looks like <a>.
+	StartTagToken
+	// An EndTagToken looks like </a>.
+	EndTagToken
+	// A SelfClosingTagToken tag looks like <br/>.
+	SelfClosingTagToken
+	// A CommentToken looks like <!--x-->.
+	CommentToken
+	// A DoctypeToken looks like <!DOCTYPE x>
+	DoctypeToken
+)
+
+// ErrBufferExceeded means that the buffering limit was exceeded.
+var ErrBufferExceeded = errors.New("max buffer exceeded")
+
+// String returns a string representation of the TokenType.
+func (t TokenType) String() string {
+	switch t {
+	case ErrorToken:
+		return "Error"
+	case TextToken:
+		return "Text"
+	case StartTagToken:
+		return "StartTag"
+	case EndTagToken:
+		return "EndTag"
+	case SelfClosingTagToken:
+		return "SelfClosingTag"
+	case CommentToken:
+		return "Comment"
+	case DoctypeToken:
+		return "Doctype"
+	}
+	return "Invalid(" + strconv.Itoa(int(t)) + ")"
+}
+
+// An Attribute is an attribute namespace-key-value triple. Namespace is
+// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
+// does not contain escapable characters like '&', '<' or '>'), and Val is
+// unescaped (it looks like "a<b" rather than "a&lt;b").
+//
+// Namespace is only used by the parser, not the tokenizer.
+type Attribute struct {
+	Namespace, Key, Val string
+}
+
+// A Token consists of a TokenType and some Data (tag name for start and end
+// tags, content for text, comments and doctypes). A tag Token may also contain
+// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
+// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
+// zero if Data is not a known tag name.
+type Token struct {
+	Type     TokenType
+	DataAtom atom.Atom
+	Data     string
+	Attr     []Attribute
+}
+
+// tagString returns a string representation of a tag Token's Data and Attr.
+func (t Token) tagString() string {
+	if len(t.Attr) == 0 {
+		return t.Data
+	}
+	buf := bytes.NewBufferString(t.Data)
+	for _, a := range t.Attr {
+		buf.WriteByte(' ')
+		buf.WriteString(a.Key)
+		buf.WriteString(`="`)
+		escape(buf, a.Val)
+		buf.WriteByte('"')
+	}
+	return buf.String()
+}
+
+// String returns a string representation of the Token.
+func (t Token) String() string {
+	switch t.Type {
+	case ErrorToken:
+		return ""
+	case TextToken:
+		return EscapeString(t.Data)
+	case StartTagToken:
+		return "<" + t.tagString() + ">"
+	case EndTagToken:
+		return "</" + t.tagString() + ">"
+	case SelfClosingTagToken:
+		return "<" + t.tagString() + "/>"
+	case CommentToken:
+		return "<!--" + t.Data + "-->"
+	case DoctypeToken:
+		return "<!DOCTYPE " + t.Data + ">"
+	}
+	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
+}
+
+// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
+// the end is exclusive.
+type span struct {
+	start, end int
+}
+
+// A Tokenizer returns a stream of HTML Tokens.
+type Tokenizer struct {
+	// r is the source of the HTML text.
+	r io.Reader
+	// tt is the TokenType of the current token.
+	tt TokenType
+	// err is the first error encountered during tokenization. It is possible
+	// for tt != Error && err != nil to hold: this means that Next returned a
+	// valid token but the subsequent Next call will return an error token.
+	// For example, if the HTML text input was just "plain", then the first
+	// Next call would set z.err to io.EOF but return a TextToken, and all
+	// subsequent Next calls would return an ErrorToken.
+	// err is never reset. Once it becomes non-nil, it stays non-nil.
+	err error
+	// readErr is the error returned by the io.Reader r. It is separate from
+	// err because it is valid for an io.Reader to return (n int, err1 error)
+	// such that n > 0 && err1 != nil, and callers should always process the
+	// n > 0 bytes before considering the error err1.
+	readErr error
+	// buf[raw.start:raw.end] holds the raw bytes of the current token.
+	// buf[raw.end:] is buffered input that will yield future tokens.
+	raw span
+	buf []byte
+	// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
+	maxBuf int
+	// buf[data.start:data.end] holds the raw bytes of the current token's data:
+	// a text token's text, a tag token's tag name, etc.
+	data span
+	// pendingAttr is the attribute key and value currently being tokenized.
+	// When complete, pendingAttr is pushed onto attr. nAttrReturned is
+	// incremented on each call to TagAttr.
+	pendingAttr   [2]span
+	attr          [][2]span
+	nAttrReturned int
+	// rawTag is the "script" in "</script>" that closes the next token. If
+	// non-empty, the subsequent call to Next will return a raw or RCDATA text
+	// token: one that treats "<p>" as text instead of an element.
+	// rawTag's contents are lower-cased.
+	rawTag string
+	// textIsRaw is whether the current text token's data is not escaped.
+	textIsRaw bool
+	// convertNUL is whether NUL bytes in the current token's data should
+	// be converted into \ufffd replacement characters.
+	convertNUL bool
+	// allowCDATA is whether CDATA sections are allowed in the current context.
+	allowCDATA bool
+}
+
+// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as
+// the text "foo". The default value is false, which means to recognize it as
+// a bogus comment "<!-- [CDATA[foo]] -->" instead.
+//
+// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
+// only if tokenizing foreign content, such as MathML and SVG. However,
+// tracking foreign-contentness is difficult to do purely in the tokenizer,
+// as opposed to the parser, due to HTML integration points: an <svg> element
+// can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-
+// HTML. For strict compliance with the HTML5 tokenization algorithm, it is the
+// responsibility of the user of a tokenizer to call AllowCDATA as appropriate.
+// In practice, if using the tokenizer without caring whether MathML or SVG
+// CDATA is text or comments, such as tokenizing HTML to find all the anchor
+// text, it is acceptable to ignore this responsibility.
+func (z *Tokenizer) AllowCDATA(allowCDATA bool) {
+	z.allowCDATA = allowCDATA
+}
+
+// NextIsNotRawText instructs the tokenizer that the next token should not be
+// considered as 'raw text'. Some elements, such as script and title elements,
+// normally require the next token after the opening tag to be 'raw text' that
+// has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"
+// yields a start tag token for "<title>", a text token for "a<b>c</b>d", and
+// an end tag token for "</title>". There are no distinct start tag or end tag
+// tokens for the "<b>" and "</b>".
+//
+// This tokenizer implementation will generally look for raw text at the right
+// times. Strictly speaking, an HTML5 compliant tokenizer should not look for
+// raw text if in foreign content: <title> generally needs raw text, but a
+// <title> inside an <svg> does not. Another example is that a <textarea>
+// generally needs raw text, but a <textarea> is not allowed as an immediate
+// child of a <select>; in normal parsing, a <textarea> implies </select>, but
+// one cannot close the implicit element when parsing a <select>'s InnerHTML.
+// Similarly to AllowCDATA, tracking the correct moment to override raw-text-
+// ness is difficult to do purely in the tokenizer, as opposed to the parser.
+// For strict compliance with the HTML5 tokenization algorithm, it is the
+// responsibility of the user of a tokenizer to call NextIsNotRawText as
+// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this
+// responsibility for basic usage.
+//
+// Note that this 'raw text' concept is different from the one offered by the
+// Tokenizer.Raw method.
+func (z *Tokenizer) NextIsNotRawText() {
+	z.rawTag = ""
+}
+
+// Err returns the error associated with the most recent ErrorToken token.
+// This is typically io.EOF, meaning the end of tokenization.
+func (z *Tokenizer) Err() error {
+	if z.tt != ErrorToken {
+		return nil
+	}
+	return z.err
+}
+
+// readByte returns the next byte from the input stream, doing a buffered read
+// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a
+// contiguous byte slice that holds all the bytes read so far for the current
+// token. It sets z.err if the underlying reader returns an error.
+// Pre-condition: z.err == nil.
+func (z *Tokenizer) readByte() byte {
+	if z.raw.end >= len(z.buf) {
+		// Our buffer is exhausted and we have to read from z.r. Check if the
+		// previous read resulted in an error.
+		if z.readErr != nil {
+			z.err = z.readErr
+			return 0
+		}
+		// We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
+		// z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
+		// allocate a new buffer before the copy.
+		c := cap(z.buf)
+		d := z.raw.end - z.raw.start
+		var buf1 []byte
+		if 2*d > c {
+			buf1 = make([]byte, d, 2*c)
+		} else {
+			buf1 = z.buf[:d]
+		}
+		copy(buf1, z.buf[z.raw.start:z.raw.end])
+		if x := z.raw.start; x != 0 {
+			// Adjust the data/attr spans to refer to the same contents after the copy.
+			z.data.start -= x
+			z.data.end -= x
+			z.pendingAttr[0].start -= x
+			z.pendingAttr[0].end -= x
+			z.pendingAttr[1].start -= x
+			z.pendingAttr[1].end -= x
+			for i := range z.attr {
+				z.attr[i][0].start -= x
+				z.attr[i][0].end -= x
+				z.attr[i][1].start -= x
+				z.attr[i][1].end -= x
+			}
+		}
+		z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
+		// Now that we have copied the live bytes to the start of the buffer,
+		// we read from z.r into the remainder.
+		var n int
+		n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)])
+		if n == 0 {
+			z.err = z.readErr
+			return 0
+		}
+		z.buf = buf1[:d+n]
+	}
+	x := z.buf[z.raw.end]
+	z.raw.end++
+	if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf {
+		z.err = ErrBufferExceeded
+		return 0
+	}
+	return x
+}
+
+// Buffered returns a slice containing data buffered but not yet tokenized.
+func (z *Tokenizer) Buffered() []byte {
+	return z.buf[z.raw.end:]
+}
+
+// readAtLeastOneByte wraps an io.Reader so that reading cannot return (0, nil).
+// It returns io.ErrNoProgress if the underlying r.Read method returns (0, nil)
+// too many times in succession.
+func readAtLeastOneByte(r io.Reader, b []byte) (int, error) {
+	for i := 0; i < 100; i++ {
+		if n, err := r.Read(b); n != 0 || err != nil {
+			return n, err
+		}
+	}
+	return 0, io.ErrNoProgress
+}
+
+// skipWhiteSpace skips past any white space.
" as text instead of an element. + // rawTag's contents are lower-cased. + rawTag string + // textIsRaw is whether the current text token's data is not escaped. + textIsRaw bool + // convertNUL is whether NUL bytes in the current token's data should + // be converted into \ufffd replacement characters. + convertNUL bool + // allowCDATA is whether CDATA sections are allowed in the current context. + allowCDATA bool +} + +// AllowCDATA sets whether or not the tokenizer recognizes as +// the text "foo". The default value is false, which means to recognize it as +// a bogus comment "" instead. +// +// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and +// only if tokenizing foreign content, such as MathML and SVG. However, +// tracking foreign-contentness is difficult to do purely in the tokenizer, +// as opposed to the parser, due to HTML integration points: an element +// can contain a that is foreign-to-SVG but not foreign-to- +// HTML. For strict compliance with the HTML5 tokenization algorithm, it is the +// responsibility of the user of a tokenizer to call AllowCDATA as appropriate. +// In practice, if using the tokenizer without caring whether MathML or SVG +// CDATA is text or comments, such as tokenizing HTML to find all the anchor +// text, it is acceptable to ignore this responsibility. +func (z *Tokenizer) AllowCDATA(allowCDATA bool) { + z.allowCDATA = allowCDATA +} + +// NextIsNotRawText instructs the tokenizer that the next token should not be +// considered as 'raw text'. Some elements, such as script and title elements, +// normally require the next token after the opening tag to be 'raw text' that +// has no child elements. For example, tokenizing "acd" +// yields a start tag token for "", a text token for "acd", and +// an end tag token for "". There are no distinct start tag or end tag +// tokens for the "" and "". +// +// This tokenizer implementation will generally look for raw text at the right +// times. Strictly speaking, an HTML5 compliant tokenizer should not look for +// raw text if in foreign content: generally needs raw text, but a +// inside an does not. Another example is that a +// generally needs raw text, but a is not allowed as an immediate +// child of a ; in normal parsing, a implies , but +// one cannot close the implicit element when parsing a 's InnerHTML. +// Similarly to AllowCDATA, tracking the correct moment to override raw-text- +// ness is difficult to do purely in the tokenizer, as opposed to the parser. +// For strict compliance with the HTML5 tokenization algorithm, it is the +// responsibility of the user of a tokenizer to call NextIsNotRawText as +// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this +// responsibility for basic usage. +// +// Note that this 'raw text' concept is different from the one offered by the +// Tokenizer.Raw method. +func (z *Tokenizer) NextIsNotRawText() { + z.rawTag = "" +} + +// Err returns the error associated with the most recent ErrorToken token. +// This is typically io.EOF, meaning the end of tokenization. +func (z *Tokenizer) Err() error { + if z.tt != ErrorToken { + return nil + } + return z.err +} + +// readByte returns the next byte from the input stream, doing a buffered read +// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte +// slice that holds all the bytes read so far for the current token. +// It sets z.err if the underlying reader returns an error. +// Pre-condition: z.err == nil. 
+func (z *Tokenizer) readByte() byte { + if z.raw.end >= len(z.buf) { + // Our buffer is exhausted and we have to read from z.r. Check if the + // previous read resulted in an error. + if z.readErr != nil { + z.err = z.readErr + return 0 + } + // We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length + // z.raw.end - z.raw.start is more than half the capacity of z.buf, then we + // allocate a new buffer before the copy. + c := cap(z.buf) + d := z.raw.end - z.raw.start + var buf1 []byte + if 2*d > c { + buf1 = make([]byte, d, 2*c) + } else { + buf1 = z.buf[:d] + } + copy(buf1, z.buf[z.raw.start:z.raw.end]) + if x := z.raw.start; x != 0 { + // Adjust the data/attr spans to refer to the same contents after the copy. + z.data.start -= x + z.data.end -= x + z.pendingAttr[0].start -= x + z.pendingAttr[0].end -= x + z.pendingAttr[1].start -= x + z.pendingAttr[1].end -= x + for i := range z.attr { + z.attr[i][0].start -= x + z.attr[i][0].end -= x + z.attr[i][1].start -= x + z.attr[i][1].end -= x + } + } + z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d] + // Now that we have copied the live bytes to the start of the buffer, + // we read from z.r into the remainder. + var n int + n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)]) + if n == 0 { + z.err = z.readErr + return 0 + } + z.buf = buf1[:d+n] + } + x := z.buf[z.raw.end] + z.raw.end++ + if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf { + z.err = ErrBufferExceeded + return 0 + } + return x +} + +// Buffered returns a slice containing data buffered but not yet tokenized. +func (z *Tokenizer) Buffered() []byte { + return z.buf[z.raw.end:] +} + +// readAtLeastOneByte wraps an io.Reader so that reading cannot return (0, nil). +// It returns io.ErrNoProgress if the underlying r.Read method returns (0, nil) +// too many times in succession. +func readAtLeastOneByte(r io.Reader, b []byte) (int, error) { + for i := 0; i < 100; i++ { + if n, err := r.Read(b); n != 0 || err != nil { + return n, err + } + } + return 0, io.ErrNoProgress +} + +// skipWhiteSpace skips past any white space. +func (z *Tokenizer) skipWhiteSpace() { + if z.err != nil { + return + } + for { + c := z.readByte() + if z.err != nil { + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f': + // No-op. + default: + z.raw.end-- + return + } + } +} + +// readRawOrRCDATA reads until the next "", where "foo" is z.rawTag and +// is typically something like "script" or "textarea". +func (z *Tokenizer) readRawOrRCDATA() { + if z.rawTag == "script" { + z.readScript() + z.textIsRaw = true + z.rawTag = "" + return + } +loop: + for { + c := z.readByte() + if z.err != nil { + break loop + } + if c != '<' { + continue loop + } + c = z.readByte() + if z.err != nil { + break loop + } + if c != '/' { + z.raw.end-- + continue loop + } + if z.readRawEndTag() || z.err != nil { + break loop + } + } + z.data.end = z.raw.end + // A textarea's or title's RCDATA can contain escaped entities. + z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title" + z.rawTag = "" +} + +// readRawEndTag attempts to read a tag like "", where "foo" is z.rawTag. +// If it succeeds, it backs up the input position to reconsume the tag and +// returns true. Otherwise it returns false. The opening "" has already been +// consumed. 
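The raw-vs-RCDATA distinction above is easy to see from the outside. A small sketch (illustrative only, not part of the vendored diff): entities are unescaped in textarea and title RCDATA, but left alone in raw text such as style:

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	for _, input := range []string{
		"<textarea>a&amp;b</textarea>", // RCDATA: Text() unescapes to "a&b"
		"<style>a&amp;b</style>",       // raw text: Text() stays "a&amp;b"
	} {
		z := html.NewTokenizer(strings.NewReader(input))
		z.Next() // consume the start tag
		if z.Next() == html.TextToken {
			fmt.Printf("%q\n", z.Text())
		}
	}
}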
+// readRawEndTag attempts to read a tag like "</foo>", where "foo" is z.rawTag.
+// If it succeeds, it backs up the input position to reconsume the tag and
+// returns true. Otherwise it returns false. The opening "</" has already been
+// consumed.
+func (z *Tokenizer) readRawEndTag() bool {
+	for i := 0; i < len(z.rawTag); i++ {
+		c := z.readByte()
+		if z.err != nil {
+			return false
+		}
+		if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
+			z.raw.end--
+			return false
+		}
+	}
+	c := z.readByte()
+	if z.err != nil {
+		return false
+	}
+	switch c {
+	case ' ', '\n', '\r', '\t', '\f', '/', '>':
+		// The 3 is 2 for the leading "</" plus 1 for the trailing character c.
+		z.raw.end -= 3 + len(z.rawTag)
+		return true
+	}
+	z.raw.end--
+	return false
+}
+
+// readScript reads until the next </script> tag, following the byzantine
+// rules for escaping/hiding the closing tag.
+func (z *Tokenizer) readScript() {
+	defer func() {
+		z.data.end = z.raw.end
+	}()
+	var c byte
+
+scriptData:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	if c == '<' {
+		goto scriptDataLessThanSign
+	}
+	goto scriptData
+
+scriptDataLessThanSign:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	switch c {
+	case '/':
+		goto scriptDataEndTagOpen
+	case '!':
+		goto scriptDataEscapeStart
+	}
+	z.raw.end--
+	goto scriptData
+
+scriptDataEndTagOpen:
+	if z.readRawEndTag() || z.err != nil {
+		return
+	}
+	goto scriptData
+
+scriptDataEscapeStart:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	if c == '-' {
+		goto scriptDataEscapeStartDash
+	}
+	z.raw.end--
+	goto scriptData
+
+scriptDataEscapeStartDash:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	if c == '-' {
+		goto scriptDataEscapedDashDash
+	}
+	z.raw.end--
+	goto scriptData
+
+scriptDataEscaped:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	switch c {
+	case '-':
+		goto scriptDataEscapedDash
+	case '<':
+		goto scriptDataEscapedLessThanSign
+	}
+	goto scriptDataEscaped
+
+scriptDataEscapedDash:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	switch c {
+	case '-':
+		goto scriptDataEscapedDashDash
+	case '<':
+		goto scriptDataEscapedLessThanSign
+	}
+	goto scriptDataEscaped
+
+scriptDataEscapedDashDash:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	switch c {
+	case '-':
+		goto scriptDataEscapedDashDash
+	case '<':
+		goto scriptDataEscapedLessThanSign
+	case '>':
+		goto scriptData
+	}
+	goto scriptDataEscaped
+
+scriptDataEscapedLessThanSign:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	if c == '/' {
+		goto scriptDataEscapedEndTagOpen
+	}
+	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+		goto scriptDataDoubleEscapeStart
+	}
+	z.raw.end--
+	goto scriptData
+
+scriptDataEscapedEndTagOpen:
+	if z.readRawEndTag() || z.err != nil {
+		return
+	}
+	goto scriptDataEscaped
+
+scriptDataDoubleEscapeStart:
+	z.raw.end--
+	for i := 0; i < len("script"); i++ {
+		c = z.readByte()
+		if z.err != nil {
+			return
+		}
+		if c != "script"[i] && c != "SCRIPT"[i] {
+			z.raw.end--
+			goto scriptDataEscaped
+		}
+	}
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	switch c {
+	case ' ', '\n', '\r', '\t', '\f', '/', '>':
+		goto scriptDataDoubleEscaped
+	}
+	z.raw.end--
+	goto scriptDataEscaped
+
+scriptDataDoubleEscaped:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	switch c {
+	case '-':
+		goto scriptDataDoubleEscapedDash
+	case '<':
+		goto scriptDataDoubleEscapedLessThanSign
+	}
+	goto scriptDataDoubleEscaped
+
+scriptDataDoubleEscapedDash:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	switch c {
+	case '-':
+		goto scriptDataDoubleEscapedDashDash
+	case '<':
+		goto scriptDataDoubleEscapedLessThanSign
+	}
+	goto scriptDataDoubleEscaped
+
+scriptDataDoubleEscapedDashDash:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	switch c {
+	case '-':
+		goto scriptDataDoubleEscapedDashDash
+	case '<':
+		goto scriptDataDoubleEscapedLessThanSign
+	case '>':
+		goto scriptData
+	}
+	goto scriptDataDoubleEscaped
+
+scriptDataDoubleEscapedLessThanSign:
+	c = z.readByte()
+	if z.err != nil {
+		return
+	}
+	if c == '/' {
+		goto scriptDataDoubleEscapeEnd
+	}
+	z.raw.end--
+	goto scriptDataDoubleEscaped
+
+scriptDataDoubleEscapeEnd:
+	if z.readRawEndTag() {
+		z.raw.end += len("</script>")
+		goto scriptDataEscaped
+	}
+	if z.err != nil {
+		return
+	}
+	goto scriptDataDoubleEscaped
+}
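The state machine above exists so that things like comparison operators and string literals inside a script do not end the element early. A minimal illustration (not part of the vendored diff) of the effect from the caller's side:

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	z := html.NewTokenizer(strings.NewReader("<script>if (a < b) { f() }</script>"))
	for tt := z.Next(); tt != html.ErrorToken; tt = z.Next() {
		fmt.Printf("%v %q\n", tt, z.Raw())
	}
	// Yields: StartTag "<script>", Text "if (a < b) { f() }", EndTag "</script>".
	// The "< b" does not start a tag because script data is raw text.
}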
+// readComment reads the next comment token starting with "<!--". The opening
+// "<!--" has already been consumed.
+func (z *Tokenizer) readComment() {
+	z.data.start = z.raw.end
+	defer func() {
+		if z.data.end < z.data.start {
+			// It's a comment with no data, like <!-->.
+			z.data.end = z.data.start
+		}
+	}()
+	for dashCount := 2; ; {
+		c := z.readByte()
+		if z.err != nil {
+			// Ignore up to two dashes at EOF.
+			if dashCount > 2 {
+				dashCount = 2
+			}
+			z.data.end = z.raw.end - dashCount
+			return
+		}
+		switch c {
+		case '-':
+			dashCount++
+			continue
+		case '>':
+			if dashCount >= 2 {
+				z.data.end = z.raw.end - len("-->")
+				return
+			}
+		case '!':
+			if dashCount >= 2 {
+				c = z.readByte()
+				if z.err != nil {
+					z.data.end = z.raw.end
+					return
+				}
+				if c == '>' {
+					z.data.end = z.raw.end - len("--!>")
+					return
+				}
+			}
+		}
+		dashCount = 0
+	}
+}
+
+// readUntilCloseAngle reads until the next ">".
+func (z *Tokenizer) readUntilCloseAngle() {
+	z.data.start = z.raw.end
+	for {
+		c := z.readByte()
+		if z.err != nil {
+			z.data.end = z.raw.end
+			return
+		}
+		if c == '>' {
+			z.data.end = z.raw.end - len(">")
+			return
+		}
+	}
+}
+
+// readMarkupDeclaration reads the next token starting with "<!". It might be
+// a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
+// "<!a bogus comment". The opening "<!" has already been consumed.
+func (z *Tokenizer) readMarkupDeclaration() TokenType {
+	z.data.start = z.raw.end
+	var c [2]byte
+	for i := 0; i < 2; i++ {
+		c[i] = z.readByte()
+		if z.err != nil {
+			z.data.end = z.raw.end
+			return CommentToken
+		}
+	}
+	if c[0] == '-' && c[1] == '-' {
+		z.readComment()
+		return CommentToken
+	}
+	z.raw.end -= 2
+	if z.readDoctype() {
+		return DoctypeToken
+	}
+	if z.allowCDATA && z.readCDATA() {
+		z.convertNUL = true
+		return TextToken
+	}
+	// It's a bogus comment.
+	z.readUntilCloseAngle()
+	return CommentToken
+}
+
+// readDoctype attempts to read a doctype declaration and returns true if
+// successful. The opening "<!" has already been consumed.
+func (z *Tokenizer) readDoctype() bool {
+	z.data.start = z.raw.end
+	const s = "DOCTYPE"
+	for i := 0; i < len(s); i++ {
+		c := z.readByte()
+		if z.err != nil {
+			z.data.end = z.raw.end
+			return false
+		}
+		if c != s[i] && c != s[i]+('a'-'A') {
+			// Back up to read the fragment of "DOCTYPE" again.
+			z.raw.end = z.data.start
+			return false
+		}
+	}
+	if z.skipWhiteSpace(); z.err != nil {
+		z.data.start = z.raw.end
+		z.data.end = z.raw.end
+		return true
+	}
+	z.readUntilCloseAngle()
+	return true
+}
+
+// readCDATA attempts to read a CDATA section and returns true if
+// successful. The opening "<!" has already been consumed.
+func (z *Tokenizer) readCDATA() bool {
+	z.data.start = z.raw.end
+	const s = "[CDATA["
+	for i := 0; i < len(s); i++ {
+		c := z.readByte()
+		if z.err != nil {
+			z.data.end = z.raw.end
+			return false
+		}
+		if c != s[i] {
+			z.raw.end = z.data.start
+			return false
+		}
+	}
+	z.data.start = z.raw.end
+	brackets := 0
+	for {
+		c := z.readByte()
+		if z.err != nil {
+			z.data.end = z.raw.end
+			return true
+		}
+		switch c {
+		case ']':
+			brackets++
+		case '>':
+			if brackets >= 2 {
+				z.data.end = z.raw.end - len("]]>")
+				return true
+			}
+			brackets = 0
+		default:
+			brackets = 0
+		}
+	}
+}
+
+// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
+// case-insensitively matches any element of ss.
+func (z *Tokenizer) startTagIn(ss ...string) bool {
+loop:
+	for _, s := range ss {
+		if z.data.end-z.data.start != len(s) {
+			continue loop
+		}
+		for i := 0; i < len(s); i++ {
+			c := z.buf[z.data.start+i]
+			if 'A' <= c && c <= 'Z' {
+				c += 'a' - 'A'
+			}
+			if c != s[i] {
+				continue loop
+			}
+		}
+		return true
+	}
+	return false
+}
+
+// readStartTag reads the next start tag token. The opening "<a" has already
+// been consumed, where 'a' means anything in [A-Za-z].
+func (z *Tokenizer) readStartTag() TokenType {
+	z.readTag(true)
+	if z.err != nil {
+		return ErrorToken
+	}
+	// Several tags flag the tokenizer's next token as raw.
+	c, raw := z.buf[z.data.start], false
+	if 'A' <= c && c <= 'Z' {
+		c += 'a' - 'A'
+	}
+	switch c {
+	case 'i':
+		raw = z.startTagIn("iframe")
+	case 'n':
+		raw = z.startTagIn("noembed", "noframes", "noscript")
+	case 'p':
+		raw = z.startTagIn("plaintext")
+	case 's':
+		raw = z.startTagIn("script", "style")
+	case 't':
+		raw = z.startTagIn("textarea", "title")
+	case 'x':
+		raw = z.startTagIn("xmp")
+	}
+	if raw {
+		z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))
+	}
+	// Look for a self-closing token like "<br/>".
+	if z.err == nil && z.buf[z.raw.end-2] == '/' {
+		return SelfClosingTagToken
+	}
+	return StartTagToken
+}
+
+// readTag reads the next tag token and its attributes. If saveAttr, those
+// attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
+// The opening "<a" or "</a" has already been consumed, where 'a' means anything
+// in [A-Za-z].
+func (z *Tokenizer) readTag(saveAttr bool) {
+	z.attr = z.attr[:0]
+	z.nAttrReturned = 0
+	// Read the tag name and attribute key/value pairs.
+	z.readTagName()
+	if z.skipWhiteSpace(); z.err != nil {
+		return
+	}
+	for {
+		c := z.readByte()
+		if z.err != nil || c == '>' {
+			break
+		}
+		z.raw.end--
+		z.readTagAttrKey()
+		z.readTagAttrVal()
+		// Save pendingAttr if saveAttr and that attribute has a non-empty key.
+		if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end {
+			z.attr = append(z.attr, z.pendingAttr)
+		}
+		if z.skipWhiteSpace(); z.err != nil {
+			break
+		}
+	}
+}
+
+// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
+// is positioned such that the first byte of the tag name (the "d" in "<div")
+// has already been consumed.
+func (z *Tokenizer) readTagName() {
+	z.data.start = z.raw.end - 1
+	for {
+		c := z.readByte()
+		if z.err != nil {
+			z.data.end = z.raw.end
+			return
+		}
+		switch c {
+		case ' ', '\n', '\r', '\t', '\f':
+			z.data.end = z.raw.end - 1
+			return
+		case '/', '>':
+			z.raw.end--
+			z.data.end = z.raw.end
+			return
+		}
+	}
+}
+
+// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
+// Precondition: z.err == nil.
+func (z *Tokenizer) readTagAttrKey() {
+	z.pendingAttr[0].start = z.raw.end
+	for {
+		c := z.readByte()
+		if z.err != nil {
+			z.pendingAttr[0].end = z.raw.end
+			return
+		}
+		switch c {
+		case ' ', '\n', '\r', '\t', '\f', '/':
+			z.pendingAttr[0].end = z.raw.end - 1
+			return
+		case '=', '>':
+			z.raw.end--
+			z.pendingAttr[0].end = z.raw.end
+			return
+		}
+	}
+}
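A small sketch (illustrative only, not part of the vendored diff) of what the tag and attribute readers above produce: double-quoted, single-quoted and unquoted attribute values all yield the same result.

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	z := html.NewTokenizer(strings.NewReader(`<div a=1 b='2' c="3">`))
	if z.Next() == html.StartTagToken {
		t := z.Token()
		for _, attr := range t.Attr {
			fmt.Printf("%s=%q\n", attr.Key, attr.Val) // a="1", b="2", c="3"
		}
	}
}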
+// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
+func (z *Tokenizer) readTagAttrVal() {
+	z.pendingAttr[1].start = z.raw.end
+	z.pendingAttr[1].end = z.raw.end
+	if z.skipWhiteSpace(); z.err != nil {
+		return
+	}
+	c := z.readByte()
+	if z.err != nil {
+		return
+	}
+	if c != '=' {
+		z.raw.end--
+		return
+	}
+	if z.skipWhiteSpace(); z.err != nil {
+		return
+	}
+	quote := z.readByte()
+	if z.err != nil {
+		return
+	}
+	switch quote {
+	case '>':
+		z.raw.end--
+		return
+
+	case '\'', '"':
+		z.pendingAttr[1].start = z.raw.end
+		for {
+			c := z.readByte()
+			if z.err != nil {
+				z.pendingAttr[1].end = z.raw.end
+				return
+			}
+			if c == quote {
+				z.pendingAttr[1].end = z.raw.end - 1
+				return
+			}
+		}
+
+	default:
+		z.pendingAttr[1].start = z.raw.end - 1
+		for {
+			c := z.readByte()
+			if z.err != nil {
+				z.pendingAttr[1].end = z.raw.end
+				return
+			}
+			switch c {
+			case ' ', '\n', '\r', '\t', '\f':
+				z.pendingAttr[1].end = z.raw.end - 1
+				return
+			case '>':
+				z.raw.end--
+				z.pendingAttr[1].end = z.raw.end
+				return
+			}
+		}
+	}
+}
+
+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
+	z.raw.start = z.raw.end
+	z.data.start = z.raw.end
+	z.data.end = z.raw.end
+	if z.err != nil {
+		z.tt = ErrorToken
+		return z.tt
+	}
+	if z.rawTag != "" {
+		if z.rawTag == "plaintext" {
+			// Read everything up to EOF.
+			for z.err == nil {
+				z.readByte()
+			}
+			z.data.end = z.raw.end
+			z.textIsRaw = true
+		} else {
+			z.readRawOrRCDATA()
+		}
+		if z.data.end > z.data.start {
+			z.tt = TextToken
+			z.convertNUL = true
+			return z.tt
+		}
+	}
+	z.textIsRaw = false
+	z.convertNUL = false
+
+loop:
+	for {
+		c := z.readByte()
+		if z.err != nil {
+			break loop
+		}
+		if c != '<' {
+			continue loop
+		}
+
+		// Check if the '<' we have just read is part of a tag, comment
+		// or doctype. If not, it's part of the accumulated text token.
+		c = z.readByte()
+		if z.err != nil {
+			break loop
+		}
+		var tokenType TokenType
+		switch {
+		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
+			tokenType = StartTagToken
+		case c == '/':
+			tokenType = EndTagToken
+		case c == '!' || c == '?':
+			// We use CommentToken to mean any of "<!--actual comments-->",
+			// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
+			tokenType = CommentToken
+		default:
+			// Reconsume the current character.
+			z.raw.end--
+			continue
+		}
+
+		// We have a non-text token, but we might have accumulated some text
+		// before that. If so, we return the text first, and return the non-
+		// text token on the subsequent call to Next.
+		if x := z.raw.end - len("<a"); z.raw.start < x {
+			z.raw.end = x
+			z.data.end = x
+			z.tt = TextToken
+			return z.tt
+		}
+		switch tokenType {
+		case StartTagToken:
+			z.tt = z.readStartTag()
+			return z.tt
+		case EndTagToken:
+			c = z.readByte()
+			if z.err != nil {
+				break loop
+			}
+			if c == '>' {
+				// "</>" does not generate a token at all. Generate an empty comment
+				// to allow passthrough clients to pick up the data using Raw.
+				// Reset the tokenizer state and start again.
+				z.tt = CommentToken
+				return z.tt
+			}
+			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+				z.readTag(false)
+				if z.err != nil {
+					z.tt = ErrorToken
+				} else {
+					z.tt = EndTagToken
+				}
+				return z.tt
+			}
+			z.raw.end--
+			z.readUntilCloseAngle()
+			z.tt = CommentToken
+			return z.tt
+		case CommentToken:
+			if c == '!' {
+				z.tt = z.readMarkupDeclaration()
+				return z.tt
+			}
+			z.raw.end--
+			z.readUntilCloseAngle()
+			z.tt = CommentToken
+			return z.tt
+		}
+	}
+	if z.raw.start < z.raw.end {
+		z.data.end = z.raw.end
+		z.tt = TextToken
+		return z.tt
+	}
+	z.tt = ErrorToken
+	return z.tt
+}
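The usual consumption pattern for Next, sketched for illustration (not part of the vendored diff; the io.EOF check is the minimum error handling a caller needs):

package main

import (
	"fmt"
	"io"
	"log"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	z := html.NewTokenizer(strings.NewReader("<p>Hello<br/></p>"))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			if z.Err() == io.EOF {
				return // normal end of input
			}
			log.Fatal(z.Err())
		}
		fmt.Printf("%-14v %q\n", tt, z.Raw())
	}
}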
+// Raw returns the unmodified text of the current token. Calling Next, Token,
+// Text, TagName or TagAttr may change the contents of the returned slice.
+//
+// The token stream's raw bytes partition the byte stream (up until an
+// ErrorToken). There are no overlaps or gaps between two consecutive token's
+// raw bytes. One implication is that the byte offset of the current token is
+// the sum of the lengths of all previous tokens' raw bytes.
+func (z *Tokenizer) Raw() []byte {
+	return z.buf[z.raw.start:z.raw.end]
+}
+
+// convertNewlines converts "\r" and "\r\n" in s to "\n".
+// The conversion happens in place, but the resulting slice may be shorter.
+func convertNewlines(s []byte) []byte {
+	for i, c := range s {
+		if c != '\r' {
+			continue
+		}
+
+		src := i + 1
+		if src >= len(s) || s[src] != '\n' {
+			s[i] = '\n'
+			continue
+		}
+
+		dst := i
+		for src < len(s) {
+			if s[src] == '\r' {
+				if src+1 < len(s) && s[src+1] == '\n' {
+					src++
+				}
+				s[dst] = '\n'
+			} else {
+				s[dst] = s[src]
+			}
+			src++
+			dst++
+		}
+		return s[:dst]
+	}
+	return s
+}
+
+var (
+	nul         = []byte("\x00")
+	replacement = []byte("\ufffd")
+)
+
+// Text returns the unescaped text of a text, comment or doctype token. The
+// contents of the returned slice may change on the next call to Next.
+func (z *Tokenizer) Text() []byte {
+	switch z.tt {
+	case TextToken, CommentToken, DoctypeToken:
+		s := z.buf[z.data.start:z.data.end]
+		z.data.start = z.raw.end
+		z.data.end = z.raw.end
+		s = convertNewlines(s)
+		if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) {
+			s = bytes.Replace(s, nul, replacement, -1)
+		}
+		if !z.textIsRaw {
+			s = unescape(s, false)
+		}
+		return s
+	}
+	return nil
+}
+
+// TagName returns the lower-cased name of a tag token (the `img` out of
+// `<IMG SRC="foo">`) and whether the tag has attributes.
+// The contents of the returned slice may change on the next call to Next.
+func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
+	if z.data.start < z.data.end {
+		switch z.tt {
+		case StartTagToken, EndTagToken, SelfClosingTagToken:
+			s := z.buf[z.data.start:z.data.end]
+			z.data.start = z.raw.end
+			z.data.end = z.raw.end
+			return lower(s), z.nAttrReturned < len(z.attr)
+		}
+	}
+	return nil, false
+}
+
+// TagAttr returns the lower-cased key and unescaped value of the next unparsed
+// attribute for the current tag token and whether there are more attributes.
+// The contents of the returned slices may change on the next call to Next.
+func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
+	if z.nAttrReturned < len(z.attr) {
+		switch z.tt {
+		case StartTagToken, SelfClosingTagToken:
+			x := z.attr[z.nAttrReturned]
+			z.nAttrReturned++
+			key = z.buf[x[0].start:x[0].end]
+			val = z.buf[x[1].start:x[1].end]
+			return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr)
+		}
+	}
+	return nil, nil, false
+}
+
+// Token returns the current Token. The result's Data and Attr values remain
+// valid after subsequent Next calls.
+func (z *Tokenizer) Token() Token {
+	t := Token{Type: z.tt}
+	switch z.tt {
+	case TextToken, CommentToken, DoctypeToken:
+		t.Data = string(z.Text())
+	case StartTagToken, SelfClosingTagToken, EndTagToken:
+		name, moreAttr := z.TagName()
+		for moreAttr {
+			var key, val []byte
+			key, val, moreAttr = z.TagAttr()
+			t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
+		}
+		if a := atom.Lookup(name); a != 0 {
+			t.DataAtom, t.Data = a, a.String()
+		} else {
+			t.DataAtom, t.Data = 0, string(name)
+		}
+	}
+	return t
+}
+
+// SetMaxBuf sets a limit on the amount of data buffered during tokenization.
+// A value of 0 means unlimited.
+func (z *Tokenizer) SetMaxBuf(n int) {
+	z.maxBuf = n
+}
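Putting TagName and TagAttr together, a minimal anchor-scanning sketch (illustrative only, not part of the vendored diff):

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	const page = `<p><a href="/one">one</a> <a href="/two">two</a></p>`
	z := html.NewTokenizer(strings.NewReader(page))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			return
		}
		if tt != html.StartTagToken {
			continue
		}
		name, hasAttr := z.TagName()
		if string(name) != "a" {
			continue
		}
		for hasAttr {
			var key, val []byte
			key, val, hasAttr = z.TagAttr()
			if string(key) == "href" {
				fmt.Println(string(val)) // "/one", then "/two"
			}
		}
	}
}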
+// NewTokenizer returns a new HTML Tokenizer for the given Reader.
+// The input is assumed to be UTF-8 encoded.
+func NewTokenizer(r io.Reader) *Tokenizer {
+	return NewTokenizerFragment(r, "")
+}
+
+// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for
+// tokenizing an existing element's InnerHTML fragment. contextTag is that
+// element's tag, such as "div" or "iframe".
+//
+// For example, how the InnerHTML "a<b" is tokenized depends on whether it is
+// for a <p> tag or a <script> tag.
+//
+// The input is assumed to be UTF-8 encoded.
+func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer {
+	z := &Tokenizer{
+		r:   r,
+		buf: make([]byte, 0, 4096),
+	}
+	if contextTag != "" {
+		switch s := strings.ToLower(contextTag); s {
+		case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
+			z.rawTag = s
+		}
+	}
+	return z
+}
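The "a<b" example from the doc comment above can be observed directly. A sketch (illustrative only, not part of the vendored diff):

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	for _, ctx := range []string{"div", "script"} {
		z := html.NewTokenizerFragment(strings.NewReader("a<b"), ctx)
		tt := z.Next()
		// In a div context the "a" is a text token and "<b" begins an
		// (unterminated) tag; in a script context the whole input is one
		// raw text token "a<b".
		fmt.Printf("context=%s: %v %q\n", ctx, tt, z.Raw())
	}
}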