Skip to content

Commit c7bb088

Browse files
committed
Support attributes on any HTML tag
1 parent 6b048f3 commit c7bb088

File tree

2 files changed

+16
-5
lines changed

2 files changed

+16
-5
lines changed

html2text.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ const (
1515
)
1616

1717
var legacyLBR = WIN_LBR
18-
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
18+
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)$`)
1919
var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)
2020
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
2121
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
@@ -99,6 +99,15 @@ func parseHTMLEntity(entName string) (string, bool) {
9999
return "", false
100100
}
101101

102+
// firstWord returns the prefix of s that precedes the first ASCII whitespace
// character (space, tab, line feed, carriage return or form feed). If s
// contains none of these, s is returned unchanged; an empty s yields "".
//
// It is used to isolate the tag name from the raw contents of an HTML tag.
// HTML permits any ASCII whitespace, not only a plain space, between the tag
// name and its attributes, so a tag such as "li\nclass=\"x\"" must still
// yield "li".
func firstWord(s string) string {
	if i := strings.IndexAny(s, " \t\n\r\f"); i >= 0 {
		return s[:i]
	}
	return s
}
110+
102111
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
103112
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
104113
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
@@ -257,11 +266,12 @@ func HTML2TextWithOptions(html string, reqOpts ...Option) string {
257266
case r == '>': // end of a tag
258267
shouldOutput = true
259268
tag := html[tagStart:i]
260-
tagNameLowercase := strings.ToLower(tag)
269+
tagContentLowercase := strings.ToLower(tag)
270+
tagNameLowercase := firstWord(tagContentLowercase)
261271

262272
if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
263273
outBuf.WriteString(opts.lbr)
264-
} else if tagNameLowercase == "li" || tagNameLowercase == "li/" || strings.HasPrefix(tagNameLowercase, "li ") {
274+
} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
265275
if opts.listPrefix != "" {
266276
outBuf.WriteString(opts.lbr + opts.listPrefix)
267277
} else {
@@ -289,7 +299,7 @@ func HTML2TextWithOptions(html string, reqOpts ...Option) string {
289299
outBuf.WriteString(">")
290300
hrefs = hrefs[1:]
291301
}
292-
} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
302+
} else if opts.linksInnerText && linkTagRE.MatchString(tagContentLowercase) {
293303
// parse link href
294304
// add special handling for a tags
295305
m := linkTagRE.FindStringSubmatch(tag)

html2text_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,10 @@ func TestHTML2Text(t *testing.T) {
127127
So(HTML2Text(`list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>`), ShouldEqual, "list of items\r\nOne\r\nTwo\r\nThree\r\n")
128128
})
129129

130-
Convey("List with classes", func() {
130+
Convey("Tags with attributes", func() {
131131
So(HTML2Text(`list of items<ul><li class="menu-item">One</li><li class="menu-item">Two</li><li class="menu-item">Three</li></ul>`), ShouldEqual, "list of items\r\nOne\r\nTwo\r\nThree\r\n")
132132
So(HTML2Text(`list of items<ol><li class="menu-item">One</li><li class="menu-item">Two</li><li class="menu-item">Three</li></ol>`), ShouldEqual, "list of items\r\nOne\r\nTwo\r\nThree\r\n")
133+
So(HTML2Text(`<p class="content">content</p><div id="status">is ok</div>`), ShouldEqual, "content\r\n\r\nis ok")
133134
})
134135

135136
Convey("Optional list support", func() {

0 commit comments

Comments
 (0)