Skip to content

Commit 931105c

Browse files
author
Mario Hros
committed
add functional options for altering behavior
1 parent 35493e6 commit 931105c

File tree

2 files changed

+104
-22
lines changed

2 files changed

+104
-22
lines changed

html2text.go

Lines changed: 77 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,50 @@ import (
77
"strings"
88
)
99

10+
// Line break constants
11+
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
1012
const (
1113
WIN_LBR = "\r\n"
1214
UNIX_LBR = "\n"
1315
)
1416

15-
var lbr = WIN_LBR
16-
var badTagnamesRE = regexp.MustCompile(`^(head|script|style)($|\s+)`)
17+
var legacyLBR = WIN_LBR
18+
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
1719
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
1820
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
1921
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
2022
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
2123

24+
type options struct {
25+
lbr string
26+
linksInnerText bool
27+
}
28+
29+
func newOptions() *options {
30+
// apply defaults
31+
return &options{
32+
lbr: WIN_LBR,
33+
}
34+
}
35+
36+
// Option is a functional option that alters the behavior of HTML2TextWithOptions
37+
type Option func(*options)
38+
39+
// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
40+
func WithUnixLineBreaks() Option {
41+
return func(o *options) {
42+
o.lbr = UNIX_LBR
43+
}
44+
}
45+
46+
// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
47+
// Example: click news <http://bit.ly/2n4wXRs>
48+
func WithLinksInnerText() Option {
49+
return func(o *options) {
50+
o.linksInnerText = true
51+
}
52+
}
53+
2254
func parseHTMLEntity(entName string) (string, bool) {
2355
if r, ok := entity[entName]; ok {
2456
return string(r), true
@@ -47,11 +79,12 @@ func parseHTMLEntity(entName string) (string, bool) {
4779

4880
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
4981
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
82+
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
5083
func SetUnixLbr(b bool) {
5184
if b {
52-
lbr = UNIX_LBR
85+
legacyLBR = UNIX_LBR
5386
} else {
54-
lbr = WIN_LBR
87+
legacyLBR = WIN_LBR
5588
}
5689
}
5790

@@ -113,12 +146,26 @@ func writeSpace(outBuf *bytes.Buffer) {
113146

114147
// HTML2Text converts html into a text form
115148
func HTML2Text(html string) string {
149+
var opts []Option
150+
if legacyLBR == UNIX_LBR {
151+
opts = append(opts, WithUnixLineBreaks())
152+
}
153+
return HTML2TextWithOptions(html, opts...)
154+
}
155+
156+
// HTML2TextWithOptions converts html into a text form with additional options
157+
func HTML2TextWithOptions(html string, reqOpts ...Option) string {
158+
opts := newOptions()
159+
for _, opt := range reqOpts {
160+
opt(opts)
161+
}
162+
116163
inLen := len(html)
117164
tagStart := 0
118165
inEnt := false
119166
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
120167
shouldOutput := true
121-
// maintain a stack of <a> tag href links and output it after the tag's inner text
168+
// maintain a FIFO queue of <a> tag href links and output each one after its tag's inner text (for opts.linksInnerText only)
122169
hrefs := []string{}
123170
// new line cannot be printed at the beginning or
124171
// for <p> after a new line created by previous <p></p>
@@ -185,23 +232,23 @@ func HTML2Text(html string) string {
185232
tagNameLowercase := strings.ToLower(tag)
186233

187234
if tagNameLowercase == "/ul" {
188-
outBuf.WriteString(lbr)
235+
outBuf.WriteString(opts.lbr)
189236
} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
190-
outBuf.WriteString(lbr)
237+
outBuf.WriteString(opts.lbr)
191238
} else if headersRE.MatchString(tagNameLowercase) {
192239
if canPrintNewline {
193-
outBuf.WriteString(lbr + lbr)
240+
outBuf.WriteString(opts.lbr + opts.lbr)
194241
}
195242
canPrintNewline = false
196243
} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
197244
// new line
198-
outBuf.WriteString(lbr)
245+
outBuf.WriteString(opts.lbr)
199246
} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
200247
if canPrintNewline {
201-
outBuf.WriteString(lbr + lbr)
248+
outBuf.WriteString(opts.lbr + opts.lbr)
202249
}
203250
canPrintNewline = false
204-
} else if tagNameLowercase == "/a" {
251+
} else if opts.linksInnerText && tagNameLowercase == "/a" {
205252
// end of link
206253
// hrefs can be empty here; this happens if the link matched badLinkHrefRE
207254
if len(hrefs) > 0 {
@@ -210,7 +257,7 @@ func HTML2Text(html string) string {
210257
outBuf.WriteString(">")
211258
hrefs = hrefs[1:]
212259
}
213-
} else if linkTagRE.MatchString(tagNameLowercase) {
260+
} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
214261
// parse link href
215262
// add special handling for a tags
216263
m := linkTagRE.FindStringSubmatch(tag)
@@ -220,13 +267,30 @@ func HTML2Text(html string) string {
220267
link = m[3]
221268
}
222269

223-
if !badLinkHrefRE.MatchString(link) {
270+
if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
224271
hrefs = append(hrefs, link)
225272
}
226273
}
227274
} else if badTagnamesRE.MatchString(tagNameLowercase) {
228275
// unwanted block
229276
badTagStackDepth++
277+
278+
// if link inner text preservation is not enabled
279+
// and the current tag is a link tag, parse its href and output that
280+
if !opts.linksInnerText {
281+
// parse link href
282+
m := linkTagRE.FindStringSubmatch(tag)
283+
if len(m) == 4 {
284+
link := m[2]
285+
if len(link) == 0 {
286+
link = m[3]
287+
}
288+
289+
if !badLinkHrefRE.MatchString(link) {
290+
outBuf.WriteString(HTMLEntitiesToText(link))
291+
}
292+
}
293+
}
230294
} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
231295
badTagnamesRE.MatchString(tagNameLowercase[1:]) {
232296
// end of unwanted block

html2text_test.go

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,25 @@ func TestHTML2Text(t *testing.T) {
1212
Convey("Links", func() {
1313
So(HTML2Text(`<div></div>`), ShouldEqual, "")
1414
So(HTML2Text(`<div>simple text</div>`), ShouldEqual, "simple text")
15-
So(HTML2Text(`click <a href="test">here</a>`), ShouldEqual, "click here <test>")
16-
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click here <test>")
17-
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click here <ents/'x'>")
18-
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click here")
19-
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click here or here <test>")
20-
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click news <http://bit.ly/2n4wXRs>")
21-
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "yet </wiki/yet#English>, not yet </wiki/not_yet#English>")
22-
So(HTML2Text(`click <a href="one">here<a href="two"> or</a><span> here</span></a>`), ShouldEqual, "click here or <one> here <two>")
15+
16+
// the original behavior
17+
So(HTML2Text(`click <a href="test">here</a>`), ShouldEqual, "click test")
18+
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click test")
19+
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click ents/'x'")
20+
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click ")
21+
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click test")
22+
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click http://bit.ly/2n4wXRs")
23+
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "/wiki/yet#English, /wiki/not_yet#English")
24+
25+
// with inner text
26+
So(HTML2TextWithOptions(`click <a href="test">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
27+
So(HTML2TextWithOptions(`click <a class="x" href="test">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
28+
So(HTML2TextWithOptions(`click <a href="ents/&apos;x&apos;">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <ents/'x'>")
29+
So(HTML2TextWithOptions(`click <a href="javascript:void(0)">here</a>`, WithLinksInnerText()), ShouldEqual, "click here")
30+
So(HTML2TextWithOptions(`click <a href="test"><span>here</span> or here</a>`, WithLinksInnerText()), ShouldEqual, "click here or here <test>")
31+
So(HTML2TextWithOptions(`click <a href="http://bit.ly/2n4wXRs">news</a>`, WithLinksInnerText()), ShouldEqual, "click news <http://bit.ly/2n4wXRs>")
32+
So(HTML2TextWithOptions(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`, WithLinksInnerText()), ShouldEqual, "yet </wiki/yet#English>, not yet </wiki/not_yet#English>")
33+
So(HTML2TextWithOptions(`click <a href="one">here<a href="two"> or</a><span> here</span></a>`, WithLinksInnerText()), ShouldEqual, "click here or <one> here <two>")
2334
})
2435

2536
Convey("Inlines", func() {
@@ -80,7 +91,7 @@ func TestHTML2Text(t *testing.T) {
8091
ShouldEqual, "we are not interested in scripts")
8192
})
8293

83-
Convey("Switching Unix and Windows line breaks", func() {
94+
Convey("Switching Unix and Windows line breaks (original behavior)", func() {
8495
SetUnixLbr(true)
8596
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\nline\nbreaks")
8697
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\n\nparagraphs")
@@ -89,6 +100,13 @@ func TestHTML2Text(t *testing.T) {
89100
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\n\r\nparagraphs")
90101
})
91102

103+
Convey("Switching Unix and Windows line breaks (new options)", func() {
104+
So(HTML2TextWithOptions(`two<br>line<br/>breaks`, WithUnixLineBreaks()), ShouldEqual, "two\nline\nbreaks")
105+
So(HTML2TextWithOptions(`<p>two</p><p>paragraphs</p>`, WithUnixLineBreaks()), ShouldEqual, "two\n\nparagraphs")
106+
So(HTML2TextWithOptions(`two<br>line<br/>breaks`), ShouldEqual, "two\r\nline\r\nbreaks")
107+
So(HTML2TextWithOptions(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\n\r\nparagraphs")
108+
})
109+
92110
Convey("Custom HTML Tags", func() {
93111
So(HTML2Text(`<aa>hello</aa>`), ShouldEqual, "hello")
94112
So(HTML2Text(`<aa >hello</aa>`), ShouldEqual, "hello")

0 commit comments

Comments
 (0)