Skip to content

Commit 532ffb3

Browse files
nightlyone authored and LukeShu committed
html: speed up UnescapeString
Add benchmarks for sparsely escaped and densely escaped strings. Then speed up the sparse unescaping part heavily by using IndexByte and copy to skip the parts containing no escaping very fast. Unescaping densely escaped strings is slower because of the new function call overhead. But sparsely encoded strings are seen more often in the UTF-8-enabled web. We win part of the speed back by looking up entityName differently.

benchmark                   old ns/op   new ns/op   delta
BenchmarkEscape             31680       31396       -0.90%
BenchmarkEscapeNone         6507        6872        +5.61%
BenchmarkUnescape           36481       48298       +32.39%
BenchmarkUnescapeNone       332         325         -2.11%
BenchmarkUnescapeSparse     8836        3221        -63.55%
BenchmarkUnescapeDense      30639       32224       +5.17%

Change-Id: If606cb01897a40eefe35ba98f2ff23bb25251606
Reviewed-on: https://go-review.googlesource.com/10172
Reviewed-by: Brad Fitzpatrick <[email protected]>
Run-TryBot: Brad Fitzpatrick <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Cherry-picked-from: golang/go@5b92028
1 parent 8229daf commit 532ffb3

File tree

2 files changed

+68
-23
lines changed

2 files changed

+68
-23
lines changed

html/escape.go

Lines changed: 50 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -135,14 +135,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
135135
break
136136
}
137137

138-
entityName := string(s[1:i])
139-
if entityName == "" {
138+
entityName := s[1:i]
139+
if len(entityName) == 0 {
140140
// No-op.
141141
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
142142
// No-op.
143-
} else if x := entity[entityName]; x != 0 {
143+
} else if x := entity[string(entityName)]; x != 0 {
144144
return dst + utf8.EncodeRune(b[dst:], x), src + i
145-
} else if x := entity2[entityName]; x[0] != 0 {
145+
} else if x := entity2[string(entityName)]; x[0] != 0 {
146146
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
147147
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
148148
} else if !attribute {
@@ -151,7 +151,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
151151
maxLen = longestEntityWithoutSemicolon
152152
}
153153
for j := maxLen; j > 1; j-- {
154-
if x := entity[entityName[:j]]; x != 0 {
154+
if x := entity[string(entityName[:j])]; x != 0 {
155155
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
156156
}
157157
}
@@ -165,22 +165,30 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
165165
// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
166166
// attribute should be true if parsing an attribute value.
167167
func unescape(b []byte, attribute bool) []byte {
168-
for i, c := range b {
169-
if c == '&' {
170-
dst, src := unescapeEntity(b, i, i, attribute)
171-
for src < len(b) {
172-
c := b[src]
173-
if c == '&' {
174-
dst, src = unescapeEntity(b, dst, src, attribute)
175-
} else {
176-
b[dst] = c
177-
dst, src = dst+1, src+1
178-
}
179-
}
180-
return b[0:dst]
168+
i := bytes.IndexByte(b, '&')
169+
170+
if i < 0 {
171+
return b
172+
}
173+
174+
dst, src := unescapeEntity(b, i, i, attribute)
175+
for len(b[src:]) > 0 {
176+
if b[src] == '&' {
177+
i = 0
178+
} else {
179+
i = bytes.IndexByte(b[src:], '&')
180+
}
181+
if i < 0 {
182+
dst += copy(b[dst:], b[src:])
183+
break
184+
}
185+
186+
if i > 0 {
187+
copy(b[dst:], b[src:src+i])
181188
}
189+
dst, src = unescapeEntity(b, dst+i, src+i, attribute)
182190
}
183-
return b
191+
return b[:dst]
184192
}
185193

186194
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
@@ -302,8 +310,29 @@ func EscapeString(s string) string {
302310
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
303311
// always true.
304312
func UnescapeString(s string) string {
305-
if !strings.Contains(s, "&") {
313+
i := strings.IndexByte(s, '&')
314+
315+
if i < 0 {
306316
return s
307317
}
308-
return string(unescape([]byte(s), false))
318+
319+
b := []byte(s)
320+
dst, src := unescapeEntity(b, i, i, false)
321+
for len(s[src:]) > 0 {
322+
if s[src] == '&' {
323+
i = 0
324+
} else {
325+
i = strings.IndexByte(s[src:], '&')
326+
}
327+
if i < 0 {
328+
dst += copy(b[dst:], s[src:])
329+
break
330+
}
331+
332+
if i > 0 {
333+
copy(b[dst:], s[src:src+i])
334+
}
335+
dst, src = unescapeEntity(b, dst+i, src+i, false)
336+
}
337+
return string(b[:dst])
309338
}

html/escape_test.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,10 @@ func TestUnescapeEscape(t *testing.T) {
118118
}
119119

120120
var (
121-
benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
122-
benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
121+
benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
122+
benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
123+
benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&amp;", 10)
124+
benchUnescapeDense = strings.Repeat("&amp;&lt; &amp; &lt;", 100)
123125
)
124126

125127
func BenchmarkEscape(b *testing.B) {
@@ -151,3 +153,17 @@ func BenchmarkUnescapeNone(b *testing.B) {
151153
n += len(UnescapeString(s))
152154
}
153155
}
156+
157+
func BenchmarkUnescapeSparse(b *testing.B) {
158+
n := 0
159+
for i := 0; i < b.N; i++ {
160+
n += len(UnescapeString(benchUnescapeSparse))
161+
}
162+
}
163+
164+
func BenchmarkUnescapeDense(b *testing.B) {
165+
n := 0
166+
for i := 0; i < b.N; i++ {
167+
n += len(UnescapeString(benchUnescapeDense))
168+
}
169+
}

0 commit comments

Comments
 (0)