Skip to content

Commit 217b01a

Browse files
committed
perf: Speed up UTF16 decoding with ASCII fast-path
1 parent 3ce4745 commit 217b01a

File tree

1 file changed

+33
-15
lines changed

1 file changed

+33
-15
lines changed

pkg/util/utf16/utf16.go

Lines changed: 33 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -36,27 +36,45 @@ const (
3636
// isHighSurrogate reports whether r falls in the high-surrogate range,
// bounded below by surr1 and above by 0xdbff (both inclusive).
func isHighSurrogate(r rune) bool {
	return surr1 <= r && r <= 0xdbff
}
3737
// isLowSurrogate reports whether r falls in the low-surrogate range,
// bounded below by surr2 and above by 0xdfff (both inclusive).
func isLowSurrogate(r rune) bool {
	return surr2 <= r && r <= 0xdfff
}
3838

39-
// Decode decodes the UTF16-encoded string to UTF-8 string. This function
40-
// exhibits much better performance than the standard library counterpart.
41-
// All credits go to: https://gist.github.com/skeeto/09f1410183d246f9b18cba95c4e602f0
39+
// Decode decodes the UTF-16 encoded string to a UTF-8 string using an ASCII fast path.
40+
// This function exhibits much better performance than the standard library counterpart.
4241
func Decode(p []uint16) string {
43-
s := make([]byte, 0, 2*len(p))
42+
n := len(p)
43+
if n == 0 {
44+
return ""
45+
}
46+
47+
s := make([]byte, 0, n*2)
48+
4449
for i := 0; i < len(p); i++ {
45-
r := rune(0xfffd)
50+
// ascii fast-path (0x0000–0x007F)
51+
if p[i] <= 0x7F {
52+
s = append(s, byte(p[i]))
53+
continue
54+
}
55+
4656
r1 := rune(p[i])
47-
if isHighSurrogate(r1) {
48-
if i+1 < len(p) {
49-
r2 := rune(p[i+1])
50-
if isLowSurrogate(r2) {
51-
i++
52-
r = 0x10000 + (r1-surr1)<<10 + (r2 - surr2)
53-
}
57+
58+
// surrogate pair handling
59+
if isHighSurrogate(r1) && i+1 < n {
60+
r2 := rune(p[i+1])
61+
if isLowSurrogate(r2) {
62+
i++
63+
r := 0x10000 + (r1-surr1)<<10 + (r2 - surr2)
64+
s = utf8.AppendRune(s, r)
65+
continue
5466
}
55-
} else if !isLowSurrogate(r) {
56-
r = r1
5767
}
58-
s = utf8.AppendRune(s, r)
68+
69+
// non-surrogate BMP code point, or an unpaired high surrogate (which utf8.AppendRune encodes as U+FFFD)
70+
if !isLowSurrogate(r1) {
71+
s = utf8.AppendRune(s, r1)
72+
} else {
73+
// lone low surrogate: emit the replacement character (U+FFFD)
74+
s = utf8.AppendRune(s, utf8.RuneError)
75+
}
5976
}
77+
6078
return string(s)
6179
}
6280

0 commit comments

Comments
 (0)