diff --git a/pkg/util/utf16/utf16.go b/pkg/util/utf16/utf16.go index 7387cac16..18b44d9b3 100644 --- a/pkg/util/utf16/utf16.go +++ b/pkg/util/utf16/utf16.go @@ -36,27 +36,45 @@ const ( func isHighSurrogate(r rune) bool { return r >= surr1 && r <= 0xdbff } func isLowSurrogate(r rune) bool { return r >= surr2 && r <= 0xdfff } -// Decode decodes the UTF16-encoded string to UTF-8 string. This function -// exhibits much better performance than the standard library counterpart. -// All credits go to: https://gist.github.com/skeeto/09f1410183d246f9b18cba95c4e602f0 +// Decode decodes the UTF16-encoded string to UTF-8 string using fast ASCII path. +// This function exhibits much better performance than the standard library counterpart. func Decode(p []uint16) string { - s := make([]byte, 0, 2*len(p)) + n := len(p) + if n == 0 { + return "" + } + + s := make([]byte, 0, n*2) + for i := 0; i < len(p); i++ { - r := rune(0xfffd) + // ascii fast-path (0x0000–0x007F) + if p[i] <= 0x7F { + s = append(s, byte(p[i])) + continue + } + r1 := rune(p[i]) - if isHighSurrogate(r1) { - if i+1 < len(p) { - r2 := rune(p[i+1]) - if isLowSurrogate(r2) { - i++ - r = 0x10000 + (r1-surr1)<<10 + (r2 - surr2) - } + + // surrogate pair handling + if isHighSurrogate(r1) && i+1 < n { + r2 := rune(p[i+1]) + if isLowSurrogate(r2) { + i++ + r := 0x10000 + (r1-surr1)<<10 + (r2 - surr2) + s = utf8.AppendRune(s, r) + continue } - } else if !isLowSurrogate(r) { - r = r1 } - s = utf8.AppendRune(s, r) + + // non-surrogate BMP code point or malformed surrogate + if !isLowSurrogate(r1) { + s = utf8.AppendRune(s, r1) + } else { + // lone low surrogate to replacement char + s = utf8.AppendRune(s, utf8.RuneError) + } } + return string(s) }