Skip to content

Commit 7fed5f2

Browse files
committed
Merge pull request #99826 from kiroxas/improveParseUTF8Performance
Improve `parse_utf8` performance
2 parents 777c663 + e4f8a7f commit 7fed5f2

File tree

2 files changed

+176
-153
lines changed

2 files changed

+176
-153
lines changed

core/string/ustring.cpp

Lines changed: 138 additions & 145 deletions
Original file line numberDiff line numberDiff line change
@@ -1961,11 +1961,6 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
19611961
return ERR_INVALID_DATA;
19621962
}
19631963

1964-
String aux;
1965-
1966-
int cstr_size = 0;
1967-
int str_size = 0;
1968-
19691964
/* HANDLE BOM (Byte Order Mark) */
19701965
if (p_len < 0 || p_len >= 3) {
19711966
bool has_bom = uint8_t(p_utf8[0]) == 0xef && uint8_t(p_utf8[1]) == 0xbb && uint8_t(p_utf8[2]) == 0xbf;
@@ -1978,162 +1973,160 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
19781973
}
19791974
}
19801975

1981-
bool decode_error = false;
1982-
bool decode_failed = false;
1983-
{
1984-
const char *ptrtmp = p_utf8;
1985-
const char *ptrtmp_limit = p_len >= 0 ? &p_utf8[p_len] : nullptr;
1986-
int skip = 0;
1987-
uint8_t c_start = 0;
1988-
while (ptrtmp != ptrtmp_limit && *ptrtmp) {
1989-
#if CHAR_MIN == 0
1990-
uint8_t c = *ptrtmp;
1991-
#else
1992-
uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp);
1993-
#endif
1994-
1995-
if (skip == 0) {
1996-
if (p_skip_cr && c == '\r') {
1997-
ptrtmp++;
1998-
continue;
1999-
}
2000-
/* Determine the number of characters in sequence */
2001-
if ((c & 0x80) == 0) {
2002-
skip = 0;
2003-
} else if ((c & 0xe0) == 0xc0) {
2004-
skip = 1;
2005-
} else if ((c & 0xf0) == 0xe0) {
2006-
skip = 2;
2007-
} else if ((c & 0xf8) == 0xf0) {
2008-
skip = 3;
2009-
} else if ((c & 0xfc) == 0xf8) {
2010-
skip = 4;
2011-
} else if ((c & 0xfe) == 0xfc) {
2012-
skip = 5;
2013-
} else {
2014-
skip = 0;
2015-
print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
2016-
decode_failed = true;
2017-
}
2018-
c_start = c;
1976+
if (p_len < 0) {
1977+
p_len = strlen(p_utf8);
1978+
}
20191979

2020-
if (skip == 1 && (c & 0x1e) == 0) {
2021-
print_unicode_error(vformat("Overlong encoding (%x ...)", c));
2022-
decode_error = true;
2023-
}
2024-
str_size++;
2025-
} else {
2026-
if ((c_start == 0xe0 && skip == 2 && c < 0xa0) || (c_start == 0xf0 && skip == 3 && c < 0x90) || (c_start == 0xf8 && skip == 4 && c < 0x88) || (c_start == 0xfc && skip == 5 && c < 0x84)) {
2027-
print_unicode_error(vformat("Overlong encoding (%x %x ...)", c_start, c));
2028-
decode_error = true;
2029-
}
2030-
if (c < 0x80 || c > 0xbf) {
2031-
print_unicode_error(vformat("Invalid UTF-8 continuation byte (%x ... %x ...)", c_start, c), true);
2032-
decode_failed = true;
2033-
skip = 0;
2034-
} else {
2035-
--skip;
2036-
}
2037-
}
1980+
// If all UTF-8 characters map to ASCII, then the max size will be p_len, and we add +1 for the null termination.
1981+
resize(p_len + 1);
1982+
char32_t *dst = ptrw();
20381983

2039-
cstr_size++;
2040-
ptrtmp++;
2041-
}
1984+
Error result = Error::OK;
20421985

2043-
if (skip) {
2044-
print_unicode_error(vformat("Missing %d UTF-8 continuation byte(s)", skip), true);
2045-
decode_failed = true;
2046-
}
2047-
}
1986+
const uint8_t *ptrtmp = (uint8_t *)p_utf8;
1987+
const uint8_t *ptr_limit = (uint8_t *)p_utf8 + p_len;
20481988

2049-
if (str_size == 0) {
2050-
clear();
2051-
return OK; // empty string
2052-
}
1989+
while (ptrtmp < ptr_limit && *ptrtmp) {
1990+
uint8_t c = *ptrtmp;
20531991

2054-
resize(str_size + 1);
2055-
char32_t *dst = ptrw();
2056-
dst[str_size] = 0;
2057-
2058-
int skip = 0;
2059-
uint32_t unichar = 0;
2060-
while (cstr_size) {
2061-
#if CHAR_MIN == 0
2062-
uint8_t c = *p_utf8;
2063-
#else
2064-
uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8);
2065-
#endif
1992+
if (p_skip_cr && c == '\r') {
1993+
++ptrtmp;
1994+
continue;
1995+
}
1996+
uint32_t unicode = _replacement_char;
1997+
uint32_t size = 1;
20661998

2067-
if (skip == 0) {
2068-
if (p_skip_cr && c == '\r') {
2069-
p_utf8++;
2070-
continue;
1999+
if ((c & 0b10000000) == 0) {
2000+
unicode = c;
2001+
if (unicode > 0x7F) {
2002+
unicode = _replacement_char;
2003+
print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
2004+
result = Error::ERR_INVALID_DATA;
20712005
}
2072-
/* Determine the number of characters in sequence */
2073-
if ((c & 0x80) == 0) {
2074-
*(dst++) = c;
2075-
unichar = 0;
2076-
skip = 0;
2077-
} else if ((c & 0xe0) == 0xc0) {
2078-
unichar = (0xff >> 3) & c;
2079-
skip = 1;
2080-
} else if ((c & 0xf0) == 0xe0) {
2081-
unichar = (0xff >> 4) & c;
2082-
skip = 2;
2083-
} else if ((c & 0xf8) == 0xf0) {
2084-
unichar = (0xff >> 5) & c;
2085-
skip = 3;
2086-
} else if ((c & 0xfc) == 0xf8) {
2087-
unichar = (0xff >> 6) & c;
2088-
skip = 4;
2089-
} else if ((c & 0xfe) == 0xfc) {
2090-
unichar = (0xff >> 7) & c;
2091-
skip = 5;
2006+
} else if ((c & 0b11100000) == 0b11000000) {
2007+
if (ptrtmp + 1 >= ptr_limit) {
2008+
print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
2009+
result = Error::ERR_INVALID_DATA;
20922010
} else {
2093-
*(dst++) = _replacement_char;
2094-
unichar = 0;
2095-
skip = 0;
2011+
uint8_t c2 = *(ptrtmp + 1);
2012+
2013+
if ((c2 & 0b11000000) == 0b10000000) {
2014+
unicode = (uint32_t)((c & 0b00011111) << 6) | (uint32_t)(c2 & 0b00111111);
2015+
2016+
if (unicode < 0x80) {
2017+
unicode = _replacement_char;
2018+
print_unicode_error(vformat("Overlong encoding (%x %x)", c, c2));
2019+
result = Error::ERR_INVALID_DATA;
2020+
} else if (unicode > 0x7FF) {
2021+
unicode = _replacement_char;
2022+
print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
2023+
result = Error::ERR_INVALID_DATA;
2024+
} else {
2025+
size = 2;
2026+
}
2027+
} else {
2028+
print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
2029+
result = Error::ERR_INVALID_DATA;
2030+
}
20962031
}
2097-
} else {
2098-
if (c < 0x80 || c > 0xbf) {
2099-
*(dst++) = _replacement_char;
2100-
skip = 0;
2032+
} else if ((c & 0b11110000) == 0b11100000) {
2033+
uint32_t range_min = (c == 0xE0) ? 0xA0 : 0x80;
2034+
uint32_t range_max = (c == 0xED) ? 0x9F : 0xBF;
2035+
uint8_t c2 = (ptrtmp + 1) < ptr_limit ? *(ptrtmp + 1) : 0;
2036+
uint8_t c3 = (ptrtmp + 2) < ptr_limit ? *(ptrtmp + 2) : 0;
2037+
bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
2038+
bool c3_valid = c3 && ((c3 & 0b11000000) == 0b10000000);
2039+
2040+
if (c2_valid && c3_valid) {
2041+
unicode = (uint32_t)((c & 0b00001111) << 12) | (uint32_t)((c2 & 0b00111111) << 6) | (uint32_t)(c3 & 0b00111111);
2042+
2043+
if (unicode < 0x800) {
2044+
unicode = _replacement_char;
2045+
print_unicode_error(vformat("Overlong encoding (%x %x %x)", c, c2, c3));
2046+
result = Error::ERR_INVALID_DATA;
2047+
} else if (unicode > 0xFFFF) {
2048+
unicode = _replacement_char;
2049+
print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
2050+
result = Error::ERR_INVALID_DATA;
2051+
} else {
2052+
size = 3;
2053+
}
21012054
} else {
2102-
unichar = (unichar << 6) | (c & 0x3f);
2103-
--skip;
2104-
if (skip == 0) {
2105-
if (unichar == 0) {
2106-
print_unicode_error("NUL character", true);
2107-
decode_failed = true;
2108-
unichar = _replacement_char;
2109-
} else if ((unichar & 0xfffff800) == 0xd800) {
2110-
print_unicode_error(vformat("Unpaired surrogate (%x)", unichar), true);
2111-
decode_failed = true;
2112-
unichar = _replacement_char;
2113-
} else if (unichar > 0x10ffff) {
2114-
print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar), true);
2115-
decode_failed = true;
2116-
unichar = _replacement_char;
2117-
}
2118-
*(dst++) = unichar;
2055+
if (c2 == 0) {
2056+
print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
2057+
} else if (c2_valid == false) {
2058+
print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
2059+
} else if (c3 == 0) {
2060+
print_unicode_error(vformat("Missing %x %x UTF-8 continuation byte", c, c2), true);
2061+
} else {
2062+
print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x", c3, c, c2));
2063+
// The Unicode specification, in paragraph 3.9 "Unicode Encoding Forms" Conformance,
2064+
// states: "Only when a sequence of two or three bytes is a truncated version of a sequence which is
2065+
// otherwise well-formed to that point, is more than one byte replaced with a single U+FFFD"
2066+
// So here we replace the first 2 bytes with one single replacement_char.
2067+
size = 2;
21192068
}
2069+
2070+
result = Error::ERR_INVALID_DATA;
2071+
}
2072+
} else if ((c & 0b11111000) == 0b11110000) {
2073+
uint32_t range_min = (c == 0xF0) ? 0x90 : 0x80;
2074+
uint32_t range_max = (c == 0xF4) ? 0x8F : 0xBF;
2075+
2076+
uint8_t c2 = ((ptrtmp + 1) < ptr_limit) ? *(ptrtmp + 1) : 0;
2077+
uint8_t c3 = ((ptrtmp + 2) < ptr_limit) ? *(ptrtmp + 2) : 0;
2078+
uint8_t c4 = ((ptrtmp + 3) < ptr_limit) ? *(ptrtmp + 3) : 0;
2079+
2080+
bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
2081+
bool c3_valid = c3 && ((c3 & 0b11000000) == 0b10000000);
2082+
bool c4_valid = c4 && ((c4 & 0b11000000) == 0b10000000);
2083+
2084+
if (c2_valid && c3_valid && c4_valid) {
2085+
unicode = (uint32_t)((c & 0b00000111) << 18) | (uint32_t)((c2 & 0b00111111) << 12) | (uint32_t)((c3 & 0b00111111) << 6) | (uint32_t)(c4 & 0b00111111);
2086+
2087+
if (unicode < 0x10000) {
2088+
unicode = _replacement_char;
2089+
print_unicode_error(vformat("Overlong encoding (%x %x %x %x)", c, c2, c3, c4));
2090+
result = Error::ERR_INVALID_DATA;
2091+
} else if (unicode > 0x10FFFF) {
2092+
unicode = _replacement_char;
2093+
print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
2094+
result = Error::ERR_INVALID_DATA;
2095+
} else {
2096+
size = 4;
2097+
}
2098+
} else {
2099+
if (c2 == 0) {
2100+
print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
2101+
} else if (c2_valid == false) {
2102+
print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
2103+
} else if (c3 == 0) {
2104+
print_unicode_error(vformat("Missing %x %x UTF-8 continuation byte", c, c2), true);
2105+
} else if (c3_valid == false) {
2106+
print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x", c3, c, c2));
2107+
size = 2;
2108+
} else if (c4 == 0) {
2109+
print_unicode_error(vformat("Missing %x %x %x UTF-8 continuation byte", c, c2, c3), true);
2110+
} else {
2111+
print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x %x", c4, c, c2, c3));
2112+
size = 3;
2113+
}
2114+
2115+
result = Error::ERR_INVALID_DATA;
21202116
}
2117+
} else {
2118+
print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
2119+
result = Error::ERR_INVALID_DATA;
21212120
}
21222121

2123-
cstr_size--;
2124-
p_utf8++;
2125-
}
2126-
if (skip) {
2127-
*(dst++) = 0x20;
2122+
(*dst++) = unicode;
2123+
ptrtmp += size;
21282124
}
21292125

2130-
if (decode_failed) {
2131-
return ERR_INVALID_DATA;
2132-
} else if (decode_error) {
2133-
return ERR_PARSE_ERROR;
2134-
} else {
2135-
return OK;
2136-
}
2126+
(*dst++) = 0;
2127+
resize(dst - ptr());
2128+
2129+
return result;
21372130
}
21382131

21392132
CharString String::utf8() const {

tests/core/string/test_string.h

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -165,11 +165,11 @@ TEST_CASE("[String] UTF8 with CR") {
165165
CHECK(no_cr == base.replace("\r", ""));
166166
}
167167

168-
TEST_CASE("[String] Invalid UTF8 (non-standard)") {
168+
TEST_CASE("[String] Invalid UTF8 (non shortest form sequence)") {
169169
ERR_PRINT_OFF
170-
static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 };
171-
// + +2 +2 +2 +3 overlong +3 unpaired +2
172-
static const char32_t u32str[] = { 0x45, 0x304A, 0x3088, 0x3046, 0x1F3A4, 0x20AC, 0xFFFD, 0 };
170+
// Examples from the Unicode Standard: 3.9 Unicode Encoding Forms - Table 3.8.
171+
static const uint8_t u8str[] = { 0xC0, 0xAF, 0xE0, 0x80, 0xBF, 0xF0, 0x81, 0x82, 0x41, 0 };
172+
static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0 };
173173
String s;
174174
Error err = s.parse_utf8((const char *)u8str);
175175
CHECK(err == ERR_INVALID_DATA);
@@ -180,11 +180,41 @@ TEST_CASE("[String] Invalid UTF8 (non-standard)") {
180180
ERR_PRINT_ON
181181
}
182182

183-
TEST_CASE("[String] Invalid UTF8 (unrecoverable)") {
183+
TEST_CASE("[String] Invalid UTF8 (ill formed sequences for surrogates)") {
184184
ERR_PRINT_OFF
185-
static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0x8F, 0xE3, 0xE3, 0x98, 0x8F, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xC0, 0x80, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 };
186-
// + +2 inv +2 inv inv inv +2 +2 ovl NUL +1 +3 overlong +3 unpaired +2
187-
static const char32_t u32str[] = { 0x45, 0x304A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x3088, 0x3046, 0xFFFD, 0x1F3A4, 0x20AC, 0xFFFD, 0 };
185+
// Examples from the Unicode Standard: 3.9 Unicode Encoding Forms - Table 3.9.
186+
static const uint8_t u8str[] = { 0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF, 0xED, 0xAF, 0x41, 0 };
187+
static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0 };
188+
String s;
189+
Error err = s.parse_utf8((const char *)u8str);
190+
CHECK(err == ERR_INVALID_DATA);
191+
CHECK(s == u32str);
192+
193+
CharString cs = (const char *)u8str;
194+
CHECK(String::utf8(cs) == s);
195+
ERR_PRINT_ON
196+
}
197+
198+
TEST_CASE("[String] Invalid UTF8 (other ill formed sequences)") {
199+
ERR_PRINT_OFF
200+
// Examples from the Unicode Standard: 3.9 Unicode Encoding Forms - Table 3.10.
201+
static const uint8_t u8str[] = { 0xF4, 0x91, 0x92, 0x93, 0xFF, 0x41, 0x80, 0xBF, 0x42, 0 };
202+
static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0xFFFD, 0xFFFD, 0x42, 0 };
203+
String s;
204+
Error err = s.parse_utf8((const char *)u8str);
205+
CHECK(err == ERR_INVALID_DATA);
206+
CHECK(s == u32str);
207+
208+
CharString cs = (const char *)u8str;
209+
CHECK(String::utf8(cs) == s);
210+
ERR_PRINT_ON
211+
}
212+
213+
TEST_CASE("[String] Invalid UTF8 (truncated sequences)") {
214+
ERR_PRINT_OFF
215+
// Examples from the Unicode Standard: 3.9 Unicode Encoding Forms - Table 3.11.
216+
static const uint8_t u8str[] = { 0xE1, 0x80, 0xE2, 0xF0, 0x91, 0x92, 0xF1, 0xBF, 0x41, 0 };
217+
static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0 };
188218
String s;
189219
Error err = s.parse_utf8((const char *)u8str);
190220
CHECK(err == ERR_INVALID_DATA);

0 commit comments

Comments
 (0)