@@ -1961,11 +1961,6 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
19611961 return ERR_INVALID_DATA;
19621962 }
19631963
1964- String aux;
1965-
1966- int cstr_size = 0 ;
1967- int str_size = 0 ;
1968-
19691964 /* HANDLE BOM (Byte Order Mark) */
19701965 if (p_len < 0 || p_len >= 3 ) {
19711966 bool has_bom = uint8_t (p_utf8[0 ]) == 0xef && uint8_t (p_utf8[1 ]) == 0xbb && uint8_t (p_utf8[2 ]) == 0xbf ;
@@ -1978,162 +1973,160 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
19781973 }
19791974 }
19801975
1981- bool decode_error = false ;
1982- bool decode_failed = false ;
1983- {
1984- const char *ptrtmp = p_utf8;
1985- const char *ptrtmp_limit = p_len >= 0 ? &p_utf8[p_len] : nullptr ;
1986- int skip = 0 ;
1987- uint8_t c_start = 0 ;
1988- while (ptrtmp != ptrtmp_limit && *ptrtmp) {
1989- #if CHAR_MIN == 0
1990- uint8_t c = *ptrtmp;
1991- #else
1992- uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t (256 + *ptrtmp);
1993- #endif
1994-
1995- if (skip == 0 ) {
1996- if (p_skip_cr && c == ' \r ' ) {
1997- ptrtmp++;
1998- continue ;
1999- }
2000- /* Determine the number of characters in sequence */
2001- if ((c & 0x80 ) == 0 ) {
2002- skip = 0 ;
2003- } else if ((c & 0xe0 ) == 0xc0 ) {
2004- skip = 1 ;
2005- } else if ((c & 0xf0 ) == 0xe0 ) {
2006- skip = 2 ;
2007- } else if ((c & 0xf8 ) == 0xf0 ) {
2008- skip = 3 ;
2009- } else if ((c & 0xfc ) == 0xf8 ) {
2010- skip = 4 ;
2011- } else if ((c & 0xfe ) == 0xfc ) {
2012- skip = 5 ;
2013- } else {
2014- skip = 0 ;
2015- print_unicode_error (vformat (" Invalid UTF-8 leading byte (%x)" , c), true );
2016- decode_failed = true ;
2017- }
2018- c_start = c;
1976+ if (p_len < 0 ) {
1977+ p_len = strlen (p_utf8);
1978+ }
20191979
2020- if (skip == 1 && (c & 0x1e ) == 0 ) {
2021- print_unicode_error (vformat (" Overlong encoding (%x ...)" , c));
2022- decode_error = true ;
2023- }
2024- str_size++;
2025- } else {
2026- if ((c_start == 0xe0 && skip == 2 && c < 0xa0 ) || (c_start == 0xf0 && skip == 3 && c < 0x90 ) || (c_start == 0xf8 && skip == 4 && c < 0x88 ) || (c_start == 0xfc && skip == 5 && c < 0x84 )) {
2027- print_unicode_error (vformat (" Overlong encoding (%x %x ...)" , c_start, c));
2028- decode_error = true ;
2029- }
2030- if (c < 0x80 || c > 0xbf ) {
2031- print_unicode_error (vformat (" Invalid UTF-8 continuation byte (%x ... %x ...)" , c_start, c), true );
2032- decode_failed = true ;
2033- skip = 0 ;
2034- } else {
2035- --skip;
2036- }
2037- }
1980+ // If all UTF-8 characters map to ASCII, then the max size will be p_len, and we add +1 for the null termination.
1981+ resize (p_len + 1 );
1982+ char32_t *dst = ptrw ();
20381983
2039- cstr_size++;
2040- ptrtmp++;
2041- }
1984+ Error result = Error::OK;
20421985
2043- if (skip) {
2044- print_unicode_error (vformat (" Missing %d UTF-8 continuation byte(s)" , skip), true );
2045- decode_failed = true ;
2046- }
2047- }
1986+ const uint8_t *ptrtmp = (uint8_t *)p_utf8;
1987+ const uint8_t *ptr_limit = (uint8_t *)p_utf8 + p_len;
20481988
2049- if (str_size == 0 ) {
2050- clear ();
2051- return OK; // empty string
2052- }
1989+ while (ptrtmp < ptr_limit && *ptrtmp) {
1990+ uint8_t c = *ptrtmp;
20531991
2054- resize (str_size + 1 );
2055- char32_t *dst = ptrw ();
2056- dst[str_size] = 0 ;
2057-
2058- int skip = 0 ;
2059- uint32_t unichar = 0 ;
2060- while (cstr_size) {
2061- #if CHAR_MIN == 0
2062- uint8_t c = *p_utf8;
2063- #else
2064- uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t (256 + *p_utf8);
2065- #endif
1992+ if (p_skip_cr && c == ' \r ' ) {
1993+ ++ptrtmp;
1994+ continue ;
1995+ }
1996+ uint32_t unicode = _replacement_char;
1997+ uint32_t size = 1 ;
20661998
2067- if (skip == 0 ) {
2068- if (p_skip_cr && c == ' \r ' ) {
2069- p_utf8++;
2070- continue ;
1999+ if ((c & 0b10000000 ) == 0 ) {
2000+ unicode = c;
2001+ if (unicode > 0x7F ) {
2002+ unicode = _replacement_char;
2003+ print_unicode_error (vformat (" Invalid unicode codepoint (%d)" , unicode), true );
2004+ result = Error::ERR_INVALID_DATA;
20712005 }
2072- /* Determine the number of characters in sequence */
2073- if ((c & 0x80 ) == 0 ) {
2074- *(dst++) = c;
2075- unichar = 0 ;
2076- skip = 0 ;
2077- } else if ((c & 0xe0 ) == 0xc0 ) {
2078- unichar = (0xff >> 3 ) & c;
2079- skip = 1 ;
2080- } else if ((c & 0xf0 ) == 0xe0 ) {
2081- unichar = (0xff >> 4 ) & c;
2082- skip = 2 ;
2083- } else if ((c & 0xf8 ) == 0xf0 ) {
2084- unichar = (0xff >> 5 ) & c;
2085- skip = 3 ;
2086- } else if ((c & 0xfc ) == 0xf8 ) {
2087- unichar = (0xff >> 6 ) & c;
2088- skip = 4 ;
2089- } else if ((c & 0xfe ) == 0xfc ) {
2090- unichar = (0xff >> 7 ) & c;
2091- skip = 5 ;
2006+ } else if ((c & 0b11100000 ) == 0b11000000 ) {
2007+ if (ptrtmp + 1 >= ptr_limit) {
2008+ print_unicode_error (vformat (" Missing %x UTF-8 continuation byte" , c), true );
2009+ result = Error::ERR_INVALID_DATA;
20922010 } else {
2093- *(dst++) = _replacement_char;
2094- unichar = 0 ;
2095- skip = 0 ;
2011+ uint8_t c2 = *(ptrtmp + 1 );
2012+
2013+ if ((c2 & 0b11000000 ) == 0b10000000 ) {
2014+ unicode = (uint32_t )((c & 0b00011111 ) << 6 ) | (uint32_t )(c2 & 0b00111111 );
2015+
2016+ if (unicode < 0x80 ) {
2017+ unicode = _replacement_char;
2018+ print_unicode_error (vformat (" Overlong encoding (%x %x)" , c, c2));
2019+ result = Error::ERR_INVALID_DATA;
2020+ } else if (unicode > 0x7FF ) {
2021+ unicode = _replacement_char;
2022+ print_unicode_error (vformat (" Invalid unicode codepoint (%d)" , unicode), true );
2023+ result = Error::ERR_INVALID_DATA;
2024+ } else {
2025+ size = 2 ;
2026+ }
2027+ } else {
2028+ print_unicode_error (vformat (" Byte %x is not a correct continuation byte after %x" , c2, c));
2029+ result = Error::ERR_INVALID_DATA;
2030+ }
20962031 }
2097- } else {
2098- if (c < 0x80 || c > 0xbf ) {
2099- *(dst++) = _replacement_char;
2100- skip = 0 ;
2032+ } else if ((c & 0b11110000 ) == 0b11100000 ) {
2033+ uint32_t range_min = (c == 0xE0 ) ? 0xA0 : 0x80 ;
2034+ uint32_t range_max = (c == 0xED ) ? 0x9F : 0xBF ;
2035+ uint8_t c2 = (ptrtmp + 1 ) < ptr_limit ? *(ptrtmp + 1 ) : 0 ;
2036+ uint8_t c3 = (ptrtmp + 2 ) < ptr_limit ? *(ptrtmp + 2 ) : 0 ;
2037+ bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
2038+ bool c3_valid = c3 && ((c3 & 0b11000000 ) == 0b10000000 );
2039+
2040+ if (c2_valid && c3_valid) {
2041+ unicode = (uint32_t )((c & 0b00001111 ) << 12 ) | (uint32_t )((c2 & 0b00111111 ) << 6 ) | (uint32_t )(c3 & 0b00111111 );
2042+
2043+ if (unicode < 0x800 ) {
2044+ unicode = _replacement_char;
2045+ print_unicode_error (vformat (" Overlong encoding (%x %x %x)" , c, c2, c3));
2046+ result = Error::ERR_INVALID_DATA;
2047+ } else if (unicode > 0xFFFF ) {
2048+ unicode = _replacement_char;
2049+ print_unicode_error (vformat (" Invalid unicode codepoint (%d)" , unicode), true );
2050+ result = Error::ERR_INVALID_DATA;
2051+ } else {
2052+ size = 3 ;
2053+ }
21012054 } else {
2102- unichar = (unichar << 6 ) | (c & 0x3f );
2103- --skip;
2104- if (skip == 0 ) {
2105- if (unichar == 0 ) {
2106- print_unicode_error (" NUL character" , true );
2107- decode_failed = true ;
2108- unichar = _replacement_char;
2109- } else if ((unichar & 0xfffff800 ) == 0xd800 ) {
2110- print_unicode_error (vformat (" Unpaired surrogate (%x)" , unichar), true );
2111- decode_failed = true ;
2112- unichar = _replacement_char;
2113- } else if (unichar > 0x10ffff ) {
2114- print_unicode_error (vformat (" Invalid unicode codepoint (%x)" , unichar), true );
2115- decode_failed = true ;
2116- unichar = _replacement_char;
2117- }
2118- *(dst++) = unichar;
2055+ if (c2 == 0 ) {
2056+ print_unicode_error (vformat (" Missing %x UTF-8 continuation byte" , c), true );
2057+ } else if (c2_valid == false ) {
2058+ print_unicode_error (vformat (" Byte %x is not a correct continuation byte after %x" , c2, c));
2059+ } else if (c3 == 0 ) {
2060+ print_unicode_error (vformat (" Missing %x %x UTF-8 continuation byte" , c, c2), true );
2061+ } else {
2062+ print_unicode_error (vformat (" Byte %x is not a correct continuation byte after %x %x" , c3, c, c2));
2063+ // The Unicode specification, in section 3.9 "Unicode Encoding Forms" (Conformance),
2064+ // states: "Only when a sequence of two or three bytes is a truncated version of a sequence which is
2065+ // otherwise well-formed to that point, is more than one byte replaced with a single U+FFFD".
2066+ // So here we replace the first 2 bytes with a single replacement_char.
2067+ size = 2 ;
21192068 }
2069+
2070+ result = Error::ERR_INVALID_DATA;
2071+ }
2072+ } else if ((c & 0b11111000 ) == 0b11110000 ) {
2073+ uint32_t range_min = (c == 0xF0 ) ? 0x90 : 0x80 ;
2074+ uint32_t range_max = (c == 0xF4 ) ? 0x8F : 0xBF ;
2075+
2076+ uint8_t c2 = ((ptrtmp + 1 ) < ptr_limit) ? *(ptrtmp + 1 ) : 0 ;
2077+ uint8_t c3 = ((ptrtmp + 2 ) < ptr_limit) ? *(ptrtmp + 2 ) : 0 ;
2078+ uint8_t c4 = ((ptrtmp + 3 ) < ptr_limit) ? *(ptrtmp + 3 ) : 0 ;
2079+
2080+ bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
2081+ bool c3_valid = c3 && ((c3 & 0b11000000 ) == 0b10000000 );
2082+ bool c4_valid = c4 && ((c4 & 0b11000000 ) == 0b10000000 );
2083+
2084+ if (c2_valid && c3_valid && c4_valid) {
2085+ unicode = (uint32_t )((c & 0b00000111 ) << 18 ) | (uint32_t )((c2 & 0b00111111 ) << 12 ) | (uint32_t )((c3 & 0b00111111 ) << 6 ) | (uint32_t )(c4 & 0b00111111 );
2086+
2087+ if (unicode < 0x10000 ) {
2088+ unicode = _replacement_char;
2089+ print_unicode_error (vformat (" Overlong encoding (%x %x %x %x)" , c, c2, c3, c4));
2090+ result = Error::ERR_INVALID_DATA;
2091+ } else if (unicode > 0x10FFFF ) {
2092+ unicode = _replacement_char;
2093+ print_unicode_error (vformat (" Invalid unicode codepoint (%d)" , unicode), true );
2094+ result = Error::ERR_INVALID_DATA;
2095+ } else {
2096+ size = 4 ;
2097+ }
2098+ } else {
2099+ if (c2 == 0 ) {
2100+ print_unicode_error (vformat (" Missing %x UTF-8 continuation byte" , c), true );
2101+ } else if (c2_valid == false ) {
2102+ print_unicode_error (vformat (" Byte %x is not a correct continuation byte after %x" , c2, c));
2103+ } else if (c3 == 0 ) {
2104+ print_unicode_error (vformat (" Missing %x %x UTF-8 continuation byte" , c, c2), true );
2105+ } else if (c3_valid == false ) {
2106+ print_unicode_error (vformat (" Byte %x is not a correct continuation byte after %x %x" , c3, c, c2));
2107+ size = 2 ;
2108+ } else if (c4 == 0 ) {
2109+ print_unicode_error (vformat (" Missing %x %x %x UTF-8 continuation byte" , c, c2, c3), true );
2110+ } else {
2111+ print_unicode_error (vformat (" Byte %x is not a correct continuation byte after %x %x %x" , c4, c, c2, c3));
2112+ size = 3 ;
2113+ }
2114+
2115+ result = Error::ERR_INVALID_DATA;
21202116 }
2117+ } else {
2118+ print_unicode_error (vformat (" Invalid UTF-8 leading byte (%x)" , c), true );
2119+ result = Error::ERR_INVALID_DATA;
21212120 }
21222121
2123- cstr_size--;
2124- p_utf8++;
2125- }
2126- if (skip) {
2127- *(dst++) = 0x20 ;
2122+ (*dst++) = unicode;
2123+ ptrtmp += size;
21282124 }
21292125
2130- if (decode_failed) {
2131- return ERR_INVALID_DATA;
2132- } else if (decode_error) {
2133- return ERR_PARSE_ERROR;
2134- } else {
2135- return OK;
2136- }
2126+ (*dst++) = 0 ;
2127+ resize (dst - ptr ());
2128+
2129+ return result;
21372130}
21382131
21392132CharString String::utf8 () const {
0 commit comments