@@ -64,94 +64,132 @@ namespace hud
6464 return true ;
6565 }
6666
67- [[nodiscard]] static constexpr bool is_valid_utf8 (const char8 *string, usize byte_count) noexcept
67+ [[nodiscard]] static constexpr bool is_valid_utf8_generic (const char8 *string, usize byte_count) noexcept
6868 {
69- u64 pos = 0 ;
69+ usize pos = 0 ;
7070 u32 code_point = 0 ;
7171 while (pos < byte_count) {
72- // check of the next 16 bytes are ascii.
73- u64 next_pos = pos + 16 ;
74- if (next_pos <= byte_count) { // if it is safe to read 16 more bytes, check that they are ascii
75- u64 v1 = hud::memory::unaligned_load64 (string + pos);
76- // std::memcpy(&v1, string + pos, sizeof(u64));
77- u64 v2 = hud::memory::unaligned_load64 (string + pos + sizeof (u64 ));
78- // std::memcpy(&v2, string + pos + sizeof(u64), sizeof(u64));
72+ // Optimization step:
73+ // If the next 16 bytes are guaranteed to be ASCII (all < 128),
74+ // we can skip them all at once instead of checking byte by byte.
75+ usize next_pos = pos + 16 ;
76+ if (next_pos <= byte_count) { // Make sure we don't read past the buffer
77+ u64 v1 = hud::memory::unaligned_load64 (string + pos); // load first 8 bytes
78+ u64 v2 = hud::memory::unaligned_load64 (string + pos + sizeof (u64 )); // load next 8 bytes
79+ // Bitwise OR combines both 8-byte blocks so we only need a single mask test below.
80+ // If any byte in v1 or v2 has its high bit set (>= 0x80, non-ASCII),
81+ // the result will also have that bit set. This lets us quickly check
82+ // if all 16 bytes are ASCII with one comparison instead of two.
7983 u64 v {v1 | v2};
8084 if ((v & 0x8080808080808080 ) == 0 ) {
81- pos = next_pos;
85+ pos = next_pos; // all 16 bytes are ASCII → skip them at once
8286 continue ;
8387 }
8488 }
89+
90+ // Now process byte by byte
8591 unsigned char byte = string[pos];
8692
87- while (byte < 0b10000000 ) {
93+ // Consume consecutive ASCII bytes.
94+ // This inner loop skips multiple ASCII chars in a row efficiently.
95+ while ((byte & 0x80 ) == 0 ) {
8896 if (++pos == byte_count) {
8997 return true ;
9098 }
9199 byte = string[pos];
92100 }
93101
102+ // Case: 2-byte sequence -> 110xxxxx 10xxxxxx
103+ // If we catch leading byte 110xxxxx
94104 if ((byte & 0b11100000 ) == 0b11000000 ) {
105+
106+ // Jump to next supposed code point (after 110xxxxx 10xxxxxx)
107+ // If we go too far, then there is no continuous byte 10xxxxxx
95108 next_pos = pos + 2 ;
96109 if (next_pos > byte_count) {
97110 return false ;
98111 }
112+ // Ensure 1st continuous byte is 10xxxxxx
99113 if ((string[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
100114 return false ;
101115 }
102- // range check
116+ // Read the code point
103117 code_point = (byte & 0b00011111 ) << 6 | (string[pos + 1 ] & 0b00111111 );
118+ // Ensure code point is [0x80, 0x7FF] aka [U+0080, U+07FF]
104119 if ((code_point < 0x80 ) || (0x7ff < code_point)) {
105120 return false ;
106121 }
107122 }
123+ // Case: 3-byte sequence -> 1110xxxx 10xxxxxx 10xxxxxx
124+ // If we catch leading byte 1110xxxx
108125 else if ((byte & 0b11110000 ) == 0b11100000 ) {
126+
127+ // Jump to next supposed code point (after 1110xxxx 10xxxxxx 10xxxxxx)
128+ // If we go too far, then there is no continuous bytes 10xxxxxx 10xxxxxx
109129 next_pos = pos + 3 ;
110130 if (next_pos > byte_count) {
111131 return false ;
112132 }
133+ // Ensure 1st continuous byte is 10xxxxxx
113134 if ((string[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
114135 return false ;
115136 }
137+ // Ensure 2nd continuous byte is 10xxxxxx
116138 if ((string[pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
117139 return false ;
118140 }
119- // range check
141+ // Read the code point
120142 code_point = (byte & 0b00001111 ) << 12 | (string[pos + 1 ] & 0b00111111 ) << 6 | (string[pos + 2 ] & 0b00111111 );
143+ // Check code point valid value
144+ // - must not be overlong encoding (< 0x800 is invalid)
145+ // - must be [0x0800, 0xFFFF] aka [U+0800, U+FFFF]
146+ // - must not be in surrogate range [0xD800, 0xDFFF] aka [U+D800, U+DFFF]
121147 if ((code_point < 0x800 ) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000 )) {
122148 return false ;
123149 }
124150 }
125- else if ((byte & 0b11111000 ) == 0b11110000 ) { // 0b11110000
151+ // Case: 4-byte sequence -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
152+ // If we catch leading byte 11110xxx
153+ else if ((byte & 0b11111000 ) == 0b11110000 ) {
154+ // Jump to next supposed code point (after 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
155+ // If we go too far, then there is no continuous bytes 10xxxxxx 10xxxxxx 10xxxxxx
126156 next_pos = pos + 4 ;
127157 if (next_pos > byte_count) {
128158 return false ;
129159 }
160+ // Ensure 1st continuous byte is 10xxxxxx
130161 if ((string[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
131162 return false ;
132163 }
164+ // Ensure 2nd continuous byte is 10xxxxxx
133165 if ((string[pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
134166 return false ;
135167 }
168+ // Ensure 3rd continuous byte is 10xxxxxx
136169 if ((string[pos + 3 ] & 0b11000000 ) != 0b10000000 ) {
137170 return false ;
138171 }
139- // range check
140- code_point =
141- (byte & 0b00000111 ) << 18 | (string[pos + 1 ] & 0b00111111 ) << 12 | (string[pos + 2 ] & 0b00111111 ) << 6 | (string[pos + 3 ] & 0b00111111 );
172+ // Read the code point
173+ code_point = (byte & 0b00000111 ) << 18 | (string[pos + 1 ] & 0b00111111 ) << 12 | (string[pos + 2 ] & 0b00111111 ) << 6 | (string[pos + 3 ] & 0b00111111 );
174+ // Check code point valid value
175+ // - must be > 0xFFFF (otherwise it's overlong)
176+ // - must not exceed Unicode max (0x10FFFF)
142177 if (code_point <= 0xffff || 0x10ffff < code_point) {
143178 return false ;
144179 }
145180 }
146181 else {
147- // we may have a continuation
182+ // Any other pattern is invalid:
183+ // e.g. a continuation byte without a proper leading byte
148184 return false ;
149185 }
186+ // Move to the next character after validating the current one
150187 pos = next_pos;
151188 }
152189 return true ;
153190 }
154191
192+ [[nodiscard]] static bool is_valid_utf8_sse () noexcept ;
155193 /* *
156194 * Test whether wide null-terminated string contains only pure ansi characters, checking string_size is not bigger than length of the string.
157195 * @param string The null-terminated string
0 commit comments