Skip to content

Commit 3ce5993

Browse files
author
Julian LALU
committed
Improve is_valid_utf8_generic
1 parent 2d3966d commit 3ce5993

File tree

3 files changed

+70
-28
lines changed

3 files changed

+70
-28
lines changed

interface/core/string/cstring.h

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -64,94 +64,132 @@ namespace hud
6464
return true;
6565
}
6666

67-
[[nodiscard]] static constexpr bool is_valid_utf8(const char8 *string, usize byte_count) noexcept
67+
[[nodiscard]] static constexpr bool is_valid_utf8_generic(const char8 *string, usize byte_count) noexcept
6868
{
69-
u64 pos = 0;
69+
usize pos = 0;
7070
u32 code_point = 0;
7171
while (pos < byte_count) {
72-
// check of the next 16 bytes are ascii.
73-
u64 next_pos = pos + 16;
74-
if (next_pos <= byte_count) { // if it is safe to read 16 more bytes, check that they are ascii
75-
u64 v1 = hud::memory::unaligned_load64(string + pos);
76-
// std::memcpy(&v1, string + pos, sizeof(u64));
77-
u64 v2 = hud::memory::unaligned_load64(string + pos + sizeof(u64));
78-
// std::memcpy(&v2, string + pos + sizeof(u64), sizeof(u64));
72+
// Optimization step:
73+
// If the next 16 bytes are guaranteed to be ASCII (all < 128),
74+
// we can skip them all at once instead of checking byte by byte.
75+
usize next_pos = pos + 16;
76+
if (next_pos <= byte_count) { // Make sure we don't read past the buffer
77+
u64 v1 = hud::memory::unaligned_load64(string + pos); // load first 8 bytes
78+
u64 v2 = hud::memory::unaligned_load64(string + pos + sizeof(u64)); // load next 8 bytes
79+
// Bitwise OR combines both 8-byte blocks so we only need a single mask test below.
80+
// If any byte in v1 or v2 has its high bit set (>= 0x80, non-ASCII),
81+
// the result will also have that bit set. This lets us quickly check
82+
// if all 16 bytes are ASCII with one comparison instead of two.
7983
u64 v {v1 | v2};
8084
if ((v & 0x8080808080808080) == 0) {
81-
pos = next_pos;
85+
pos = next_pos; // all 16 bytes are ASCII → skip them at once
8286
continue;
8387
}
8488
}
89+
90+
// Now process byte by byte
8591
unsigned char byte = string[pos];
8692

87-
while (byte < 0b10000000) {
93+
// Consume consecutive ASCII bytes.
94+
// This inner loop skips multiple ASCII chars in a row efficiently.
95+
while ((byte & 0x80) == 0) {
8896
if (++pos == byte_count) {
8997
return true;
9098
}
9199
byte = string[pos];
92100
}
93101

102+
// Case: 2-byte sequence -> 110xxxxx 10xxxxxx
103+
// If we catch leading byte 110xxxxx
94104
if ((byte & 0b11100000) == 0b11000000) {
105+
106+
// Jump to next supposed code point (after 110xxxxx 10xxxxxx)
107+
// If that position is past the end of the buffer, the continuation byte 10xxxxxx is missing
95108
next_pos = pos + 2;
96109
if (next_pos > byte_count) {
97110
return false;
98111
}
112+
// Ensure 1st continuation byte is 10xxxxxx
99113
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
100114
return false;
101115
}
102-
// range check
116+
// Read the code point
103117
code_point = (byte & 0b00011111) << 6 | (string[pos + 1] & 0b00111111);
118+
// Ensure code point is in [0x80, 0x7FF] aka [U+0080, U+07FF] (anything below 0x80 is an overlong encoding)
104119
if ((code_point < 0x80) || (0x7ff < code_point)) {
105120
return false;
106121
}
107122
}
123+
// Case: 3-byte sequence -> 1110xxxx 10xxxxxx 10xxxxxx
124+
// If we catch leading byte 1110xxxx
108125
else if ((byte & 0b11110000) == 0b11100000) {
126+
127+
// Jump to next supposed code point (after 1110xxxx 10xxxxxx 10xxxxxx)
128+
// If that position is past the end of the buffer, the continuation bytes 10xxxxxx 10xxxxxx are missing
109129
next_pos = pos + 3;
110130
if (next_pos > byte_count) {
111131
return false;
112132
}
133+
// Ensure 1st continuation byte is 10xxxxxx
113134
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
114135
return false;
115136
}
137+
// Ensure 2nd continuation byte is 10xxxxxx
116138
if ((string[pos + 2] & 0b11000000) != 0b10000000) {
117139
return false;
118140
}
119-
// range check
141+
// Read the code point
120142
code_point = (byte & 0b00001111) << 12 | (string[pos + 1] & 0b00111111) << 6 | (string[pos + 2] & 0b00111111);
143+
// Check code point valid value
144+
// - must not be overlong encoding (< 0x800 is invalid)
145+
// - must be [0x0800, 0xFFFF] aka [U+0800, U+FFFF]
146+
// - must not be in surrogate range [0xD800, 0xDFFF] aka [U+D800, U+DFFF]
121147
if ((code_point < 0x800) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000)) {
122148
return false;
123149
}
124150
}
125-
else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
151+
// Case: 4-byte sequence -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
152+
// If we catch leading byte 11110xxx
153+
else if ((byte & 0b11111000) == 0b11110000) {
154+
// Jump to next supposed code point (after 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
155+
// If that position is past the end of the buffer, the continuation bytes 10xxxxxx 10xxxxxx 10xxxxxx are missing
126156
next_pos = pos + 4;
127157
if (next_pos > byte_count) {
128158
return false;
129159
}
160+
// Ensure 1st continuation byte is 10xxxxxx
130161
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
131162
return false;
132163
}
164+
// Ensure 2nd continuation byte is 10xxxxxx
133165
if ((string[pos + 2] & 0b11000000) != 0b10000000) {
134166
return false;
135167
}
168+
// Ensure 3rd continuation byte is 10xxxxxx
136169
if ((string[pos + 3] & 0b11000000) != 0b10000000) {
137170
return false;
138171
}
139-
// range check
140-
code_point =
141-
(byte & 0b00000111) << 18 | (string[pos + 1] & 0b00111111) << 12 | (string[pos + 2] & 0b00111111) << 6 | (string[pos + 3] & 0b00111111);
172+
// Read the code point
173+
code_point = (byte & 0b00000111) << 18 | (string[pos + 1] & 0b00111111) << 12 | (string[pos + 2] & 0b00111111) << 6 | (string[pos + 3] & 0b00111111);
174+
// Check code point valid value
175+
// - must be > 0xFFFF (otherwise it's overlong)
176+
// - must not exceed Unicode max (0x10FFFF)
142177
if (code_point <= 0xffff || 0x10ffff < code_point) {
143178
return false;
144179
}
145180
}
146181
else {
147-
// we may have a continuation
182+
// Any other pattern is invalid:
183+
// e.g. a continuation byte without a proper leading byte
148184
return false;
149185
}
186+
// Move to the next character after validating the current one
150187
pos = next_pos;
151188
}
152189
return true;
153190
}
154191

192+
[[nodiscard]] static bool is_valid_utf8_sse() noexcept;
155193
/**
156194
* Test whether wide null-terminated string contains only pure ansi characters, checking string_size is not bigger than length of the string.
157195
* @param string The null-terminated string

src/string/cstring.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,9 @@
22
#include <core/memory.h>
33
namespace hud
44
{
5+
bool cstring::is_valid_utf8_sse() noexcept
6+
{
7+
return false;
8+
}
59

610
} // namespace hud

test/cstring.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -66,54 +66,54 @@ GTEST_TEST(cstring, is_ascii)
6666
}
6767
}
6868

69-
GTEST_TEST(cstring, is_vaid_utf8)
69+
GTEST_TEST(cstring, is_valid_utf8_generic)
7070
{
7171
// Test with values from
7272
// https://github.com/lemire/unicode_lipsum
7373
const auto test_latin = []() {
7474
// asm volatile("nop");
7575
const char8 latin_lipsum[] = LATIN_LIPSUM;
76-
return hud::cstring::is_valid_utf8(LATIN_LIPSUM, sizeof(latin_lipsum));
76+
return hud::cstring::is_valid_utf8_generic(LATIN_LIPSUM, sizeof(latin_lipsum));
7777
};
7878
const auto test_russian = []() {
7979
// asm volatile("nop");
8080
const char8 russian_lipsum[] = RUSSIAN_LIPSUM;
81-
return hud::cstring::is_valid_utf8(RUSSIAN_LIPSUM, sizeof(russian_lipsum));
81+
return hud::cstring::is_valid_utf8_generic(RUSSIAN_LIPSUM, sizeof(russian_lipsum));
8282
};
8383
const auto test_korean = []() {
8484
// asm volatile("nop");
8585
const char8 korean_lipsum[] = KOREAN_LIPSUM;
86-
return hud::cstring::is_valid_utf8(KOREAN_LIPSUM, sizeof(korean_lipsum));
86+
return hud::cstring::is_valid_utf8_generic(KOREAN_LIPSUM, sizeof(korean_lipsum));
8787
};
8888
const auto test_japanese = []() {
8989
// asm volatile("nop");
9090
const char8 japanese_lipsum[] = JAPANESE_LIPSUM;
91-
return hud::cstring::is_valid_utf8(JAPANESE_LIPSUM, sizeof(japanese_lipsum));
91+
return hud::cstring::is_valid_utf8_generic(JAPANESE_LIPSUM, sizeof(japanese_lipsum));
9292
};
9393
const auto test_hindi = []() {
9494
// asm volatile("nop");
9595
const char8 hindi_lipsum[] = HINDI_LIPSUM;
96-
return hud::cstring::is_valid_utf8(HINDI_LIPSUM, sizeof(hindi_lipsum));
96+
return hud::cstring::is_valid_utf8_generic(HINDI_LIPSUM, sizeof(hindi_lipsum));
9797
};
9898
const auto test_hebrew = []() {
9999
// asm volatile("nop");
100100
const char8 hebrew_lipsum[] = HEBREW_LIPSUM;
101-
return hud::cstring::is_valid_utf8(HEBREW_LIPSUM, sizeof(hebrew_lipsum));
101+
return hud::cstring::is_valid_utf8_generic(HEBREW_LIPSUM, sizeof(hebrew_lipsum));
102102
};
103103
const auto test_emoji = []() {
104104
// asm volatile("nop");
105105
const char8 emoji_lipsum[] = EMOJI_LIPSUM;
106-
return hud::cstring::is_valid_utf8(EMOJI_LIPSUM, sizeof(emoji_lipsum));
106+
return hud::cstring::is_valid_utf8_generic(EMOJI_LIPSUM, sizeof(emoji_lipsum));
107107
};
108108
const auto test_chinese = []() {
109109
// asm volatile("nop");
110110
const char8 chinese_lipsum[] = CHINESE_LIPSUM;
111-
return hud::cstring::is_valid_utf8(CHINESE_LIPSUM, sizeof(chinese_lipsum));
111+
return hud::cstring::is_valid_utf8_generic(CHINESE_LIPSUM, sizeof(chinese_lipsum));
112112
};
113113
const auto test_arabic = []() {
114114
// asm volatile("nop");
115115
const char8 arabic_lipsum[] = ARABIC_LIPSUM;
116-
return hud::cstring::is_valid_utf8(ARABIC_LIPSUM, sizeof(arabic_lipsum));
116+
return hud::cstring::is_valid_utf8_generic(ARABIC_LIPSUM, sizeof(arabic_lipsum));
117117
};
118118
// Non constant
119119
{

0 commit comments

Comments
 (0)