Add CharacterConverter::isValidState() to the libc wchar character converter.

Summary of the changes carried by this patch:
  * character_converter.cpp: MAX_VALUE_PER_UTF8_LEN moves from a local inside
    push(char32_t) to file scope, and MAX_UTF8_LENGTH replaces the local
    NUM_RANGES, so push(char32_t) and the new check share one table.
  * isValidState() returns false when total_bytes exceeds MAX_UTF8_LENGTH,
    when bytes_stored exceeds total_bytes, or when partial exceeds the
    largest UTF-32 value encodable in total_bytes UTF-8 bytes (0 when
    total_bytes is 0); otherwise it returns true.
  * character_converter.h: declares isValidState().
  * utf32_to_8_test.cpp: adds the InvalidState test covering each boundary.

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 1f81de4248ff0..c54a1b751f402 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -25,6 +25,9 @@ constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
 // Information not metadata (# of bits excluding the byte headers)
 constexpr uint32_t MASK_ENCODED_BITS =
     mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+// Maximum value for utf-32 for a utf-8 sequence of a given length
+constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
+constexpr int MAX_UTF8_LENGTH = 4;
 
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
@@ -40,6 +43,17 @@ bool CharacterConverter::isFull() {
 
 bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
 
+bool CharacterConverter::isValidState() {
+  if (state->total_bytes > MAX_UTF8_LENGTH)
+    return false;
+
+  const char32_t max_utf32_value =
+      state->total_bytes == 0 ? 0
+                              : MAX_VALUE_PER_UTF8_LEN[state->total_bytes - 1];
+  return state->bytes_stored <= state->total_bytes &&
+         state->partial <= max_utf32_value;
+}
+
 int CharacterConverter::push(char8_t utf8_byte) {
   uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
   // Checking the first byte if first push
@@ -90,9 +104,7 @@ int CharacterConverter::push(char32_t utf32) {
   state->partial = utf32;
 
   // determine number of utf-8 bytes needed to represent this utf32 value
-  constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
-  constexpr int NUM_RANGES = 4;
-  for (uint8_t i = 0; i < NUM_RANGES; i++) {
+  for (uint8_t i = 0; i < MAX_UTF8_LENGTH; i++) {
     if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
       state->total_bytes = i + 1;
       state->bytes_stored = i + 1;
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index be0e6129df236..d9a63fdc0522c 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -28,6 +28,7 @@ class CharacterConverter {
   void clear();
   bool isFull();
   bool isEmpty();
+  bool isValidState();
 
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index a6a7bc4aa6f4c..1ad523e148845 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -186,3 +186,45 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
   int err = cr.push(utf32);
   ASSERT_EQ(err, -1);
 }
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, InvalidState) {
+  LIBC_NAMESPACE::internal::mbstate s1;
+  LIBC_NAMESPACE::internal::CharacterConverter c1(&s1);
+  ASSERT_TRUE(c1.isValidState());
+
+  LIBC_NAMESPACE::internal::mbstate s2{0, 2, 0};
+  LIBC_NAMESPACE::internal::CharacterConverter c2(&s2);
+  ASSERT_FALSE(c2.isValidState());
+
+  LIBC_NAMESPACE::internal::mbstate s3{0x7f, 1, 1};
+  LIBC_NAMESPACE::internal::CharacterConverter c3(&s3);
+  ASSERT_TRUE(c3.isValidState());
+  LIBC_NAMESPACE::internal::mbstate s4{0x80, 1, 1};
+  LIBC_NAMESPACE::internal::CharacterConverter c4(&s4);
+  ASSERT_FALSE(c4.isValidState());
+
+  LIBC_NAMESPACE::internal::mbstate s5{0x7ff, 1, 2};
+  LIBC_NAMESPACE::internal::CharacterConverter c5(&s5);
+  ASSERT_TRUE(c5.isValidState());
+  LIBC_NAMESPACE::internal::mbstate s6{0x800, 1, 2};
+  LIBC_NAMESPACE::internal::CharacterConverter c6(&s6);
+  ASSERT_FALSE(c6.isValidState());
+
+  LIBC_NAMESPACE::internal::mbstate s7{0xffff, 1, 3};
+  LIBC_NAMESPACE::internal::CharacterConverter c7(&s7);
+  ASSERT_TRUE(c7.isValidState());
+  LIBC_NAMESPACE::internal::mbstate s8{0x10000, 1, 3};
+  LIBC_NAMESPACE::internal::CharacterConverter c8(&s8);
+  ASSERT_FALSE(c8.isValidState());
+
+  LIBC_NAMESPACE::internal::mbstate s9{0x10ffff, 1, 4};
+  LIBC_NAMESPACE::internal::CharacterConverter c9(&s9);
+  ASSERT_TRUE(c9.isValidState());
+  LIBC_NAMESPACE::internal::mbstate s10{0x110000, 1, 2};
+  LIBC_NAMESPACE::internal::CharacterConverter c10(&s10);
+  ASSERT_FALSE(c10.isValidState());
+
+  LIBC_NAMESPACE::internal::mbstate s11{0, 0, 5};
+  LIBC_NAMESPACE::internal::CharacterConverter c11(&s11);
+  ASSERT_FALSE(c11.isValidState());
+}