Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 62 additions & 6 deletions libc/src/__support/wchar/character_converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/utf_ret.h"

Expand All @@ -22,13 +23,68 @@ bool CharacterConverter::isComplete() {
return state->bytes_processed == state->total_bytes;
}

// Placeholder definition with an empty body: it touches no state and contains
// no return statement.
// NOTE(review): flowing off the end of a non-void function is undefined
// behavior if this is ever called — confirm it is replaced by a real
// implementation.
int CharacterConverter::push(char8_t utf8_byte) {}

// Placeholder definition of the UTF-32 push overload with an empty body: it
// touches no state and contains no return statement.
// NOTE(review): flowing off the end of a non-void function is undefined
// behavior if this is ever called — confirm an implementation is provided.
int CharacterConverter::push(char32_t utf32) {}

// Placeholder definition with an empty body: it touches no state and contains
// no return statement.
// NOTE(review): flowing off the end of a non-void function is undefined
// behavior if this is ever called — confirm an implementation is provided.
utf_ret<char8_t> CharacterConverter::pop_utf8() {}
// Feeds a single UTF-8 code unit into the conversion state.
//
// Returns 0 if the byte was accepted and -1 if it was rejected.
//
// * With an empty state (no bytes processed, no expected length) the byte is
//   treated as a lead byte: the number of leading one bits selects the total
//   sequence length (0 -> 1 byte, 2 -> 2 bytes, 3 -> 3 bytes, 4 -> 4 bytes)
//   and the payload bits seed `state->partial`. Any other lead byte is
//   rejected and the state is left zeroed.
// * Otherwise the byte must be a continuation byte (exactly one leading one
//   bit) and the character must still be incomplete; its low six bits are
//   appended to `state->partial`.
// * A rejected byte resets the state only while the character is incomplete.
//   If the character is already complete the state is left untouched, so the
//   decoded value can still be retrieved with pop_utf32().
//
// NOTE(review): a reviewer questioned this asymmetry. Once the state is
// complete, every further push returns -1 until pop_utf32() is called, which
// is counterintuitive if "push returned -1" is read as "calling pop is
// invalid". Consider whether pushing into a complete state should also reset.
int CharacterConverter::push(char8_t utf8_byte) {
  // Evaluated once, before any masking of the lead byte below.
  const auto leading_ones = cpp::countl_one(utf8_byte);

  const bool is_first_push =
      state->bytes_processed == 0 && state->total_bytes == 0;
  if (is_first_push) {
    switch (leading_ones) {
    case 0: // 0xxxxxxx: the byte is the whole character.
      state->total_bytes = 1;
      break;
    case 2: // 110xxxxx: keep the low 5 payload bits.
      state->total_bytes = 2;
      utf8_byte &= 0x1F;
      break;
    case 3: // 1110xxxx: keep the low 4 payload bits.
      state->total_bytes = 3;
      utf8_byte &= 0x0F;
      break;
    case 4: // 11110xxx: keep the low 3 payload bits.
      state->total_bytes = 4;
      utf8_byte &= 0x07;
      break;
    default: // Not a valid lead byte: leave the state fully zeroed.
      state->partial = static_cast<char32_t>(0);
      state->bytes_processed = 0;
      state->total_bytes = 0;
      return -1;
    }
    state->partial = static_cast<char32_t>(utf8_byte);
    state->bytes_processed++;
    return 0;
  }

  // A complete character is waiting to be popped: reject the byte but keep
  // the decoded value intact.
  if (isComplete())
    return -1;

  // 10xxxxxx: shift in the six payload bits of a continuation byte.
  if (leading_ones == 1) {
    const char32_t payload = utf8_byte & 0x3F;
    state->partial = (state->partial << 6) | payload;
    state->bytes_processed++;
    return 0;
  }

  // Malformed continuation in the middle of a character: discard everything.
  state->partial = static_cast<char32_t>(0);
  state->bytes_processed = 0;
  state->total_bytes = 0;
  return -1;
}

// Placeholder definition with an empty body: it touches no state and contains
// no return statement.
// NOTE(review): flowing off the end of a non-void function is undefined
// behavior if this is ever called — confirm it is replaced by a real
// implementation.
utf_ret<char32_t> CharacterConverter::pop_utf32() {}
// Hands back the accumulated code point and clears the conversion state.
//
// `out` always carries the current value of `state->partial`. `error` is 0
// when isComplete() reports a complete character and -1 otherwise. The state
// is zeroed in both cases, so a failed pop also discards any partial input.
utf_ret<char32_t> CharacterConverter::pop_utf32() {
  utf_ret<char32_t> result;
  result.out = state->partial;
  // Completeness must be sampled before the counters are cleared below.
  result.error = isComplete() ? 0 : -1;

  state->partial = static_cast<char32_t>(0);
  state->total_bytes = 0;
  state->bytes_processed = 0;
  return result;
}

} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
5 changes: 5 additions & 0 deletions libc/test/src/__support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -275,3 +275,8 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
# The wchar tests require the uchar header, which is not available on macOS,
# so they cannot currently be built there in overlay mode. Skip the directory
# when targeting Darwin.
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
add_subdirectory(wchar)
endif()
11 changes: 11 additions & 0 deletions libc/test/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Umbrella target for the wchar support tests.
# NOTE(review): the test below is registered under SUITE libc-support-tests,
# not libc-support-wchar-tests — confirm which suite is intended.
add_custom_target(libc-support-wchar-tests)

# Unit test for the UTF-8 -> UTF-32 path of the character converter.
add_libc_test(
utf8_to_32_test
SUITE
libc-support-tests
SRCS
utf8_to_32_test.cpp
DEPENDS
libc.src.__support.wchar.character_converter
)
176 changes: 176 additions & 0 deletions libc/test/src/__support/wchar/utf8_to_32_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
//===-- Unittests for character_converter utf8->utf32 ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/utf_ret.h"
#include "test/UnitTest/Test.h"

// A single ASCII byte is a complete character on its own.
TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;
  using LIBC_NAMESPACE::internal::utf_ret;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter converter(&state);

  const char input = 'A';
  const int push_result = converter.push(static_cast<char8_t>(input));
  const utf_ret<char32_t> decoded = converter.pop_utf32();

  EXPECT_EQ(push_result, 0);
  EXPECT_EQ(decoded.error, 0);
  EXPECT_EQ(static_cast<int>(decoded.out), 65);
}

// Decodes a well-formed two-byte sequence, checking that every byte is
// accepted by push() and that pop_utf32() yields the expected code point.
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // 0xC2 0x8E encodes U+008E (decimal 142).
  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  // A rejected byte must fail the test here rather than surface later as a
  // confusing pop error.
  int err = char_conv.push(static_cast<char8_t>(ch[0]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[1]));
  ASSERT_EQ(err, 0);
  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();

  ASSERT_EQ(wch.error, 0);
  ASSERT_EQ(static_cast<int>(wch.out), 142);
}

// Decodes a well-formed three-byte sequence, checking that every byte is
// accepted by push() and that pop_utf32() yields the expected code point.
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // 0xE2 0x88 0x91 encodes U+2211 (∑, decimal 8721).
  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
                      static_cast<char>(0x91)};

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  // A rejected byte must fail the test here rather than surface later as a
  // confusing pop error.
  int err = char_conv.push(static_cast<char8_t>(ch[0]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[1]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[2]));
  ASSERT_EQ(err, 0);
  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();

  ASSERT_EQ(wch.error, 0);
  ASSERT_EQ(static_cast<int>(wch.out), 8721);
}

// Decodes a well-formed four-byte sequence, checking that every byte is
// accepted by push() and that pop_utf32() yields the expected code point.
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // 0xF0 0x9F 0xA4 0xA1 encodes U+1F921 (🤡, decimal 129313).
  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
                      static_cast<char>(0xA4), static_cast<char>(0xA1)};

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  // A rejected byte must fail the test here rather than surface later as a
  // confusing pop error.
  int err = char_conv.push(static_cast<char8_t>(ch[0]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[1]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[2]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[3]));
  ASSERT_EQ(err, 0);
  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();

  ASSERT_EQ(wch.error, 0);
  ASSERT_EQ(static_cast<int>(wch.out), 129313);
}

// A byte of the form 10xxxxxx cannot start a character, so the very first
// push must be rejected.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter converter(&state);

  const char bad_lead = static_cast<char>(0x80);
  const int push_result = converter.push(static_cast<char8_t>(bad_lead));

  ASSERT_EQ(push_result, -1);
}

// Interleaves rejected and accepted bytes without ever popping.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter converter(&state);

  const char bytes[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
                         static_cast<char>(0x80), static_cast<char>(0x00)};

  // 0x80 is not a valid lead byte.
  int result = converter.push(static_cast<char8_t>(bytes[0]));
  ASSERT_EQ(result, -1);

  // 0x00 is a complete one-byte character.
  result = converter.push(static_cast<char8_t>(bytes[1]));
  ASSERT_EQ(result, 0);

  // The previous character is complete and has not been popped, so any
  // further push is rejected, whatever the byte value.
  result = converter.push(static_cast<char8_t>(bytes[2]));
  ASSERT_EQ(result, -1);
  result = converter.push(static_cast<char8_t>(bytes[3]));
  ASSERT_EQ(result, -1);
}

// A four-byte sequence whose final byte is not a continuation byte: the first
// three pushes succeed and the last one is rejected.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter converter(&state);

  const char bytes[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
                         static_cast<char>(0x80), static_cast<char>(0xC0)};

  // Lead byte plus two valid continuation bytes.
  for (int i = 0; i < 3; ++i) {
    const int accepted = converter.push(static_cast<char8_t>(bytes[i]));
    ASSERT_EQ(accepted, 0);
  }

  // 0xC0 has two leading one bits, so it is not a continuation byte.
  const int rejected = converter.push(static_cast<char8_t>(bytes[3]));
  ASSERT_EQ(rejected, -1);
}

// Pushing an extra byte after a complete two-byte character must fail without
// destroying the already decoded value.
TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;
  using LIBC_NAMESPACE::internal::utf_ret;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter converter(&state);

  const char bytes[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
                         static_cast<char>(0x80)};

  // The two-byte character itself is accepted.
  int result = converter.push(static_cast<char8_t>(bytes[0]));
  ASSERT_EQ(result, 0);
  result = converter.push(static_cast<char8_t>(bytes[1]));
  ASSERT_EQ(result, 0);

  // The third byte has nowhere to go and is rejected.
  result = converter.push(static_cast<char8_t>(bytes[2]));
  ASSERT_EQ(result, -1);

  // The completed character is still available afterwards.
  const utf_ret<char32_t> decoded = converter.pop_utf32();
  ASSERT_EQ(decoded.error, 0);
  ASSERT_EQ(static_cast<int>(decoded.out), 142);
}

// Decodes two consecutive two-byte characters through the same converter,
// popping after each one.
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;
  using LIBC_NAMESPACE::internal::utf_ret;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter converter(&state);

  const char bytes[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
                         static_cast<char>(0xC7), static_cast<char>(0x8C)};

  // First character: 0xC2 0x8E -> 142.
  int result = converter.push(static_cast<char8_t>(bytes[0]));
  ASSERT_EQ(result, 0);
  result = converter.push(static_cast<char8_t>(bytes[1]));
  ASSERT_EQ(result, 0);
  utf_ret<char32_t> decoded = converter.pop_utf32();
  ASSERT_EQ(decoded.error, 0);
  ASSERT_EQ(static_cast<int>(decoded.out), 142);

  // Second character: 0xC7 0x8C -> 460. The pop above cleared the state, so
  // the converter starts afresh.
  result = converter.push(static_cast<char8_t>(bytes[2]));
  ASSERT_EQ(result, 0);
  result = converter.push(static_cast<char8_t>(bytes[3]));
  ASSERT_EQ(result, 0);
  decoded = converter.pop_utf32();
  ASSERT_EQ(decoded.error, 0);
  ASSERT_EQ(static_cast<int>(decoded.out), 460);
}
Loading