Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 57 additions & 6 deletions libc/src/__support/wchar/character_converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,64 @@ bool CharacterConverter::isComplete() {
return state->bytes_processed == state->total_bytes;
}

// Placeholder definition with an empty body: control reaches the closing brace
// without returning the declared int.
int CharacterConverter::push(char8_t utf8_byte) {}

// Placeholder definition with an empty body: control reaches the closing brace
// without returning the declared int.
int CharacterConverter::push(char32_t utf32) {}

// Placeholder definition with an empty body: control reaches the closing brace
// without returning the declared utf_ret<char8_t>.
utf_ret<char8_t> CharacterConverter::pop_utf8() {}
// Feeds one UTF-8 code unit into the conversion state.
//
// On a fresh state (bytes_processed == 0 and total_bytes == 0) the byte is
// treated as a lead byte: its high bits select the sequence length (1-4 bytes)
// and its payload bits seed state->partial. On any later call the byte must be
// a continuation byte (10xxxxxx); its low six bits are appended to
// state->partial.
//
// Returns 0 when the byte is accepted and -1 when it is rejected.
int CharacterConverter::push(char8_t utf8_byte) {
  // First push: classify the lead byte.
  if (state->bytes_processed == 0 && state->total_bytes == 0) {
    char8_t payload_mask = 0;
    if ((utf8_byte & 0x80) == 0x00) {
      // 0xxxxxxx: 1 byte total
      state->total_bytes = 1;
      payload_mask = 0x7F;
    } else if ((utf8_byte & 0xE0) == 0xC0) {
      // 110xxxxx: 2 bytes total
      state->total_bytes = 2;
      payload_mask = 0x1F;
    } else if ((utf8_byte & 0xF0) == 0xE0) {
      // 1110xxxx: 3 bytes total
      state->total_bytes = 3;
      payload_mask = 0x0F;
    } else if ((utf8_byte & 0xF8) == 0xF0) {
      // 11110xxx: 4 bytes total
      state->total_bytes = 4;
      payload_mask = 0x07;
    } else {
      // Not a lead byte. The counter is still advanced, so the state no longer
      // looks fresh and later pushes are not re-classified as lead bytes.
      state->bytes_processed++;
      return -1;
    }
    state->bytes_processed = 1;
    state->partial = static_cast<char32_t>(utf8_byte & payload_mask);
    return 0;
  }

  // Every byte announced by the lead byte has already been consumed (or no
  // lead byte was ever accepted). Accepting more input here would shift extra
  // bits into state->partial and move bytes_processed past total_bytes, so the
  // byte is rejected and the state is left untouched.
  if (state->bytes_processed >= state->total_bytes)
    return -1;

  // Any subsequent push: expect a continuation byte and append its six
  // payload bits below the bits collected so far.
  if ((utf8_byte & 0xC0) == 0x80) {
    const char32_t payload = static_cast<char32_t>(utf8_byte & 0x3F);
    state->partial = (state->partial << 6) | payload;
    state->bytes_processed++;
    return 0;
  }

  // Wrong bit pattern inside a sequence: the position is still counted.
  state->bytes_processed++;
  return -1;
}

// Placeholder definition with an empty body: control reaches the closing brace
// without returning the declared utf_ret<char32_t>.
utf_ret<char32_t> CharacterConverter::pop_utf32() {}
// Returns the UTF-32 value accumulated in state->partial.
//
// `out` always carries the current state->partial. `error` is 0 when
// isComplete() holds (bytes_processed == total_bytes) and -1 when the call
// comes before every byte of the sequence has been pushed, so a half-decoded
// value is never reported as a successful conversion.
// NOTE(review): a fresh state (0 == 0) also counts as complete under
// isComplete(); confirm whether popping from a fresh state should be an error.
utf_ret<char32_t> CharacterConverter::pop_utf32() {
  utf_ret<char32_t> utf32;
  utf32.out = state->partial;
  utf32.error = isComplete() ? 0 : -1;
  return utf32;
}

} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
5 changes: 5 additions & 0 deletions libc/test/src/__support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -275,3 +275,8 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
# The wchar tests require the uchar header, which is not available on macOS,
# so they cannot currently be built on macOS in overlay mode.
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
add_subdirectory(wchar)
endif()
11 changes: 11 additions & 0 deletions libc/test/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Custom target for the __support/wchar tests.
# NOTE(review): the test below is registered under the libc-support-tests
# suite, not under this target -- confirm which SUITE is intended.
add_custom_target(libc-support-wchar-tests)

# Unit test for the UTF-8 -> UTF-32 path of CharacterConverter.
add_libc_test(
utf8_to_32_test
SUITE
libc-support-tests
SRCS
utf8_to_32_test.cpp
DEPENDS
libc.src.__support.wchar.character_converter
)
125 changes: 125 additions & 0 deletions libc/test/src/__support/wchar/utf8_to_32_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
//===-- Unittests for character_converter utf8->32 ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/utf_ret.h"
#include "test/UnitTest/Test.h"

// A single ASCII byte forms a complete sequence and decodes to its own value.
TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;
  using LIBC_NAMESPACE::internal::utf_ret;

  // Start from a fresh conversion state.
  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter char_conv(&state);

  char ascii = 'A';
  int push_status = char_conv.push(static_cast<char8_t>(ascii));
  utf_ret<char32_t> decoded = char_conv.pop_utf32();

  EXPECT_EQ(push_status, 0);
  EXPECT_EQ(decoded.error, 0);
  EXPECT_EQ(static_cast<int>(decoded.out), 65); // U+0041
}

// A two-byte sequence (110xxxxx 10xxxxxx) decodes to U+008E.
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // U+008E as UTF-8. The bytes are written as escapes so the test does not
  // depend on how the source file itself is encoded or decoded.
  const char ch[] = "\xC2\x8E";

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  // Every byte of a well-formed sequence must be accepted.
  ASSERT_EQ(char_conv.push(static_cast<char8_t>(ch[0])), 0);
  ASSERT_EQ(char_conv.push(static_cast<char8_t>(ch[1])), 0);
  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();

  ASSERT_EQ(wch.error, 0);
  ASSERT_EQ(static_cast<int>(wch.out), 142); // 0x8E
}

// A three-byte sequence (1110xxxx 10xxxxxx 10xxxxxx) decodes to U+2211.
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // U+2211 as UTF-8. The bytes are written as escapes so the array is
  // guaranteed to hold all three code units regardless of source encoding.
  const char ch[] = "\xE2\x88\x91";

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  // Every byte of a well-formed sequence must be accepted.
  ASSERT_EQ(char_conv.push(static_cast<char8_t>(ch[0])), 0);
  ASSERT_EQ(char_conv.push(static_cast<char8_t>(ch[1])), 0);
  ASSERT_EQ(char_conv.push(static_cast<char8_t>(ch[2])), 0);
  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();

  ASSERT_EQ(wch.error, 0);
  ASSERT_EQ(static_cast<int>(wch.out), 8721); // 0x2211
}

// A four-byte sequence (11110xxx followed by three 10xxxxxx bytes) decodes to
// U+1F921.
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // U+1F921 as UTF-8. The bytes are written as escapes so the test does not
  // depend on how the source file itself is encoded.
  const char ch[] = "\xF0\x9F\xA4\xA1";

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  // Every byte of a well-formed sequence must be accepted.
  ASSERT_EQ(char_conv.push(static_cast<char8_t>(ch[0])), 0);
  ASSERT_EQ(char_conv.push(static_cast<char8_t>(ch[1])), 0);
  ASSERT_EQ(char_conv.push(static_cast<char8_t>(ch[2])), 0);
  ASSERT_EQ(char_conv.push(static_cast<char8_t>(ch[3])), 0);
  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();

  ASSERT_EQ(wch.error, 0);
  ASSERT_EQ(static_cast<int>(wch.out), 129313); // 0x1F921
}

// A byte with the continuation pattern (10xxxxxx) cannot start a sequence.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  // Start from a fresh conversion state.
  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter char_conv(&state);

  const char stray_continuation = static_cast<char>(0x80);
  int push_status = char_conv.push(static_cast<char8_t>(stray_continuation));

  ASSERT_EQ(push_status, -1);
}

// After an invalid first byte, every following byte here is rejected as well.
// The rejected first push still advances bytes_processed, so the later 0x00
// bytes are not classified as lead bytes, and 0x00 is not a continuation byte.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  // Start from a fresh conversion state.
  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter char_conv(&state);

  const char rejected_bytes[4] = {
      static_cast<char>(0x80), static_cast<char>(0x00),
      static_cast<char>(0x00), static_cast<char>(0x00)};

  for (char byte : rejected_bytes) {
    int push_status = char_conv.push(static_cast<char8_t>(byte));
    ASSERT_EQ(push_status, -1);
  }
}

// A four-byte lead followed by a non-continuation byte: only the bad byte is
// rejected, and the continuation bytes that follow it are still accepted.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  // Start from a fresh conversion state.
  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter char_conv(&state);

  // 0xF1 announces four bytes; 0xC0 lacks the 10xxxxxx continuation pattern.
  const char sequence[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
                            static_cast<char>(0x80), static_cast<char>(0x80)};
  const int expected_status[4] = {0, -1, 0, 0};

  for (int i = 0; i < 4; ++i) {
    int push_status = char_conv.push(static_cast<char8_t>(sequence[i]));
    ASSERT_EQ(push_status, expected_status[i]);
  }
}
Loading