Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions libc/src/__support/wchar/character_converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/math_extras.h"
Expand All @@ -30,6 +31,50 @@ bool CharacterConverter::isComplete() {
return state->bytes_processed == state->total_bytes;
}

int CharacterConverter::push(char8_t utf8_byte) {
uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
// Checking the first byte if first push
if (state->bytes_processed == 0) {
// UTF-8 char has 1 byte total
if (num_ones == 0) {
state->total_bytes = 1;
}
// UTF-8 char has 2 through 4 bytes total
else if (num_ones >= 2 && num_ones <= 4) {
/* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
we will make the base mask with 7 ones and right shift it as necessary. */
constexpr size_t SIGNIFICANT_BITS = 7;
char8_t base_mask =
static_cast<char8_t>(mask_trailing_ones<uint8_t, SIGNIFICANT_BITS>());
state->total_bytes = num_ones;
utf8_byte &= (base_mask >> num_ones);
}
// Invalid first byte
else {
// bytes_processed and total_bytes will always be 0 here
state->partial = static_cast<char32_t>(0);
return -1;
}
state->partial = static_cast<char32_t>(utf8_byte);
state->bytes_processed++;
return 0;
}
// Any subsequent push
// Adding 6 more bits so need to left shift
constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
if (num_ones == 1 && !isComplete()) {
char32_t byte =
utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
state->partial = state->partial << ENCODED_BITS_PER_UTF8;
state->partial |= byte;
state->bytes_processed++;
return 0;
}
// Invalid byte -> reset the state
clear();
return -1;
}

int CharacterConverter::push(char32_t utf32) {
// we can't be partially through a conversion when pushing a utf32 value
if (!isComplete())
Expand All @@ -54,6 +99,17 @@ int CharacterConverter::push(char32_t utf32) {
return -1;
}

ErrorOr<char32_t> CharacterConverter::pop_utf32() {
// If pop is called too early, do not reset the state, use error to determine
// whether enough bytes have been pushed
if (!isComplete() || state->bytes_processed == 0)
return Error(-1);
char32_t utf32 = state->partial;
// reset if successful pop
clear();
return utf32;
}

ErrorOr<char8_t> CharacterConverter::pop_utf8() {
if (isComplete())
return Error(-1);
Expand Down
5 changes: 2 additions & 3 deletions libc/test/src/__support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -275,9 +275,8 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)

# Requires access to uchar header which is not on macos
# Therefore, cannot currently build this on macos in overlay mode
# Requires access to uchar header which is not on MacOS
# Cannot currently build this on MacOS in overlay mode
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
add_subdirectory(wchar)
endif()
10 changes: 10 additions & 0 deletions libc/test/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
add_custom_target(libc-support-wchar-tests)

add_libc_test(
utf8_to_32_test
SUITE
libc-support-tests
SRCS
utf8_to_32_test.cpp
DEPENDS
libc.src.__support.wchar.character_converter
)

add_libc_test(
utf32_to_8_test
SUITE
Expand Down
196 changes: 196 additions & 0 deletions libc/test/src/__support/wchar/utf8_to_32_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
//===-- Unittests for character_converter utf8->utf32 ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/error_or.h"
#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"
#include "test/UnitTest/Test.h"

TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
char ch = 'A';

LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch));
auto wch = char_conv.pop_utf32();

ASSERT_EQ(err, 0);
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 65);
}

TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[2] = {static_cast<char>(0xC2),
static_cast<char>(0x8E)}; // Ž car symbol

LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
char_conv.push(static_cast<char8_t>(ch[1]));
auto wch = char_conv.pop_utf32();

ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 142);
}

TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
static_cast<char>(0x91)}; // ∑ sigma symbol

LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
char_conv.push(static_cast<char8_t>(ch[1]));
char_conv.push(static_cast<char8_t>(ch[2]));
auto wch = char_conv.pop_utf32();

ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 8721);
}

TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
static_cast<char>(0xA4),
static_cast<char>(0xA1)}; // 🤡 clown emoji

LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
char_conv.push(static_cast<char8_t>(ch[1]));
char_conv.push(static_cast<char8_t>(ch[2]));
char_conv.push(static_cast<char8_t>(ch[3]));
auto wch = char_conv.pop_utf32();

ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 129313);
}

TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch = static_cast<char>(0x80); // invalid starting bit sequence

LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch));

ASSERT_EQ(err, -1);
}

TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[4] = {
static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
static_cast<char>(0x00)}; // first and third bytes are invalid

LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, -1);
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
// Prev byte was single byte so trying to push another should error.
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, -1);
err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, 0);
}

TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
// Last byte is invalid since it does not have correct starting sequence.
// 0xC0 --> 11000000 starting sequence should be 10xxxxxx
const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
static_cast<char>(0x80), static_cast<char>(0xC0)};

LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, -1);
}

TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
static_cast<char>(0x80)};

LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
// Should produce an error on 3rd byte
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, -1);

// Should produce an error since mbstate was reset
auto wch = char_conv.pop_utf32();
ASSERT_FALSE(wch.has_value());
}

TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
static_cast<char>(0xC7), static_cast<char>(0x8C)};

LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
auto wch = char_conv.pop_utf32();
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 142);

// Second two byte character
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, 0);
wch = char_conv.pop_utf32();
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 460);
}

TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
auto wch = char_conv.pop_utf32();
ASSERT_FALSE(
wch.has_value()); // Should fail since we have not read enough bytes
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
wch = char_conv.pop_utf32();
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 142);
}
Loading