Skip to content

Commit 9561ab5

Browse files
author
Sriya Pratipati
committed
[libc] CharacterConverter utf8 to 32 push and pop
Implemented push and pop for utf8 to 32 conversion and tests.
1 parent 8a8ea8f commit 9561ab5

File tree

4 files changed

+203
-4
lines changed

4 files changed

+203
-4
lines changed

libc/src/__support/wchar/character_converter.cpp

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,75 @@ bool CharacterConverter::isComplete() {
2222
return state->bytes_processed == state->total_bytes;
2323
}
2424

25-
int CharacterConverter::push(char8_t utf8_byte) {}
25+
// Feeds one byte of a UTF-8 sequence into the conversion state.
//
// The first byte pushed into an empty state (bytes_processed == 0 and
// total_bytes == 0) is treated as the lead byte: it fixes total_bytes and
// seeds state->partial with its payload bits. Every later byte has to be a
// continuation byte (10xxxxxx); its six payload bits are appended to
// state->partial.
//
// Returns 0 if the byte was accepted and -1 if it was rejected. A byte is
// rejected when it is not a valid lead byte, when it is not a continuation
// byte where one is required, or when the state has no byte pending (the
// sequence is already complete, or it never had a valid lead byte).
//
// NOTE(review): overlong encodings, surrogates and values above U+10FFFF are
// not diagnosed here.
int CharacterConverter::push(char8_t utf8_byte) {
  // Checking the first byte if first push
  if (state->bytes_processed == 0 && state->total_bytes == 0) {
    // Mask selecting the payload bits of the lead byte.
    int payload_mask = 0;
    if ((utf8_byte & 0x80) == 0x00) {
      // 0xxxxxxx: 1 byte total
      state->total_bytes = 1;
      payload_mask = 0x7F;
    } else if ((utf8_byte & 0xE0) == 0xC0) {
      // 110xxxxx: 2 bytes total
      state->total_bytes = 2;
      payload_mask = 0x1F;
    } else if ((utf8_byte & 0xF0) == 0xE0) {
      // 1110xxxx: 3 bytes total
      state->total_bytes = 3;
      payload_mask = 0x0F;
    } else if ((utf8_byte & 0xF8) == 0xF0) {
      // 11110xxx: 4 bytes total
      state->total_bytes = 4;
      payload_mask = 0x07;
    } else {
      // Invalid lead byte. The byte is still counted, so the state no longer
      // looks empty and later pushes are not mistaken for a first push.
      state->bytes_processed++;
      return -1;
    }
    state->bytes_processed = 1;
    state->partial = static_cast<char32_t>(utf8_byte & payload_mask);
    return 0;
  }

  // Any subsequent push.
  // Nothing is pending when the sequence already holds all of its bytes, or
  // when the lead byte was rejected (total_bytes stayed 0). Accepting a byte
  // here would shift extra bits into a finished (or never initialized)
  // partial value, so reject it and leave the state untouched.
  if (state->bytes_processed >= state->total_bytes)
    return -1;

  if ((utf8_byte & 0xC0) == 0x80) {
    // Continuation byte: make room for, then append, its six payload bits.
    state->partial =
        (state->partial << 6) | static_cast<char32_t>(utf8_byte & 0x3F);
    state->bytes_processed++;
    return 0;
  }

  // Not a continuation byte although one was expected.
  state->bytes_processed++;
  return -1;
}
2676

27-
int CharacterConverter::push(char32_t utf32) {}
77+
// Placeholder for the UTF-32 input path: no conversion state is read or
// written; the argument is simply handed back converted to int.
int CharacterConverter::push(char32_t utf32) {
  return static_cast<int>(utf32);
}
2880

29-
utf_ret<char8_t> CharacterConverter::pop_utf8() {}
81+
// Placeholder for the UTF-8 output path: always yields a zero byte together
// with a zero error code and does not consult the conversion state.
utf_ret<char8_t> CharacterConverter::pop_utf8() {
  utf_ret<char8_t> result;
  result.out = 0;
  result.error = 0;
  return result;
}
3087

31-
utf_ret<char32_t> CharacterConverter::pop_utf32() {}
88+
// Returns the UTF-32 code point assembled from the bytes pushed so far.
//
// On success `error` is 0 and `out` holds state->partial. If no complete
// character is available (nothing was pushed, the sequence is still missing
// bytes, or a byte was rejected so the counters no longer match), `error` is
// -1 and `out` is 0, so callers never see a half-built value.
utf_ret<char32_t> CharacterConverter::pop_utf32() {
  utf_ret<char32_t> utf32;
  utf32.error = 0;
  utf32.out = 0;

  // total_bytes == 0 means no valid lead byte has been seen, so
  // state->partial carries no meaningful value yet.
  if (state->total_bytes == 0 ||
      state->bytes_processed != state->total_bytes) {
    utf32.error = -1;
    return utf32;
  }

  utf32.out = state->partial;
  return utf32;
}
3294

3395
} // namespace internal
3496
} // namespace LIBC_NAMESPACE_DECL

libc/test/src/__support/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,3 +275,4 @@ add_subdirectory(fixed_point)
275275
add_subdirectory(HashTable)
276276
add_subdirectory(time)
277277
add_subdirectory(threads)
278+
add_subdirectory(wchar)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
add_custom_target(libc-support-wchar-tests)
2+
3+
add_libc_test(
4+
utf8_to_32_test
5+
SUITE
6+
libc-support-tests
7+
SRCS
8+
utf8_to_32_test.cpp
9+
DEPENDS
10+
libc.src.__support.wchar.character_converter
11+
)
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
//===-- Unittests for character_converter utf8->32 ------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/wchar/character_converter.h"
10+
#include "src/__support/wchar/mbstate.h"
11+
#include "src/__support/wchar/utf_ret.h"
12+
#include "test/UnitTest/Test.h"
13+
14+
// A single ASCII byte is a complete character and converts to its own value.
TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;
  using LIBC_NAMESPACE::internal::utf_ret;

  // Start from an empty conversion state.
  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter char_conv(&state);

  char ascii = 'A';
  int err = char_conv.push(static_cast<char8_t>(ascii));
  utf_ret<char32_t> wch = char_conv.pop_utf32();

  EXPECT_EQ(err, 0);
  EXPECT_EQ(wch.error, 0);
  EXPECT_EQ(static_cast<int>(wch.out), 65);
}
28+
29+
// A two-byte sequence (0xC2 0x8E) decodes to U+008E (decimal 142).
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // Spelled out as byte values so the input does not depend on how the
  // source file or the compiler's execution character set encodes a literal.
  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  int err = char_conv.push(static_cast<char8_t>(ch[0]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[1]));
  ASSERT_EQ(err, 0);
  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();

  ASSERT_EQ(wch.error, 0);
  ASSERT_EQ(static_cast<int>(wch.out), 142);
}
43+
44+
// A three-byte sequence (0xE2 0x88 0x91) decodes to U+2211 (decimal 8721).
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // Explicit byte values: a string literal here is at the mercy of the file
  // encoding, and an empty or mangled literal would make ch[1] and ch[2]
  // read past the end of the string.
  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
                      static_cast<char>(0x91)};

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  int err = char_conv.push(static_cast<char8_t>(ch[0]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[1]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[2]));
  ASSERT_EQ(err, 0);
  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();

  ASSERT_EQ(wch.error, 0);
  ASSERT_EQ(static_cast<int>(wch.out), 8721);
}
59+
60+
// A four-byte sequence decodes to a code point outside the BMP
// (U+1F921, decimal 129313).
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;
  using LIBC_NAMESPACE::internal::utf_ret;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  const char *ch = "🤡"; // hex 0xF0, 0x9F, 0xA4, 0xA1

  CharacterConverter char_conv(&state);
  // Feed the four bytes in order; the push results are not inspected here.
  for (int i = 0; i < 4; ++i)
    char_conv.push(static_cast<char8_t>(ch[i]));
  utf_ret<char32_t> wch = char_conv.pop_utf32();

  ASSERT_EQ(wch.error, 0);
  ASSERT_EQ(static_cast<int>(wch.out), 129313);
}
76+
77+
// A continuation byte (10xxxxxx) cannot start a character, so the very first
// push has to be rejected.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  CharacterConverter char_conv(&state);

  const char stray_continuation = static_cast<char>(0x80);
  int err = char_conv.push(static_cast<char8_t>(stray_continuation));

  ASSERT_EQ(err, -1);
}
88+
89+
// After an invalid lead byte, every following push on the same state is
// rejected as well.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // 0x80 is not a valid lead byte; the zero bytes that follow are then
  // rejected because they are not continuation bytes.
  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
                      static_cast<char>(0x00), static_cast<char>(0x00)};

  CharacterConverter char_conv(&state);
  for (int i = 0; i < 4; ++i) {
    int err = char_conv.push(static_cast<char8_t>(ch[i]));
    ASSERT_EQ(err, -1);
  }
}
107+
108+
// A bad byte in the middle of a four-byte sequence is rejected, while the
// well-formed continuation bytes after it are still accepted.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // Four-byte lead, then 0xC0 where a continuation byte is required.
  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
                      static_cast<char>(0x80), static_cast<char>(0x80)};
  // Expected push result for each byte, in order.
  const int expected[4] = {0, -1, 0, 0};

  CharacterConverter char_conv(&state);
  for (int i = 0; i < 4; ++i) {
    int err = char_conv.push(static_cast<char8_t>(ch[i]));
    ASSERT_EQ(err, expected[i]);
  }
}

0 commit comments

Comments
 (0)