Skip to content

Commit fe7f23d

Browse files
committed
Implemented CharacterConverter push/pop for UTF-32 -> UTF-8 conversion; added tests
1 parent e7ebd78 commit fe7f23d

File tree

4 files changed

+280
-9
lines changed

4 files changed

+280
-9
lines changed

libc/src/__support/wchar/character_converter.cpp

Lines changed: 134 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "hdr/types/char32_t.h"
1010
#include "hdr/types/char8_t.h"
11+
#include "src/__support/common.h"
1112
#include "src/__support/wchar/mbstate.h"
1213
#include "src/__support/wchar/utf_ret.h"
1314

@@ -16,17 +17,144 @@
1617
namespace LIBC_NAMESPACE_DECL {
1718
namespace internal {
1819

19-
CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; }
20+
// Binds the converter to the given conversion state. Only the pointer is
// stored (no copy is made); the other members read and write through it.
CharacterConverter::CharacterConverter(mbstate *mbstate) : state(mbstate) {}
2021

21-
bool CharacterConverter::isComplete() {}
22+
bool CharacterConverter::isComplete() {
23+
return state->bits_processed / 8 == state->total_bytes;
24+
}
2225

23-
int CharacterConverter::push(char8_t utf8_byte) {}
26+
// Placeholder for UTF-8 input: the conversion state is not touched and the
// byte itself is handed back as the return value.
int CharacterConverter::push(char8_t utf8_byte) {
  const int echoed = utf8_byte;
  return echoed;
}
2427

25-
int CharacterConverter::push(char32_t utf32) {}
28+
// Starts a UTF-32 -> UTF-8 conversion for `utf32`.
//
// Resets the conversion state, records the code point in `state->partial`, and
// stores in `state->total_bytes` how many UTF-8 bytes are needed to encode it.
//
// Returns 0 on success. Returns -1 when `utf32` cannot be encoded as UTF-8,
// i.e. it lies in the surrogate range U+D800..U+DFFF or above U+10FFFF. In
// that case `total_bytes` stays 0, so a following pop_utf8() reports an error.
int CharacterConverter::push(char32_t utf32) {
  state->partial = utf32;
  state->bits_processed = 0;
  state->total_bytes = 0;

  // Surrogates are reserved for UTF-16 and have no valid UTF-8 encoding
  // (RFC 3629, section 3). Without this check they would fall into the
  // three-byte range below and be emitted as ill-formed UTF-8.
  if (utf32 >= 0xd800 && utf32 <= 0xdfff)
    return -1;

  // Largest code point representable with 1, 2, 3 and 4 UTF-8 bytes.
  constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};

  uint8_t length = 1;
  for (const char32_t limit : MAX_VALUE_PER_UTF8_LEN) {
    if (state->partial <= limit) {
      state->total_bytes = length;
      return 0;
    }
    ++length;
  }

  // Larger than U+10FFFF: not a Unicode code point.
  return -1;
}
48+
49+
// Emits the single byte of a one-byte sequence (0xxxxxxx).
// Succeeds only when nothing has been emitted yet; for any other progress
// value the result carries error -1 and the state is left unchanged.
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
  utf_ret<char8_t> ret;
  ret.error = 0;

  if (state->bits_processed != 0) {
    ret.error = -1;
    return ret;
  }

  ret.out = static_cast<char8_t>(state->partial);
  state->bits_processed += 8;
  return ret;
}
66+
67+
// Emits the next byte of a two-byte sequence (110xxxxx 10xxxxxx).
// `bits_processed` selects which byte is produced and advances by 8 on
// success; an unexpected progress value yields error -1 without touching the
// state.
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
  const char32_t code_point = state->partial;
  const auto progress = state->bits_processed;

  utf_ret<char8_t> ret;
  ret.error = 0;

  if (progress == 0) {
    // Lead byte: marker 110 followed by the top five payload bits.
    ret.out = static_cast<char8_t>(0xC0 | (code_point >> 6));
  } else if (progress == 8) {
    // Continuation byte: marker 10 followed by the low six payload bits.
    ret.out = static_cast<char8_t>(0x80 | (code_point & 0x3f));
  } else {
    ret.error = -1;
    return ret;
  }

  state->bits_processed += 8;
  return ret;
}
88+
89+
// Emits the next byte of a three-byte sequence (1110xxxx 10xxxxxx 10xxxxxx).
// `bits_processed` selects which byte is produced and advances by 8 on
// success; an unexpected progress value yields error -1 without touching the
// state.
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
  const char32_t code_point = state->partial;
  const auto progress = state->bits_processed;

  utf_ret<char8_t> ret;
  ret.error = 0;

  if (progress == 0) {
    // Lead byte: marker 1110 followed by the top four payload bits.
    ret.out = static_cast<char8_t>(0xE0 | (code_point >> 12));
  } else if (progress == 8) {
    // First continuation byte: middle six payload bits.
    ret.out = static_cast<char8_t>(0x80 | ((code_point >> 6) & 0x3f));
  } else if (progress == 16) {
    // Second continuation byte: low six payload bits.
    ret.out = static_cast<char8_t>(0x80 | (code_point & 0x3f));
  } else {
    ret.error = -1;
    return ret;
  }

  state->bits_processed += 8;
  return ret;
}
113+
114+
// Emits the next byte of a four-byte sequence
// (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
// `bits_processed` selects which byte is produced and advances by 8 on
// success; an unexpected progress value yields error -1 without touching the
// state.
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
  const char32_t code_point = state->partial;
  const auto progress = state->bits_processed;

  utf_ret<char8_t> ret;
  ret.error = 0;

  if (progress == 0) {
    // Lead byte: marker 11110 followed by the top three payload bits.
    ret.out = static_cast<char8_t>(0xF0 | (code_point >> 18));
  } else if (progress == 8) {
    // First continuation byte: payload bits 17..12.
    ret.out = static_cast<char8_t>(0x80 | ((code_point >> 12) & 0x3f));
  } else if (progress == 16) {
    // Second continuation byte: payload bits 11..6.
    ret.out = static_cast<char8_t>(0x80 | ((code_point >> 6) & 0x3f));
  } else if (progress == 24) {
    // Third continuation byte: payload bits 5..0.
    ret.out = static_cast<char8_t>(0x80 | (code_point & 0x3f));
  } else {
    ret.error = -1;
    return ret;
  }

  state->bits_processed += 8;
  return ret;
}
141+
142+
// Produces the next UTF-8 byte of the current character by delegating to the
// helper that matches the recorded sequence length (`state->total_bytes`).
// If that length is not 1, 2, 3 or 4 the result is out = 0 with error = -1.
utf_ret<char8_t> CharacterConverter::pop_utf8() {
  const auto sequence_length = state->total_bytes;

  if (sequence_length == 1)
    return pop_utf8_seqlength1();
  if (sequence_length == 2)
    return pop_utf8_seqlength2();
  if (sequence_length == 3)
    return pop_utf8_seqlength3();
  if (sequence_length == 4)
    return pop_utf8_seqlength4();

  utf_ret<char8_t> failure;
  failure.out = 0;
  failure.error = -1;
  return failure;
}
156+
157+
// Placeholder: no UTF-32 output is produced here. Always returns out = 0 with
// error = -1 and does not touch the conversion state.
utf_ret<char32_t> CharacterConverter::pop_utf32() {
  utf_ret<char32_t> unsupported;
  unsupported.out = 0;
  unsupported.error = -1;
  return unsupported;
}
30158

31159
} // namespace internal
32160
} // namespace LIBC_NAMESPACE_DECL

libc/src/__support/wchar/character_converter.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,23 @@
1313
#include "hdr/types/char8_t.h"
1414
#include "src/__support/wchar/mbstate.h"
1515
#include "src/__support/wchar/utf_ret.h"
16+
#include "src/__support/common.h"
17+
1618

1719
namespace LIBC_NAMESPACE_DECL {
1820
namespace internal {
1921

2022
class CharacterConverter {
2123
private:
22-
mbstate_t *state;
24+
mbstate *state;
25+
26+
utf_ret<char8_t> pop_utf8_seqlength1();
27+
utf_ret<char8_t> pop_utf8_seqlength2();
28+
utf_ret<char8_t> pop_utf8_seqlength3();
29+
utf_ret<char8_t> pop_utf8_seqlength4();
2330

2431
public:
25-
CharacterConverter(mbstate_t *mbstate);
32+
CharacterConverter(mbstate *mbstate);
2633

2734
bool isComplete();
2835

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,11 @@
11
add_custom_target(libc-support-wchar-tests)
2+
3+
# Unit test for the UTF-32 -> UTF-8 path of CharacterConverter.
# NOTE(review): the suite named here is libc-support-tests, while the custom
# target declared above is libc-support-wchar-tests -- confirm which is meant.
add_libc_test(
  utf32_to_8_test
  SUITE libc-support-tests
  SRCS utf32_to_8_test.cpp
  DEPENDS libc.src.__support.wchar.character_converter
)

libc/test/src/__support/wchar/utf32_to_8_test.cpp

Lines changed: 127 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,130 @@
1212

1313
#include "test/UnitTest/Test.h"
1414

15-
TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {}
15+
// Code points that fit in one UTF-8 byte come back as a single byte equal to
// the code point, after which the converter reports completion.
TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate state;
  CharacterConverter cr(&state);

  // U+0041 'A'
  char32_t letter_a = 0x41;
  cr.push(letter_a);
  auto byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<char>(byte.out), 'A');
  ASSERT_TRUE(cr.isComplete());

  // U+0042 'B', pushed on the same converter once 'A' is fully consumed.
  char32_t letter_b = 0x42;
  cr.push(letter_b);
  byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<char>(byte.out), 'B');
  ASSERT_TRUE(cr.isComplete());

  // Popping past the end of the sequence must fail.
  byte = cr.pop_utf8();
  ASSERT_NE(byte.error, 0);
}
36+
37+
// Code points in the two-byte range are produced as a lead byte followed by
// one continuation byte; completion is reported only after the second pop.
TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate state;
  CharacterConverter cr(&state);

  // U+00FF -> C3 BF
  char32_t code_point = 0xff;
  cr.push(code_point);
  auto byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0xc3);
  ASSERT_TRUE(!cr.isComplete());
  byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0xbf);
  ASSERT_TRUE(cr.isComplete());

  // U+058E -> D6 8E
  code_point = 0x58e;
  cr.push(code_point);
  byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0xd6);
  ASSERT_TRUE(!cr.isComplete());
  byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0x8e);
  ASSERT_TRUE(cr.isComplete());

  // Popping past the end of the sequence must fail.
  byte = cr.pop_utf8();
  ASSERT_NE(byte.error, 0);
}
66+
67+
// Code points in the three-byte range are produced as a lead byte followed by
// two continuation bytes; completion is reported only after the third pop.
TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate state;
  CharacterConverter cr(&state);

  // U+AC15 -> EA B0 95
  char32_t code_point = 0xac15;
  cr.push(code_point);
  auto byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0xea);
  ASSERT_TRUE(!cr.isComplete());
  byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0xb0);
  ASSERT_TRUE(!cr.isComplete());
  byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0x95);
  ASSERT_TRUE(cr.isComplete());

  // U+267B -> E2 99 BB
  code_point = 0x267b;
  cr.push(code_point);
  byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0xe2);
  ASSERT_TRUE(!cr.isComplete());
  byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0x99);
  ASSERT_TRUE(!cr.isComplete());
  byte = cr.pop_utf8();
  ASSERT_EQ(byte.error, 0);
  ASSERT_EQ(static_cast<int>(byte.out), 0xbb);
  ASSERT_TRUE(cr.isComplete());

  // Popping past the end of the sequence must fail.
  byte = cr.pop_utf8();
  ASSERT_NE(byte.error, 0);
}
104+
105+
// Code points above U+FFFF are produced as a lead byte followed by three
// continuation bytes; completion is reported only after the fourth pop.
//
// The inputs must be genuine four-byte code points (> U+FFFF); values such as
// 0xac15 only exercise the three-byte path.
TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);

  // U+1F921 -> F0 9F A4 A1
  char32_t utf32 = 0x1f921;
  cr.push(utf32);
  auto popped = cr.pop_utf8();
  ASSERT_EQ(popped.error, 0);
  ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
  ASSERT_TRUE(!cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_EQ(popped.error, 0);
  ASSERT_EQ(static_cast<int>(popped.out), 0x9f);
  ASSERT_TRUE(!cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_EQ(popped.error, 0);
  ASSERT_EQ(static_cast<int>(popped.out), 0xa4);
  ASSERT_TRUE(!cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_EQ(popped.error, 0);
  ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
  ASSERT_TRUE(cr.isComplete());

  // U+12121 -> F0 92 84 A1
  utf32 = 0x12121;
  cr.push(utf32);
  popped = cr.pop_utf8();
  ASSERT_EQ(popped.error, 0);
  ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
  ASSERT_TRUE(!cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_EQ(popped.error, 0);
  ASSERT_EQ(static_cast<int>(popped.out), 0x92);
  ASSERT_TRUE(!cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_EQ(popped.error, 0);
  ASSERT_EQ(static_cast<int>(popped.out), 0x84);
  ASSERT_TRUE(!cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_EQ(popped.error, 0);
  ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
  ASSERT_TRUE(cr.isComplete());

  // Popping past the end of the sequence must fail.
  popped = cr.pop_utf8();
  ASSERT_NE(popped.error, 0);
}

0 commit comments

Comments
 (0)