Skip to content

Commit 3bdf7ce

Browse files
committed
Merge branch 'wchar-utf32to8' into wchar-utf32-to-8
2 parents b1f5e26 + 3b6fccf commit 3bdf7ce

File tree

5 files changed

+308
-4
lines changed

5 files changed

+308
-4
lines changed

libc/src/__support/wchar/character_converter.cpp

Lines changed: 130 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "hdr/types/char32_t.h"
1010
#include "hdr/types/char8_t.h"
11+
#include "src/__support/common.h"
1112
#include "src/__support/wchar/mbstate.h"
1213
#include "src/__support/wchar/utf_ret.h"
1314

@@ -22,13 +23,138 @@ bool CharacterConverter::isComplete() {
2223
return state->bytes_processed == state->total_bytes;
2324
}
2425

25-
int CharacterConverter::push(char8_t utf8_byte) {}
26+
// Accepts a single UTF-8 code unit. No decoding is performed here: the byte is
// handed straight back to the caller widened to int, and `state` is untouched.
int CharacterConverter::push(char8_t utf8_byte) {
  return static_cast<int>(utf8_byte);
}
2627

27-
int CharacterConverter::push(char32_t utf32) {}
28+
// Begins a UTF-32 -> UTF-8 conversion for `utf32`.
//
// The value is stored in state->partial, state->bytes_processed is reset to 0
// and state->total_bytes is set to the number of UTF-8 bytes (1-4) that the
// pop_utf8() calls which follow will produce.
//
// Returns 0 on success. Returns -1 when `utf32` cannot be encoded: either it
// is larger than 0x10ffff or it is a UTF-16 surrogate (0xd800-0xdfff). In the
// failure case state->total_bytes is left at 0, so a subsequent pop_utf8()
// reports an error instead of emitting bytes.
int CharacterConverter::push(char32_t utf32) {
  state->partial = utf32;
  state->bytes_processed = 0;
  state->total_bytes = 0;

  const char32_t code_point = state->partial;

  // Surrogates are reserved for UTF-16 and are not Unicode scalar values;
  // encoding them would yield an ill-formed 3-byte UTF-8 sequence.
  if (code_point >= 0xd800 && code_point <= 0xdfff)
    return -1;

  // MAX_VALUE_PER_UTF8_LEN[n - 1] is the largest value that fits in an n-byte
  // UTF-8 sequence.
  constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
  constexpr int MAX_UTF8_LEN =
      sizeof(MAX_VALUE_PER_UTF8_LEN) / sizeof(MAX_VALUE_PER_UTF8_LEN[0]);

  for (int len = 1; len <= MAX_UTF8_LEN; ++len) {
    if (code_point <= MAX_VALUE_PER_UTF8_LEN[len - 1]) {
      state->total_bytes = len;
      return 0;
    }
  }

  // Larger than 0x10ffff: outside the Unicode code space.
  return -1;
}
48+
49+
// Produces the single byte of a one-byte UTF-8 sequence (0xxxxxxx).
//
// Succeeds only while state->bytes_processed is 0; the byte is state->partial
// truncated to char8_t and bytes_processed is then advanced. Any other value
// of bytes_processed yields error = -1 and leaves the state unchanged.
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
  utf_ret<char8_t> ret;
  ret.error = 0;

  if (state->bytes_processed != 0) {
    ret.error = -1;
    return ret;
  }

  ret.out = static_cast<char8_t>(state->partial);
  state->bytes_processed++;
  return ret;
}
66+
67+
// Produces the next byte of a two-byte UTF-8 sequence (110xxxxx 10xxxxxx).
//
// state->bytes_processed selects which byte is emitted and is advanced on
// success. Once both bytes have been produced the call yields error = -1 and
// leaves the state unchanged.
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
  utf_ret<char8_t> ret;
  ret.error = 0;

  const char32_t code_point = state->partial;
  const auto index = state->bytes_processed;

  if (index == 0) {
    // Lead byte: marker 110 followed by the bits above the low six.
    ret.out = static_cast<char8_t>(0xC0 | (code_point >> 6));
  } else if (index == 1) {
    // Continuation byte: marker 10 followed by the low six bits.
    ret.out = static_cast<char8_t>(0x80 | (code_point & 0x3f));
  } else {
    ret.error = -1;
    return ret;
  }

  state->bytes_processed++;
  return ret;
}
88+
89+
// Produces the next byte of a three-byte UTF-8 sequence
// (1110xxxx 10xxxxxx 10xxxxxx).
//
// state->bytes_processed selects which byte is emitted and is advanced on
// success. Once all three bytes have been produced the call yields error = -1
// and leaves the state unchanged.
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
  utf_ret<char8_t> ret;
  ret.error = 0;

  const char32_t code_point = state->partial;
  const auto index = state->bytes_processed;

  if (index == 0) {
    // Lead byte: marker 1110 followed by the bits above the low twelve.
    ret.out = static_cast<char8_t>(0xE0 | (code_point >> 12));
  } else if (index == 1) {
    // First continuation byte: bits 6-11.
    ret.out = static_cast<char8_t>(0x80 | ((code_point >> 6) & 0x3f));
  } else if (index == 2) {
    // Second continuation byte: bits 0-5.
    ret.out = static_cast<char8_t>(0x80 | (code_point & 0x3f));
  } else {
    ret.error = -1;
    return ret;
  }

  state->bytes_processed++;
  return ret;
}
113+
114+
// Produces the next byte of a four-byte UTF-8 sequence
// (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
//
// state->bytes_processed selects which byte is emitted and is advanced on
// success. Once all four bytes have been produced the call yields error = -1
// and leaves the state unchanged.
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
  utf_ret<char8_t> ret;
  ret.error = 0;

  const char32_t code_point = state->partial;
  const auto index = state->bytes_processed;

  if (index == 0) {
    // Lead byte: marker 11110 followed by the bits above the low eighteen.
    ret.out = static_cast<char8_t>(0xF0 | (code_point >> 18));
  } else if (index == 1) {
    // First continuation byte: bits 12-17.
    ret.out = static_cast<char8_t>(0x80 | ((code_point >> 12) & 0x3f));
  } else if (index == 2) {
    // Second continuation byte: bits 6-11.
    ret.out = static_cast<char8_t>(0x80 | ((code_point >> 6) & 0x3f));
  } else if (index == 3) {
    // Third continuation byte: bits 0-5.
    ret.out = static_cast<char8_t>(0x80 | (code_point & 0x3f));
  } else {
    ret.error = -1;
    return ret;
  }

  state->bytes_processed++;
  return ret;
}
141+
142+
// Returns the next UTF-8 byte of the character most recently pushed with
// push(char32_t).
//
// Dispatches on state->total_bytes (the sequence length chosen by push) to
// the matching pop_utf8_seqlengthN() helper and returns its result unchanged.
// When total_bytes is not in 1..4 -- for example after a push that was
// rejected -- the result is {0, -1}, i.e. out = 0 and error = -1.
utf_ret<char8_t> CharacterConverter::pop_utf8() {
  switch (state->total_bytes) {
  case 1:
    return pop_utf8_seqlength1();
  case 2:
    return pop_utf8_seqlength2();
  case 3:
    return pop_utf8_seqlength3();
  case 4:
    return pop_utf8_seqlength4();
  default:
    break;
  }

  // Positional aggregate initialization (out, error): designated initializers
  // are a C++20 feature, and pop_utf32() already uses the positional form.
  return {0, -1};
}
156+
157+
// Always returns a utf_ret aggregate-initialized with {0, -1}; the conversion
// state is neither read nor modified.
utf_ret<char32_t> CharacterConverter::pop_utf32() {
  utf_ret<char32_t> failure = {0, -1};
  return failure;
}
32158

33159
} // namespace internal
34160
} // namespace LIBC_NAMESPACE_DECL

libc/src/__support/wchar/character_converter.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "hdr/types/char32_t.h"
1313
#include "hdr/types/char8_t.h"
14+
#include "src/__support/common.h"
1415
#include "src/__support/wchar/mbstate.h"
1516
#include "src/__support/wchar/utf_ret.h"
1617

@@ -21,6 +22,11 @@ class CharacterConverter {
2122
private:
2223
mbstate *state;
2324

25+
utf_ret<char8_t> pop_utf8_seqlength1();
26+
utf_ret<char8_t> pop_utf8_seqlength2();
27+
utf_ret<char8_t> pop_utf8_seqlength3();
28+
utf_ret<char8_t> pop_utf8_seqlength4();
29+
2430
public:
2531
CharacterConverter(mbstate *mbstate);
2632

libc/test/src/__support/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,3 +275,4 @@ add_subdirectory(fixed_point)
275275
add_subdirectory(HashTable)
276276
add_subdirectory(time)
277277
add_subdirectory(threads)
278+
add_subdirectory(wchar)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Declares the libc-support-wchar-tests custom target.
# NOTE(review): the test registered below uses SUITE libc-support-tests, not
# this target, so nothing visible here attaches to it -- confirm intended.
add_custom_target(libc-support-wchar-tests)
2+
3+
# Registers the utf32_to_8_test unit test, built from utf32_to_8_test.cpp and
# depending on libc.src.__support.wchar.character_converter.
add_libc_test(
4+
utf32_to_8_test
5+
SUITE
6+
libc-support-tests
7+
SRCS
8+
utf32_to_8_test.cpp
9+
DEPENDS
10+
libc.src.__support.wchar.character_converter
11+
)
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/common.h"
10+
#include "src/__support/wchar/character_converter.h"
11+
#include "src/__support/wchar/mbstate.h"
12+
13+
#include "test/UnitTest/Test.h"
14+
15+
TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);

  // One-byte UTF-8 encodings are identical to their UTF-32 values, so each
  // pushed value must come back as the same character in a single pop.
  struct Sample {
    char32_t input;
    char expected;
  };
  const Sample samples[] = {{0x41, 'A'}, {0x42, 'B'}};

  for (const Sample &sample : samples) {
    cr.push(sample.input);
    auto byte = cr.pop_utf8();
    ASSERT_EQ(byte.error, 0);
    ASSERT_EQ(static_cast<char>(byte.out), sample.expected);
    ASSERT_TRUE(cr.isComplete());
  }

  // Popping again after the character is fully consumed must fail.
  auto extra = cr.pop_utf8();
  ASSERT_NE(extra.error, 0);
}
38+
39+
TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);

  constexpr int SEQ_LEN = 2;
  struct Sample {
    char32_t input;
    int expected[SEQ_LEN];
  };
  const Sample samples[] = {
      {0xff, {0xc3, 0xbf}},  // utf32: 0xff  -> utf8: 0xc3 0xbf
      {0x58e, {0xd6, 0x8e}}, // utf32: 0x58e -> utf8: 0xd6 0x8e
  };

  for (const Sample &sample : samples) {
    cr.push(sample.input);
    for (int pos = 0; pos < SEQ_LEN; ++pos) {
      auto byte = cr.pop_utf8();
      ASSERT_EQ(byte.error, 0);
      ASSERT_EQ(static_cast<int>(byte.out), sample.expected[pos]);
      // The converter is complete only once the final byte has been popped.
      if (pos + 1 == SEQ_LEN) {
        ASSERT_TRUE(cr.isComplete());
      } else {
        ASSERT_TRUE(!cr.isComplete());
      }
    }
  }

  // Popping again after the character is fully consumed must fail.
  auto extra = cr.pop_utf8();
  ASSERT_NE(extra.error, 0);
}
71+
72+
TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);

  constexpr int SEQ_LEN = 3;
  struct Sample {
    char32_t input;
    int expected[SEQ_LEN];
  };
  const Sample samples[] = {
      {0xac15, {0xea, 0xb0, 0x95}}, // utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
      {0x267b, {0xe2, 0x99, 0xbb}}, // utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
  };

  for (const Sample &sample : samples) {
    cr.push(sample.input);
    for (int pos = 0; pos < SEQ_LEN; ++pos) {
      auto byte = cr.pop_utf8();
      ASSERT_EQ(byte.error, 0);
      ASSERT_EQ(static_cast<int>(byte.out), sample.expected[pos]);
      // The converter is complete only once the final byte has been popped.
      if (pos + 1 == SEQ_LEN) {
        ASSERT_TRUE(cr.isComplete());
      } else {
        ASSERT_TRUE(!cr.isComplete());
      }
    }
  }

  // Popping again after the character is fully consumed must fail.
  auto extra = cr.pop_utf8();
  ASSERT_NE(extra.error, 0);
}
112+
113+
TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);

  constexpr int SEQ_LEN = 4;
  struct Sample {
    char32_t input;
    int expected[SEQ_LEN];
  };
  const Sample samples[] = {
      // utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
      {0x1f921, {0xf0, 0x9f, 0xa4, 0xa1}},
      // utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
      {0x12121, {0xf0, 0x92, 0x84, 0xa1}},
  };

  for (const Sample &sample : samples) {
    cr.push(sample.input);
    for (int pos = 0; pos < SEQ_LEN; ++pos) {
      auto byte = cr.pop_utf8();
      ASSERT_EQ(byte.error, 0);
      ASSERT_EQ(static_cast<int>(byte.out), sample.expected[pos]);
      // The converter is complete only once the final byte has been popped.
      if (pos + 1 == SEQ_LEN) {
        ASSERT_TRUE(cr.isComplete());
      } else {
        ASSERT_TRUE(!cr.isComplete());
      }
    }
  }

  // Popping again after the character is fully consumed must fail.
  auto extra = cr.pop_utf8();
  ASSERT_NE(extra.error, 0);
}

0 commit comments

Comments
 (0)