Skip to content

Commit 3b6fccf

Browse files
committed
switched to bytes_processed
1 parent fe7f23d commit 3b6fccf

File tree

2 files changed

+44
-29
lines changed

2 files changed

+44
-29
lines changed

libc/src/__support/wchar/character_converter.cpp

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,11 @@ namespace internal {
1919

2020
CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
2121

22-
bool CharacterConverter::isComplete() {
23-
return state->bits_processed / 8 == state->total_bytes;
24-
}
25-
2622
int CharacterConverter::push(char8_t utf8_byte) { return utf8_byte; }
2723

2824
int CharacterConverter::push(char32_t utf32) {
2925
state->partial = utf32;
30-
state->bits_processed = 0;
26+
state->bytes_processed = 0;
3127
state->total_bytes = 0;
3228

3329
// determine number of utf-8 bytes needed to represent this utf32 value
@@ -51,7 +47,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
5147
result.error = 0;
5248

5349
// 0xxxxxxx
54-
switch (state->bits_processed) {
50+
switch (state->bytes_processed) {
5551
case 0:
5652
result.out = (char8_t)(state->partial);
5753
break;
@@ -60,7 +56,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
6056
return result;
6157
}
6258

63-
state->bits_processed += 8;
59+
state->bytes_processed++;
6460
return result;
6561
}
6662

@@ -70,19 +66,19 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
7066

7167
// 110xxxxx 10xxxxxx
7268
char32_t utf32 = state->partial;
73-
switch (state->bits_processed) {
69+
switch (state->bytes_processed) {
7470
case 0:
7571
result.out = (char8_t)(0xC0 | (utf32 >> 6));
7672
break;
77-
case 8:
73+
case 1:
7874
result.out = (char8_t)(0x80 | (utf32 & 0x3f));
7975
break;
8076
default:
8177
result.error = -1;
8278
return result;
8379
}
8480

85-
state->bits_processed += 8;
81+
state->bytes_processed++;
8682
return result;
8783
}
8884

@@ -92,22 +88,22 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
9288

9389
// 1110xxxx 10xxxxxx 10xxxxxx
9490
char32_t utf32 = state->partial;
95-
switch (state->bits_processed) {
91+
switch (state->bytes_processed) {
9692
case 0:
9793
result.out = (char8_t)(0xE0 | (utf32 >> 12));
9894
break;
99-
case 8:
95+
case 1:
10096
result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
10197
break;
102-
case 16:
98+
case 2:
10399
result.out = (char8_t)(0x80 | (utf32 & 0x3f));
104100
break;
105101
default:
106102
result.error = -1;
107103
return result;
108104
}
109105

110-
state->bits_processed += 8;
106+
state->bytes_processed++;
111107
return result;
112108
}
113109

@@ -117,25 +113,25 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
117113

118114
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
119115
char32_t utf32 = state->partial;
120-
switch (state->bits_processed) {
116+
switch (state->bytes_processed) {
121117
case 0:
122118
result.out = (char8_t)(0xF0 | (utf32 >> 18));
123119
break;
124-
case 8:
120+
case 1:
125121
result.out = (char8_t)(0x80 | ((utf32 >> 12) & 0x3f));
126122
break;
127-
case 16:
123+
case 2:
128124
result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
129125
break;
130-
case 24:
126+
case 3:
131127
result.out = (char8_t)(0x80 | (utf32 & 0x3f));
132128
break;
133129
default:
134130
result.error = -1;
135131
return result;
136132
}
137133

138-
state->bits_processed += 8;
134+
state->bytes_processed++;
139135
return result;
140136
}
141137

libc/test/src/__support/wchar/utf32_to_8_test.cpp

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,22 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
1616
LIBC_NAMESPACE::internal::mbstate state;
1717
LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
1818

19-
char32_t utf32_A = 0x41;
19+
// utf8 1-byte encodings are identical to their utf32 representations
20+
char32_t utf32_A = 0x41; // 'A'
2021
cr.push(utf32_A);
2122
auto popped = cr.pop_utf8();
2223
ASSERT_EQ(popped.error, 0);
2324
ASSERT_EQ(static_cast<char>(popped.out), 'A');
2425
ASSERT_TRUE(cr.isComplete());
2526

26-
char32_t utf32_B = 0x42;
27+
char32_t utf32_B = 0x42; // 'B'
2728
cr.push(utf32_B);
2829
popped = cr.pop_utf8();
2930
ASSERT_EQ(popped.error, 0);
3031
ASSERT_EQ(static_cast<char>(popped.out), 'B');
3132
ASSERT_TRUE(cr.isComplete());
3233

34+
// should error if we try to pop another utf8 byte out
3335
popped = cr.pop_utf8();
3436
ASSERT_NE(popped.error, 0);
3537
}
@@ -38,6 +40,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
3840
LIBC_NAMESPACE::internal::mbstate state;
3941
LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
4042

43+
// testing utf32: 0xff -> utf8: 0xc3 0xbf
4144
char32_t utf32 = 0xff;
4245
cr.push(utf32);
4346
auto popped = cr.pop_utf8();
@@ -49,6 +52,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
4952
ASSERT_EQ(static_cast<int>(popped.out), 0xbf);
5053
ASSERT_TRUE(cr.isComplete());
5154

55+
// testing utf32: 0x58e -> utf8: 0xd6 0x8e
5256
utf32 = 0x58e;
5357
cr.push(utf32);
5458
popped = cr.pop_utf8();
@@ -60,6 +64,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
6064
ASSERT_EQ(static_cast<int>(popped.out), 0x8e);
6165
ASSERT_TRUE(cr.isComplete());
6266

67+
// should error if we try to pop another utf8 byte out
6368
popped = cr.pop_utf8();
6469
ASSERT_NE(popped.error, 0);
6570
}
@@ -68,6 +73,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
6873
LIBC_NAMESPACE::internal::mbstate state;
6974
LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
7075

76+
// testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
7177
char32_t utf32 = 0xac15;
7278
cr.push(utf32);
7379
auto popped = cr.pop_utf8();
@@ -83,6 +89,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
8389
ASSERT_EQ(static_cast<int>(popped.out), 0x95);
8490
ASSERT_TRUE(cr.isComplete());
8591

92+
// testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
8693
utf32 = 0x267b;
8794
cr.push(utf32);
8895
popped = cr.pop_utf8();
@@ -98,6 +105,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
98105
ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
99106
ASSERT_TRUE(cr.isComplete());
100107

108+
// should error if we try to pop another utf8 byte out
101109
popped = cr.pop_utf8();
102110
ASSERT_NE(popped.error, 0);
103111
}
@@ -106,36 +114,47 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
106114
LIBC_NAMESPACE::internal::mbstate state;
107115
LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
108116

109-
char32_t utf32 = 0xac15;
117+
// testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
118+
char32_t utf32 = 0x1f921;
110119
cr.push(utf32);
111120
auto popped = cr.pop_utf8();
112121
ASSERT_EQ(popped.error, 0);
113-
ASSERT_EQ(static_cast<int>(popped.out), 0xea);
122+
ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
114123
ASSERT_TRUE(!cr.isComplete());
115124
popped = cr.pop_utf8();
116125
ASSERT_EQ(popped.error, 0);
117-
ASSERT_EQ(static_cast<int>(popped.out), 0xb0);
126+
ASSERT_EQ(static_cast<int>(popped.out), 0x9f);
118127
ASSERT_TRUE(!cr.isComplete());
119128
popped = cr.pop_utf8();
120129
ASSERT_EQ(popped.error, 0);
121-
ASSERT_EQ(static_cast<int>(popped.out), 0x95);
130+
ASSERT_EQ(static_cast<int>(popped.out), 0xa4);
131+
ASSERT_TRUE(!cr.isComplete());
132+
popped = cr.pop_utf8();
133+
ASSERT_EQ(popped.error, 0);
134+
ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
122135
ASSERT_TRUE(cr.isComplete());
123136

124-
utf32 = 0x267b;
137+
// testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
138+
utf32 = 0x12121;
125139
cr.push(utf32);
126140
popped = cr.pop_utf8();
127141
ASSERT_EQ(popped.error, 0);
128-
ASSERT_EQ(static_cast<int>(popped.out), 0xe2);
142+
ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
129143
ASSERT_TRUE(!cr.isComplete());
130144
popped = cr.pop_utf8();
131145
ASSERT_EQ(popped.error, 0);
132-
ASSERT_EQ(static_cast<int>(popped.out), 0x99);
146+
ASSERT_EQ(static_cast<int>(popped.out), 0x92);
133147
ASSERT_TRUE(!cr.isComplete());
134148
popped = cr.pop_utf8();
135149
ASSERT_EQ(popped.error, 0);
136-
ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
150+
ASSERT_EQ(static_cast<int>(popped.out), 0x84);
151+
ASSERT_TRUE(!cr.isComplete());
152+
popped = cr.pop_utf8();
153+
ASSERT_EQ(popped.error, 0);
154+
ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
137155
ASSERT_TRUE(cr.isComplete());
138156

157+
// should error if we try to pop another utf8 byte out
139158
popped = cr.pop_utf8();
140159
ASSERT_NE(popped.error, 0);
141160
}

0 commit comments

Comments
 (0)