Skip to content

Commit 20cd8e5

Browse files
author
Sriya Pratipati
committed
Cleaned up code, added edge cases, added new test cases for edge cases
1 parent 84913d8 commit 20cd8e5

File tree

3 files changed

+89
-33
lines changed

3 files changed

+89
-33
lines changed

libc/src/__support/wchar/character_converter.cpp

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "hdr/types/char32_t.h"
1010
#include "hdr/types/char8_t.h"
11+
#include "src/__support/CPP/bit.h"
1112
#include "src/__support/wchar/mbstate.h"
1213
#include "src/__support/wchar/utf_ret.h"
1314

@@ -25,59 +26,63 @@ bool CharacterConverter::isComplete() {
2526
int CharacterConverter::push(char8_t utf8_byte) {
2627
// Checking the first byte if first push
2728
if (state->bytes_processed == 0 && state->total_bytes == 0) {
29+
state->partial = static_cast<char32_t>(0);
2830
// 1 byte total
29-
if ((utf8_byte & 128) == 0) {
31+
if (cpp::countl_one(utf8_byte) == 0) {
3032
state->total_bytes = 1;
31-
state->bytes_processed = 1;
32-
state->partial = static_cast<char32_t>(utf8_byte);
33-
return 0;
3433
}
3534
// 2 bytes total
36-
else if ((utf8_byte & 0xE0) == 0xC0) {
35+
else if (cpp::countl_one(utf8_byte) == 2) {
3736
state->total_bytes = 2;
38-
state->bytes_processed = 1;
3937
utf8_byte &= 0x1F;
40-
state->partial = static_cast<char32_t>(utf8_byte);
41-
return 0;
4238
}
4339
// 3 bytes total
44-
else if ((utf8_byte & 0xF0) == 0xE0) {
40+
else if (cpp::countl_one(utf8_byte) == 3) {
4541
state->total_bytes = 3;
46-
state->bytes_processed = 1;
4742
utf8_byte &= 0x0F;
48-
state->partial = static_cast<char32_t>(utf8_byte);
49-
return 0;
5043
}
5144
// 4 bytes total
52-
else if ((utf8_byte & 0xF8) == 0xF0) {
45+
else if (cpp::countl_one(utf8_byte) == 4) {
5346
state->total_bytes = 4;
54-
state->bytes_processed = 1;
5547
utf8_byte &= 0x07;
56-
state->partial = static_cast<char32_t>(utf8_byte);
57-
return 0;
5848
}
59-
// Invalid
49+
// Invalid byte -> reset mbstate
6050
else {
61-
state->bytes_processed++;
51+
state->partial = static_cast<char32_t>(0);
52+
state->bytes_processed = 0;
53+
state->total_bytes = 0;
6254
return -1;
6355
}
56+
state->partial = static_cast<char32_t>(utf8_byte);
57+
state->bytes_processed++;
58+
return 0;
6459
}
6560
// Any subsequent push
66-
if ((utf8_byte & 0xC0) == 0x80) {
67-
state->partial = state->partial << 6;
61+
if (cpp::countl_one(utf8_byte) == 1 && !isComplete()) {
6862
char32_t byte = utf8_byte & 0x3F;
63+
state->partial = state->partial << 6;
6964
state->partial |= byte;
7065
state->bytes_processed++;
7166
return 0;
7267
}
73-
state->bytes_processed++;
68+
// Invalid byte -> reset if we didn't get successful complete read
69+
if (!isComplete()) {
70+
state->partial = static_cast<char32_t>(0);
71+
state->bytes_processed = 0;
72+
state->total_bytes = 0;
73+
}
7474
return -1;
7575
}
7676

7777
utf_ret<char32_t> CharacterConverter::pop_utf32() {
7878
utf_ret<char32_t> utf32;
7979
utf32.error = 0;
8080
utf32.out = state->partial;
81+
if (!isComplete())
82+
utf32.error = -1;
83+
state->bytes_processed = 0;
84+
state->total_bytes = 0;
85+
state->partial = static_cast<char32_t>(0);
8186
return utf32;
8287
}
8388

libc/test/src/__support/wchar/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@ add_libc_test(
88
utf8_to_32_test.cpp
99
DEPENDS
1010
libc.src.__support.wchar.character_converter
11-
)
11+
)

libc/test/src/__support/wchar/utf8_to_32_test.cpp

Lines changed: 62 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===-- Unittests for character_converter utf8->3 -------------------------===//
1+
//===-- Unittests for character_converter utf8->utf32 ---------------------===//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -30,7 +30,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
3030
LIBC_NAMESPACE::internal::mbstate state;
3131
state.bytes_processed = 0;
3232
state.total_bytes = 0;
33-
const char *ch = "Ž"; // hex 0xC2, 0x8E
33+
const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)}; // U+008E (decimal 142); note "Ž" is U+017D, which encodes as 0xC5 0xBD
3434

3535
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
3636
char_conv.push(static_cast<char8_t>(ch[0]));
@@ -45,7 +45,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
4545
LIBC_NAMESPACE::internal::mbstate state;
4646
state.bytes_processed = 0;
4747
state.total_bytes = 0;
48-
const char *ch = "∑"; // hex 0xE2, 0x88, 0x91
48+
const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
49+
static_cast<char>(0x91)}; // ∑ (U+2211)
4950

5051
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
5152
char_conv.push(static_cast<char8_t>(ch[0]));
@@ -61,7 +62,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
6162
LIBC_NAMESPACE::internal::mbstate state;
6263
state.bytes_processed = 0;
6364
state.total_bytes = 0;
64-
const char *ch = "🤡"; // hex 0xF0, 0x9F, 0xA4, 0xA1
65+
const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
66+
static_cast<char>(0xA4), static_cast<char>(0xA1)}; // 🤡
6567

6668
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
6769
char_conv.push(static_cast<char8_t>(ch[0]));
@@ -90,36 +92,85 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
9092
LIBC_NAMESPACE::internal::mbstate state;
9193
state.bytes_processed = 0;
9294
state.total_bytes = 0;
93-
const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
94-
static_cast<char>(0x00),
95-
static_cast<char>(0x00)}; // All bytes are invalid
95+
const char ch[4] = {
96+
static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
97+
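static_cast<char>(0x00)}; // 0x80 can never start a sequence; 0x00 is valid ASCII, but once it completes a character every further push is rejected until that character is popped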
9698

9799
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
98100
int err = char_conv.push(static_cast<char8_t>(ch[0]));
99101
ASSERT_EQ(err, -1);
100102
err = char_conv.push(static_cast<char8_t>(ch[1]));
101-
ASSERT_EQ(err, -1);
103+
ASSERT_EQ(err, 0);
104+
// Prev byte was single byte so trying to read another should error.
102105
err = char_conv.push(static_cast<char8_t>(ch[2]));
103106
ASSERT_EQ(err, -1);
104107
err = char_conv.push(static_cast<char8_t>(ch[3]));
105108
ASSERT_EQ(err, -1);
106109
}
107110

108-
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
111+
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
109112
LIBC_NAMESPACE::internal::mbstate state;
110113
state.bytes_processed = 0;
111114
state.total_bytes = 0;
112-
const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
115+
const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
113116
static_cast<char>(0x80),
114-
static_cast<char>(0x80)}; // invalid second byte
117+
static_cast<char>(0xC0)}; // invalid last byte
115118

116119
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
117120
int err = char_conv.push(static_cast<char8_t>(ch[0]));
118121
ASSERT_EQ(err, 0);
119122
err = char_conv.push(static_cast<char8_t>(ch[1]));
123+
ASSERT_EQ(err, 0);
124+
err = char_conv.push(static_cast<char8_t>(ch[2]));
125+
ASSERT_EQ(err, 0);
126+
err = char_conv.push(static_cast<char8_t>(ch[3]));
120127
ASSERT_EQ(err, -1);
128+
}
129+
130+
TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
131+
LIBC_NAMESPACE::internal::mbstate state;
132+
state.bytes_processed = 0;
133+
state.total_bytes = 0;
134+
const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
135+
static_cast<char>(0x80)};
136+
137+
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
138+
int err = char_conv.push(static_cast<char8_t>(ch[0]));
139+
ASSERT_EQ(err, 0);
140+
err = char_conv.push(static_cast<char8_t>(ch[1]));
141+
ASSERT_EQ(err, 0);
142+
// Should produce an error on 3rd byte
143+
err = char_conv.push(static_cast<char8_t>(ch[2]));
144+
ASSERT_EQ(err, -1);
145+
146+
LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
147+
ASSERT_EQ(wch.error, 0);
148+
// Should still output the correct result.
149+
ASSERT_EQ(static_cast<int>(wch.out), 142);
150+
}
151+
152+
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
153+
LIBC_NAMESPACE::internal::mbstate state;
154+
state.bytes_processed = 0;
155+
state.total_bytes = 0;
156+
const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
157+
static_cast<char>(0xC7), static_cast<char>(0x8C)};
158+
159+
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
160+
int err = char_conv.push(static_cast<char8_t>(ch[0]));
161+
ASSERT_EQ(err, 0);
162+
err = char_conv.push(static_cast<char8_t>(ch[1]));
163+
ASSERT_EQ(err, 0);
164+
LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
165+
ASSERT_EQ(wch.error, 0);
166+
ASSERT_EQ(static_cast<int>(wch.out), 142);
167+
168+
// Second two byte character
121169
err = char_conv.push(static_cast<char8_t>(ch[2]));
122170
ASSERT_EQ(err, 0);
123171
err = char_conv.push(static_cast<char8_t>(ch[3]));
124172
ASSERT_EQ(err, 0);
173+
wch = char_conv.pop_utf32();
174+
ASSERT_EQ(wch.error, 0);
175+
ASSERT_EQ(static_cast<int>(wch.out), 460);
125176
}

0 commit comments

Comments
 (0)