Skip to content

Commit 9698a8d

Browse files
committed
condensed conversion math
1 parent ac92fff commit 9698a8d

File tree

2 files changed

+20
-108
lines changed

2 files changed

+20
-108
lines changed

libc/src/__support/wchar/character_converter.cpp

Lines changed: 20 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -37,119 +37,36 @@ int CharacterConverter::push(char32_t utf32) {
3737
break;
3838
}
3939
}
40-
if (state->total_bytes == 0) {
40+
if (state->total_bytes == 0)
4141
return -1;
42-
}
4342

4443
return 0;
4544
}
4645

47-
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
48-
utf_ret<char8_t> result;
49-
result.error = 0;
50-
51-
// 0xxxxxxx
52-
switch (state->bytes_processed) {
53-
case 0:
54-
result.out = (char8_t)(state->partial);
55-
break;
56-
default:
57-
result.error = -1;
58-
return result;
59-
}
60-
61-
state->bytes_processed++;
62-
return result;
63-
}
64-
65-
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
66-
utf_ret<char8_t> result;
67-
result.error = 0;
68-
69-
// 110xxxxx 10xxxxxx
70-
char32_t utf32 = state->partial;
71-
switch (state->bytes_processed) {
72-
case 0:
73-
result.out = (char8_t)(0xC0 | (utf32 >> 6));
74-
break;
75-
case 1:
76-
result.out = (char8_t)(0x80 | (utf32 & 0x3f));
77-
break;
78-
default:
79-
result.error = -1;
80-
return result;
81-
}
82-
83-
state->bytes_processed++;
84-
return result;
85-
}
86-
87-
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
88-
utf_ret<char8_t> result;
89-
result.error = 0;
90-
91-
// 1110xxxx 10xxxxxx 10xxxxxx
92-
char32_t utf32 = state->partial;
93-
switch (state->bytes_processed) {
94-
case 0:
95-
result.out = (char8_t)(0xE0 | (utf32 >> 12));
96-
break;
97-
case 1:
98-
result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
99-
break;
100-
case 2:
101-
result.out = (char8_t)(0x80 | (utf32 & 0x3f));
102-
break;
103-
default:
104-
result.error = -1;
105-
return result;
106-
}
107-
108-
state->bytes_processed++;
109-
return result;
110-
}
111-
112-
utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
113-
utf_ret<char8_t> result;
114-
result.error = 0;
46+
utf_ret<char8_t> CharacterConverter::pop_utf8() {
47+
if (state->bytes_processed >= state->total_bytes)
48+
return {.out = 0, .error = -1};
11549

116-
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
50+
char8_t first_byte_headers[] = {0, 0xC0, 0xE0, 0xF0};
11751
char32_t utf32 = state->partial;
118-
switch (state->bytes_processed) {
119-
case 0:
120-
result.out = (char8_t)(0xF0 | (utf32 >> 18));
121-
break;
122-
case 1:
123-
result.out = (char8_t)(0x80 | ((utf32 >> 12) & 0x3f));
124-
break;
125-
case 2:
126-
result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
127-
break;
128-
case 3:
129-
result.out = (char8_t)(0x80 | (utf32 & 0x3f));
130-
break;
131-
default:
132-
result.error = -1;
133-
return result;
52+
char32_t tb = state->total_bytes;
53+
char32_t bp = state->bytes_processed;
54+
char32_t output;
55+
if (state->bytes_processed == 0) {
56+
/*
57+
Choose the correct set of most significant bits to encode the length
58+
of the UTF-8 sequence. The remaining bits contain the most significant
59+
bits of the Unicode code point of the character.
60+
*/
61+
output = first_byte_headers[tb - 1] | (utf32 >> ((tb - 1) * 6));
62+
} else {
63+
// Take the next 6 bits of the code point and emit them as a continuation byte: 10xxxxxx
64+
const char32_t shift_amount = (tb - bp - 1) * 6;
65+
output = 0x80 | ((utf32 >> shift_amount) & 0x3f);
13466
}
13567

13668
state->bytes_processed++;
137-
return result;
138-
}
139-
140-
utf_ret<char8_t> CharacterConverter::pop_utf8() {
141-
switch (state->total_bytes) {
142-
case 1:
143-
return pop_utf8_seqlength1();
144-
case 2:
145-
return pop_utf8_seqlength2();
146-
case 3:
147-
return pop_utf8_seqlength3();
148-
case 4:
149-
return pop_utf8_seqlength4();
150-
}
151-
152-
return {.out = 0, .error = -1};
69+
return {.out = (char8_t)output, .error = 0};
15370
}
15471

15572
} // namespace internal

libc/src/__support/wchar/character_converter.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,6 @@ class CharacterConverter {
2222
private:
2323
mbstate *state;
2424

25-
utf_ret<char8_t> pop_utf8_seqlength1();
26-
utf_ret<char8_t> pop_utf8_seqlength2();
27-
utf_ret<char8_t> pop_utf8_seqlength3();
28-
utf_ret<char8_t> pop_utf8_seqlength4();
29-
3025
public:
3126
CharacterConverter(mbstate *mbstate);
3227

0 commit comments

Comments
 (0)