@@ -37,119 +37,36 @@ int CharacterConverter::push(char32_t utf32) {
3737 break ;
3838 }
3939 }
40- if (state->total_bytes == 0 ) {
40+ if (state->total_bytes == 0 )
4141 return -1 ;
42- }
4342
4443 return 0 ;
4544}
4645
47- utf_ret<char8_t > CharacterConverter::pop_utf8_seqlength1 () {
48- utf_ret<char8_t > result;
49- result.error = 0 ;
50-
51- // 0xxxxxxx
52- switch (state->bytes_processed ) {
53- case 0 :
54- result.out = (char8_t )(state->partial );
55- break ;
56- default :
57- result.error = -1 ;
58- return result;
59- }
60-
61- state->bytes_processed ++;
62- return result;
63- }
64-
65- utf_ret<char8_t > CharacterConverter::pop_utf8_seqlength2 () {
66- utf_ret<char8_t > result;
67- result.error = 0 ;
68-
69- // 110xxxxx 10xxxxxx
70- char32_t utf32 = state->partial ;
71- switch (state->bytes_processed ) {
72- case 0 :
73- result.out = (char8_t )(0xC0 | (utf32 >> 6 ));
74- break ;
75- case 1 :
76- result.out = (char8_t )(0x80 | (utf32 & 0x3f ));
77- break ;
78- default :
79- result.error = -1 ;
80- return result;
81- }
82-
83- state->bytes_processed ++;
84- return result;
85- }
86-
87- utf_ret<char8_t > CharacterConverter::pop_utf8_seqlength3 () {
88- utf_ret<char8_t > result;
89- result.error = 0 ;
90-
91- // 1110xxxx 10xxxxxx 10xxxxxx
92- char32_t utf32 = state->partial ;
93- switch (state->bytes_processed ) {
94- case 0 :
95- result.out = (char8_t )(0xE0 | (utf32 >> 12 ));
96- break ;
97- case 1 :
98- result.out = (char8_t )(0x80 | ((utf32 >> 6 ) & 0x3f ));
99- break ;
100- case 2 :
101- result.out = (char8_t )(0x80 | (utf32 & 0x3f ));
102- break ;
103- default :
104- result.error = -1 ;
105- return result;
106- }
107-
108- state->bytes_processed ++;
109- return result;
110- }
111-
112- utf_ret<char8_t > CharacterConverter::pop_utf8_seqlength4 () {
113- utf_ret<char8_t > result;
114- result.error = 0 ;
46+ utf_ret<char8_t > CharacterConverter::pop_utf8 () {
47+ if (state->bytes_processed >= state->total_bytes )
48+ return {.out = 0 , .error = -1 };
11549
116- // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
50+ char8_t first_byte_headers[] = { 0 , 0xC0 , 0xE0 , 0xF0 };
11751 char32_t utf32 = state->partial ;
118- switch (state->bytes_processed ) {
119- case 0 :
120- result.out = (char8_t )(0xF0 | (utf32 >> 18 ));
121- break ;
122- case 1 :
123- result.out = (char8_t )(0x80 | ((utf32 >> 12 ) & 0x3f ));
124- break ;
125- case 2 :
126- result.out = (char8_t )(0x80 | ((utf32 >> 6 ) & 0x3f ));
127- break ;
128- case 3 :
129- result.out = (char8_t )(0x80 | (utf32 & 0x3f ));
130- break ;
131- default :
132- result.error = -1 ;
133- return result;
52+ char32_t tb = state->total_bytes ;
53+ char32_t bp = state->bytes_processed ;
54+ char32_t output;
55+ if (state->bytes_processed == 0 ) {
56+ /*
57+ Choose the correct set of most significant bits to encode the length
58+ of the utf8 sequence. The remaining bits contain the most significant
59+ bits of the unicode value of the character.
60+ */
61+ output = first_byte_headers[tb - 1 ] | (utf32 >> ((tb - 1 ) * 6 ));
62+ } else {
63+ // Get the next 6 bits and format it like so: 10xxxxxx
64+ const char32_t shift_amount = (tb - bp - 1 ) * 6 ;
65+ output = 0x80 | ((utf32 >> shift_amount) & 0x3f );
13466 }
13567
13668 state->bytes_processed ++;
137- return result;
138- }
139-
140- utf_ret<char8_t > CharacterConverter::pop_utf8 () {
141- switch (state->total_bytes ) {
142- case 1 :
143- return pop_utf8_seqlength1 ();
144- case 2 :
145- return pop_utf8_seqlength2 ();
146- case 3 :
147- return pop_utf8_seqlength3 ();
148- case 4 :
149- return pop_utf8_seqlength4 ();
150- }
151-
152- return {.out = 0 , .error = -1 };
69+ return {.out = (char8_t )output, .error = 0 };
15370}
15471
15572} // namespace internal
0 commit comments