|
23 | 23 | */
|
24 | 24 | #pragma once
|
25 | 25 |
|
26 |
| -#include "SharedUtil.IntTypes.h" |
27 | 26 | #include <string>
|
28 | 27 |
|
29 |
| -/* Return code if invalid. (xxx_mbtowc, xxx_wctomb) */ |
30 |
| -#define RET_ILSEQ 0 |
31 |
| -/* Return code if only a shift sequence of n bytes was read. (xxx_mbtowc) */ |
32 |
| -#define RET_TOOFEW(n) (-1-(n)) |
33 |
| -/* Return code if output buffer is too small. (xxx_wctomb, xxx_reset) */ |
34 |
| -#define RET_TOOSMALL -1 |
35 |
| -/* Replacement character for invalid multibyte sequence or wide character. */ |
36 |
| -#define BAD_WCHAR ((wchar_t) 0xfffd) |
37 |
| -#define BAD_CHAR '?' |
| 28 | +int utf8_mbtowc(wchar_t* pwc, const unsigned char* src, int src_len); |
38 | 29 |
|
39 |
| -int utf8_mbtowc(wchar_t* pwc, const unsigned char* src, int src_len) |
40 |
| -{ |
41 |
| - if (!pwc) |
42 |
| - return 0; |
43 |
| - |
44 |
| - unsigned char c = src[0]; |
45 |
| - |
46 |
| - if (c < 0x80) |
47 |
| - { |
48 |
| - *pwc = c; |
49 |
| - return 1; |
50 |
| - } |
51 |
| - else if (c < 0xc2) |
52 |
| - { |
53 |
| - return RET_ILSEQ; |
54 |
| - } |
55 |
| - else if (c < 0xe0) |
56 |
| - { |
57 |
| - if (src_len < 2) |
58 |
| - return RET_TOOFEW(0); |
59 |
| - if (!((src[1] ^ 0x80) < 0x40)) |
60 |
| - return RET_ILSEQ; |
61 |
| - *pwc = ((wchar_t)(c & 0x1f) << 6) | (wchar_t)(src[1] ^ 0x80); |
62 |
| - return 2; |
63 |
| - } |
64 |
| - else if (c < 0xf0) |
65 |
| - { |
66 |
| - if (src_len < 3) |
67 |
| - return RET_TOOFEW(0); |
68 |
| - if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40 && (c >= 0xe1 || src[1] >= 0xa0))) |
69 |
| - return RET_ILSEQ; |
70 |
| - *pwc = ((wchar_t)(c & 0x0f) << 12) | ((wchar_t)(src[1] ^ 0x80) << 6) | (wchar_t)(src[2] ^ 0x80); |
71 |
| - return 3; |
72 |
| - } |
73 |
| - else if (c < 0xf8) |
74 |
| - { |
75 |
| - if (src_len < 4) |
76 |
| - return RET_TOOFEW(0); |
77 |
| - if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40 && (src[3] ^ 0x80) < 0x40 && (c >= 0xf1 || src[1] >= 0x90))) |
78 |
| - return RET_ILSEQ; |
79 |
| - *pwc = ((wchar_t)(c & 0x07) << 18) | ((wchar_t)(src[1] ^ 0x80) << 12) | ((wchar_t)(src[2] ^ 0x80) << 6) | (wchar_t)(src[3] ^ 0x80); |
80 |
| - return 4; |
81 |
| - } |
82 |
| - else if (c < 0xfc) |
83 |
| - { |
84 |
| - if (src_len < 5) |
85 |
| - return RET_TOOFEW(0); |
86 |
| - if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40 && (src[3] ^ 0x80) < 0x40 && (src[4] ^ 0x80) < 0x40 && (c >= 0xf9 || src[1] >= 0x88))) |
87 |
| - return RET_ILSEQ; |
88 |
| - *pwc = ((wchar_t)(c & 0x03) << 24) | ((wchar_t)(src[1] ^ 0x80) << 18) | ((wchar_t)(src[2] ^ 0x80) << 12) | ((wchar_t)(src[3] ^ 0x80) << 6) | |
89 |
| - (wchar_t)(src[4] ^ 0x80); |
90 |
| - return 5; |
91 |
| - } |
92 |
| - else if (c < 0xfe) |
93 |
| - { |
94 |
| - if (src_len < 6) |
95 |
| - return RET_TOOFEW(0); |
96 |
| - if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40 && (src[3] ^ 0x80) < 0x40 && (src[4] ^ 0x80) < 0x40 && (src[5] ^ 0x80) < 0x40 && |
97 |
| - (c >= 0xfd || src[1] >= 0x84))) |
98 |
| - return RET_ILSEQ; |
99 |
| - *pwc = ((wchar_t)(c & 0x01) << 30) | ((wchar_t)(src[1] ^ 0x80) << 24) | ((wchar_t)(src[2] ^ 0x80) << 18) | ((wchar_t)(src[3] ^ 0x80) << 12) | |
100 |
| - ((wchar_t)(src[4] ^ 0x80) << 6) | (wchar_t)(src[5] ^ 0x80); |
101 |
| - return 6; |
102 |
| - } |
103 |
| - else |
104 |
| - return RET_ILSEQ; |
105 |
| -} |
106 |
| - |
107 |
| -int utf8_wctomb(unsigned char* dest, wchar_t wc, int dest_size) |
108 |
| -{ |
109 |
| - if (!dest) |
110 |
| - return 0; |
111 |
| - |
112 |
| - int count; |
113 |
| - if (wc < 0x80) |
114 |
| - count = 1; |
115 |
| - else if (wc < 0x800) |
116 |
| - count = 2; |
117 |
| - else if (wc < 0x10000) |
118 |
| - count = 3; |
119 |
| - else if (wc < 0x200000) |
120 |
| - count = 4; |
121 |
| - else if (wc < 0x4000000) |
122 |
| - count = 5; |
123 |
| - else if (wc <= 0x7fffffff) |
124 |
| - count = 6; |
125 |
| - else |
126 |
| - return RET_ILSEQ; |
127 |
| - if (dest_size < count) |
128 |
| - return RET_TOOSMALL; |
129 |
| - switch (count) |
130 |
| - { /* note: code falls through cases! */ |
131 |
| - case 6: |
132 |
| - dest[5] = 0x80 | (wc & 0x3f); |
133 |
| - wc = wc >> 6; |
134 |
| - wc |= 0x4000000; |
135 |
| - case 5: |
136 |
| - dest[4] = 0x80 | (wc & 0x3f); |
137 |
| - wc = wc >> 6; |
138 |
| - wc |= 0x200000; |
139 |
| - case 4: |
140 |
| - dest[3] = 0x80 | (wc & 0x3f); |
141 |
| - wc = wc >> 6; |
142 |
| - wc |= 0x10000; |
143 |
| - case 3: |
144 |
| - dest[2] = 0x80 | (wc & 0x3f); |
145 |
| - wc = wc >> 6; |
146 |
| - wc |= 0x800; |
147 |
| - case 2: |
148 |
| - dest[1] = 0x80 | (wc & 0x3f); |
149 |
| - wc = wc >> 6; |
150 |
| - wc |= 0xc0; |
151 |
| - case 1: |
152 |
| - dest[0] = (unsigned char)wc; |
153 |
| - } |
154 |
| - return count; |
155 |
| -} |
| 30 | +int utf8_wctomb(unsigned char* dest, wchar_t wc, int dest_size); |
156 | 31 |
|
157 | 32 | //////////////////////////////////////////////////
|
158 | 33 | //
|
159 | 34 | // Original - For testing
|
160 | 35 | //
|
161 | 36 |
|
162 |
| -std::wstring utf8_mbstowcs_orig(const std::string& str) |
163 |
| -{ |
164 |
| - std::wstring wstr; |
165 |
| - wchar_t wc; |
166 |
| - unsigned int sn = 0; |
167 |
| - int un = 0; |
168 |
| - |
169 |
| - const unsigned char* s = (const unsigned char*)str.c_str(); |
170 |
| - |
171 |
| - while (sn < str.length() && *s != 0 && (un = utf8_mbtowc(&wc, s, str.length() - sn)) > 0) |
172 |
| - { |
173 |
| - wstr.push_back(wc); |
174 |
| - s += un; |
175 |
| - sn += un; |
176 |
| - } |
177 |
| - return wstr; |
178 |
| -} |
179 |
| - |
180 |
| -std::string utf8_wcstombs_orig(const std::wstring& wstr) |
181 |
| -{ |
182 |
| - std::string str; |
183 |
| - char utf8[6]; |
184 |
| - int un = 0; |
| 37 | +std::wstring utf8_mbstowcs_orig(const std::string& str); |
185 | 38 |
|
186 |
| - for (unsigned int i = 0; i < wstr.size(); ++i) |
187 |
| - { |
188 |
| - un = utf8_wctomb((unsigned char*)utf8, wstr[i], 6); |
189 |
| - if (un > 0) |
190 |
| - str.append(utf8, un); |
191 |
| - } |
192 |
| - return str; |
193 |
| -} |
| 39 | +std::string utf8_wcstombs_orig(const std::wstring& wstr); |
194 | 40 |
|
195 | 41 | //////////////////////////////////////////////////
|
196 | 42 | //
|
197 |
| -// Optimized - faster for strings smaller than SMALL_STRING_LIMIT |
| 43 | +// Optimized - faster for smaller strings |
198 | 44 | //
|
199 |
| -#define SMALL_STRING_LIMIT 1000 |
200 |
| - |
201 |
| -std::wstring utf8_mbstowcs(const std::string& str) |
202 |
| -{ |
203 |
| - const unsigned char* s = (const unsigned char*)str.c_str(); |
204 |
| - const unsigned int length = str.length(); |
205 |
| - |
206 |
| - if (length < SMALL_STRING_LIMIT) |
207 |
| - { |
208 |
| - // Faster but limited size |
209 |
| - uint cCharacters = length + 1; |
210 |
| - uint cBytes = (cCharacters) * sizeof(wchar_t); |
211 |
| - |
212 |
| - wchar_t* buffer = (wchar_t*)alloca(cBytes); |
213 |
| - wchar_t* ptr = buffer; |
214 |
| - wchar_t wc; |
215 |
| - unsigned int sn = 0; |
216 |
| - int un = 0; |
217 |
| - |
218 |
| - while (sn < length && *s != 0 && (un = utf8_mbtowc(&wc, s, length - sn)) > 0) |
219 |
| - { |
220 |
| - *ptr++ = wc; |
221 |
| - s += un; |
222 |
| - sn += un; |
223 |
| - } |
224 |
| - size_t usedsize = ptr - buffer; |
225 |
| - return std::wstring(buffer, usedsize); |
226 |
| - } |
227 |
| - else |
228 |
| - { |
229 |
| - // Slower but any size |
230 |
| - std::wstring wstr; |
231 |
| - wchar_t wc; |
232 |
| - unsigned int sn = 0; |
233 |
| - int un = 0; |
234 |
| - |
235 |
| - while (sn < length && *s != 0 && (un = utf8_mbtowc(&wc, s, length - sn)) > 0) |
236 |
| - { |
237 |
| - wstr.push_back(wc); |
238 |
| - s += un; |
239 |
| - sn += un; |
240 |
| - } |
241 |
| - return wstr; |
242 |
| - } |
243 |
| -} |
244 |
| - |
245 |
| -// Optimized |
246 |
| -std::string utf8_wcstombs(const std::wstring& wstr) |
247 |
| -{ |
248 |
| - const unsigned int size = wstr.length(); |
249 | 45 |
|
250 |
| - if (size < SMALL_STRING_LIMIT) |
251 |
| - { |
252 |
| - // Faster but limited size |
253 |
| - uint cBytes = (size + 1) * 6; |
254 |
| - char* buffer = (char*)alloca(cBytes); |
255 |
| - char* ptr = buffer; |
256 |
| - for (unsigned int i = 0; i < size; ++i) |
257 |
| - { |
258 |
| - ptr += utf8_wctomb((unsigned char*)ptr, wstr[i], 6); |
259 |
| - } |
260 |
| - size_t usedsize = ptr - buffer; |
261 |
| - return std::string(buffer, usedsize); |
262 |
| - } |
263 |
| - else |
264 |
| - { |
265 |
| - // Slower but any size |
266 |
| - char utf8[6]; |
267 |
| - std::string str; |
| 46 | +std::wstring utf8_mbstowcs(const std::string& str); |
268 | 47 |
|
269 |
| - for (unsigned int i = 0; i < size; ++i) |
270 |
| - { |
271 |
| - int un = utf8_wctomb((unsigned char*)utf8, wstr[i], 6); |
272 |
| - if (un > 0) |
273 |
| - str.append(utf8, un); |
274 |
| - } |
275 |
| - return str; |
276 |
| - } |
277 |
| -} |
| 48 | +std::string utf8_wcstombs(const std::wstring& wstr); |
0 commit comments