Skip to content

Commit 904d769

Browse files
committed
Total redesign. Support VS2019.
1 parent de9ca6c commit 904d769

22 files changed

+1247
-818
lines changed

README.md

Lines changed: 28 additions & 249 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# UTF-8/16/32 C++ library
22
This is a C++11 template-based, header-only library for Windows/Linux/macOS that converts UTF-8/16/32 symbols and strings. The library transparently supports `wchar_t` as UTF-16 on Windows and as UTF-32 on Linux and macOS.
33

4-
UTF-8 and UTF-32 (UCS-32) both support 31 bit wide code points `[0‥0x7FFFFFFF]`with no restriction. UTF-16 supports only unicode code points `[0‥0x10FFFF]` where high `[0xD800‥0xDBFF]` and low `[0xDC00‥0xDFFF]` surrogate regions are prohibited.
4+
UTF-8 and UTF-32 (UCS-32) both support 31 bit wide code points `[0‥0x7FFFFFFF]` with no restriction. UTF-16 supports only Unicode code points `[0‥0x10FFFF]`, where the high `[0xD800‥0xDBFF]` and low `[0xDC00‥0xDFFF]` surrogate regions are prohibited.
55

6-
The maximum UTF-16 symbol size is 2 words (4 bytes, both words should be from the surrogate region). UFT-32 (UCS-32) is always 1 word (4 bytes). UTF-8 has the maximum symbol size (see [conversion table](#conversion-table)):
6+
The maximum UTF-16 symbol size is 2 words (4 bytes, both words should be in the surrogate region). UTF-32 (UCS-32) is always 1 word (4 bytes). UTF-8 has the following maximum symbol sizes (see [conversion table](#conversion-table)):
77
- 4 bytes for unicode code points
88
- 6 bytes for 31bit code points
99

@@ -20,260 +20,39 @@ The maximum UTF-16 symbol size is 2 words (4 bytes, both words should be from th
2020
## Supported compilers
2121

2222
Tested on the following compilers:
23-
- Visual Studio 2013 v12.0.40629.00 Update 5
24-
- Visual Studio 2017 v15.6.7
25-
- GCC v5.4.0
26-
- Clang v3.9.1/v6.0.0
23+
- [Visual Studio 2013 v12.0.40629.00 Update 5](perf/vc120_win.md)
24+
- [Visual Studio 2015 v14.0.25431.01 Update 3](perf/vc140_win.md)
25+
- [Visual Studio 2017 v15.6.7](perf/vc141_win.md)
26+
- [Visual Studio 2019 v16.0.3](perf/vc142_win.md)
27+
- [GNU v5.4.0](perf/gnu_linux.md)
28+
- [Clang v6.0.1](perf/clang_linux.md)
29+
- [Apple Clang v10.0.1](perf/clang_mac.md)
2730

2831
## Usage example
2932

3033
```cpp
31-
// यूनिकोड
32-
static char const u8s[] = "\xE0\xA4\xAF\xE0\xA5\x82\xE0\xA4\xA8\xE0\xA4\xBF\xE0\xA4\x95\xE0\xA5\x8B\xE0\xA4\xA1";
33-
using namespace ww898::utf;
34-
std::u16string u16;
35-
convz<utf_selector_t<decltype(*u8s)>, utf16>(u8s, std::back_inserter(u16));
36-
std::u32string u32;
37-
conv<utf16, utf_selector_t<decltype(u32)::value_type>>(u16.begin(), u16.end(), std::back_inserter(u32));
38-
std::vector<char> u8;
39-
convz<utf32, utf8>(u32.data(), std::back_inserter(u8));
40-
std::wstring uw;
41-
conv<utf8, utfw>(u8s, u8s + sizeof(u8s), std::back_inserter(uw));
42-
auto u8r = conv<char>(uw);
43-
auto uwr = convz<wchar_t>(u8s);
44-
auto u32r = conv<char32_t>(std::string_view(u8r.data(), u8r.size())); // C++17 only
45-
static_assert(is_utf_same_v<decltype(*u8s), decltype(u8)::value_type>, "Fail"); // C++17 only
46-
static_assert(
47-
is_utf_same<decltype(u16)::value_type, decltype(uw)::value_type>::value !=
48-
is_utf_same<decltype(u32)::value_type, decltype(uw)::value_type>::value, "Fail");
49-
```
50-
51-
## Performance
52-
#### Windows x86 (Visual Studio 2013 v12.0.40629.00 Update 5):
53-
```cpp
54-
Running 489 test cases...
55-
sizeof wchar_t: 2
56-
UTFW: UTF16
57-
Resolution: 2591998334
58-
UTF8 ==> UTF8 : 0.163960290s
59-
UTF8 ==> UTF16: 0.282665666s
60-
UTF8 ==> UTF32: 0.149002153s
61-
UTF8 ==> UTFW : 0.283254604s
62-
UTF16 ==> UTF8 : 0.266152488s
63-
UTF16 ==> UTF16: 0.080108020s
64-
UTF16 ==> UTF32: 0.101033595s
65-
UTF16 ==> UTFW : 0.094183924s
66-
UTF32 ==> UTF8 : 0.215850861s
67-
UTF32 ==> UTF16: 0.146806864s
68-
UTF32 ==> UTF32: 0.042549969s
69-
UTF32 ==> UTFW : 0.146204410s
70-
UTFW ==> UTF8 : 0.266856024s
71-
UTFW ==> UTF16: 0.094266542s
72-
UTFW ==> UTF32: 0.102790712s
73-
UTFW ==> UTFW : 0.080478961s
74-
codecvt_utf8_utf16<char16_t>:
75-
UTF16 ==> UTF8 : 0.685873190s (+157.70%)
76-
UTF8 ==> UTF16: 0.466883577s (+65.17%)
77-
codecvt_utf8_utf16<wchar_t>:
78-
UTFW ==> UTF8 : 0.683433984s (+156.11%)
79-
UTF8 ==> UTFW : 0.456086023s (+61.02%)
80-
81-
*** No errors detected
82-
```
83-
84-
#### Windows x64 (Visual Studio 2013 v12.0.40629.00 Update 5):
85-
```cpp
86-
Running 489 test cases...
87-
sizeof wchar_t: 2
88-
UTFW: UTF16
89-
Resolution: 2591994871
90-
UTF8 ==> UTF8 : 0.196164103s
91-
UTF8 ==> UTF16: 0.220423499s
92-
UTF8 ==> UTF32: 0.180234824s
93-
UTF8 ==> UTFW : 0.217163697s
94-
UTF16 ==> UTF8 : 0.212900399s
95-
UTF16 ==> UTF16: 0.097028914s
96-
UTF16 ==> UTF32: 0.101757423s
97-
UTF16 ==> UTFW : 0.071567645s
98-
UTF32 ==> UTF8 : 0.196917702s
99-
UTF32 ==> UTF16: 0.112344089s
100-
UTF32 ==> UTF32: 0.049047871s
101-
UTF32 ==> UTFW : 0.112364705s
102-
UTFW ==> UTF8 : 0.211841364s
103-
UTFW ==> UTF16: 0.070938743s
104-
UTFW ==> UTF32: 0.102185818s
105-
UTFW ==> UTFW : 0.097848249s
106-
codecvt_utf8_utf16<char16_t>:
107-
UTF16 ==> UTF8 : 0.539077998s (+153.21%)
108-
UTF8 ==> UTF16: 0.396618873s (+79.93%)
109-
codecvt_utf8_utf16<wchar_t>:
110-
UTFW ==> UTF8 : 0.537690842s (+153.82%)
111-
UTF8 ==> UTFW : 0.412762006s (+90.07%)
34+
// यूनिकोड
35+
static char const u8s[] = "\xE0\xA4\xAF\xE0\xA5\x82\xE0\xA4\xA8\xE0\xA4\xBF\xE0\xA4\x95\xE0\xA5\x8B\xE0\xA4\xA1";
36+
using namespace ww898::utf;
37+
std::u16string u16;
38+
convz<utf_selector_t<decltype(*u8s)>, utf16>(u8s, std::back_inserter(u16));
39+
std::u32string u32;
40+
conv<utf16, utf_selector_t<decltype(u32)::value_type>>(u16.begin(), u16.end(), std::back_inserter(u32));
41+
std::vector<char> u8;
42+
convz<utf32, utf8>(u32.data(), std::back_inserter(u8));
43+
std::wstring uw;
44+
conv<utf8, utfw>(u8s, u8s + sizeof(u8s), std::back_inserter(uw));
45+
auto u8r = conv<char>(uw);
46+
auto u16r = conv<char16_t>(u16);
47+
auto uwr = convz<wchar_t>(u8s);
11248

113-
*** No errors detected
114-
```
115-
116-
#### Windows x86 (Visual Studio 2017 v15.6.7):
117-
```cpp
118-
Running 489 test cases...
119-
sizeof wchar_t: 2
120-
UTFW: UTF16
121-
Resolution: 2591998780
122-
UTF8 ==> UTF8 : 0.225589121s
123-
UTF8 ==> UTF16: 0.205551657s
124-
UTF8 ==> UTF32: 0.135360995s
125-
UTF8 ==> UTFW : 0.206828091s
126-
UTF16 ==> UTF8 : 0.284084302s
127-
UTF16 ==> UTF16: 0.109397058s
128-
UTF16 ==> UTF32: 0.101644463s
129-
UTF16 ==> UTFW : 0.131424306s
130-
UTF32 ==> UTF8 : 0.291001165s
131-
UTF32 ==> UTF16: 0.149109674s
132-
UTF32 ==> UTF32: 0.062499637s
133-
UTF32 ==> UTFW : 0.148655518s
134-
UTFW ==> UTF8 : 0.300835299s
135-
UTFW ==> UTF16: 0.127525400s
136-
UTFW ==> UTF32: 0.097031381s
137-
UTFW ==> UTFW : 0.109990072s
138-
codecvt_utf8_utf16<char16_t>:
139-
UTF16 ==> UTF8 : 0.552393684s (+94.45%)
140-
UTF8 ==> UTF16: 0.404987578s (+97.02%)
141-
codecvt_utf8_utf16<wchar_t>:
142-
UTFW ==> UTF8 : 0.596080263s (+98.14%)
143-
UTF8 ==> UTFW : 0.418794256s (+102.48%)
144-
145-
*** No errors detected
146-
```
147-
148-
#### Windows x64 (Visual Studio 2017 v15.6.7):
149-
```cpp
150-
Running 489 test cases...
151-
sizeof wchar_t: 2
152-
UTFW: UTF16
153-
Resolution: 2592011526
154-
UTF8 ==> UTF8 : 0.185124459s
155-
UTF8 ==> UTF16: 0.191509469s
156-
UTF8 ==> UTF32: 0.139597283s
157-
UTF8 ==> UTFW : 0.198169193s
158-
UTF16 ==> UTF8 : 0.243126679s
159-
UTF16 ==> UTF16: 0.096481336s
160-
UTF16 ==> UTF32: 0.088010385s
161-
UTF16 ==> UTFW : 0.105519284s
162-
UTF32 ==> UTF8 : 0.218815968s
163-
UTF32 ==> UTF16: 0.114674103s
164-
UTF32 ==> UTF32: 0.050287083s
165-
UTF32 ==> UTFW : 0.115018940s
166-
UTFW ==> UTF8 : 0.242360203s
167-
UTFW ==> UTF16: 0.105936683s
168-
UTFW ==> UTF32: 0.088388864s
169-
UTFW ==> UTFW : 0.098212312s
170-
codecvt_utf8_utf16<char16_t>:
171-
UTF16 ==> UTF8 : 0.508659574s (+109.22%)
172-
UTF8 ==> UTF16: 0.372852507s (+94.69%)
173-
codecvt_utf8_utf16<wchar_t>:
174-
UTFW ==> UTF8 : 0.526355029s (+117.18%)
175-
UTF8 ==> UTFW : 0.383913994s (+93.73%)
49+
auto u32r = conv<char32_t>(std::string_view(u8r.data(), u8r.size())); // C++17 only
17650

177-
*** No errors detected
178-
```
179-
180-
#### Ubuntu 16.04 LTS x64 (GCC v5.4.0):
181-
```cpp
182-
Running 489 test cases...
183-
sizeof wchar_t: 4
184-
UTFW: UTF32
185-
Resolution: 3400052319
186-
UTF8 ==> UTF8 : 0.110866077s
187-
UTF8 ==> UTF16: 0.141338578s
188-
UTF8 ==> UTF32: 0.081097171s
189-
UTF8 ==> UTFW : 0.090628401s
190-
UTF16 ==> UTF8 : 0.186256965s
191-
UTF16 ==> UTF16: 0.058923306s
192-
UTF16 ==> UTF32: 0.041104444s
193-
UTF16 ==> UTFW : 0.041324722s
194-
UTF32 ==> UTF8 : 0.166990347s
195-
UTF32 ==> UTF16: 0.079132988s
196-
UTF32 ==> UTF32: 0.030674187s
197-
UTF32 ==> UTFW : 0.028661489s
198-
UTFW ==> UTF8 : 0.166499877s
199-
UTFW ==> UTF16: 0.075715211s
200-
UTFW ==> UTF32: 0.028246457s
201-
UTFW ==> UTFW : 0.031145368s
202-
codecvt_utf8_utf16<char16_t>:
203-
UTF16 ==> UTF8 : 0.166462098s (-10.63%)
204-
UTF8 ==> UTF16: 0.412099566s (+191.57%)
205-
codecvt_utf8<wchar_t>:
206-
UTFW ==> UTF8 : 0.142860112s (-14.20%)
207-
UTF8 ==> UTFW : 0.703162093s (+675.87%)
208-
209-
*** No errors detected
210-
```
211-
**Attention:** the strange results for UTF16 to UTF8 and UTFW to UTF8 convertions. Strong GCC optimization or bug? Should be investigated in future.
212-
213-
#### Ubuntu 16.04 LTS x64 (Clang v3.9.1):
214-
```cpp
215-
Running 489 test cases...
216-
sizeof wchar_t: 4
217-
UTFW: UTF32
218-
Resolution: 3400053738
219-
UTF8 ==> UTF8 : 0.107137739s
220-
UTF8 ==> UTF16: 0.166798686s
221-
UTF8 ==> UTF32: 0.115869696s
222-
UTF8 ==> UTFW : 0.114985878s
223-
UTF16 ==> UTF8 : 0.179087502s
224-
UTF16 ==> UTF16: 0.060946522s
225-
UTF16 ==> UTF32: 0.071962061s
226-
UTF16 ==> UTFW : 0.071475919s
227-
UTF32 ==> UTF8 : 0.194061658s
228-
UTF32 ==> UTF16: 0.082039203s
229-
UTF32 ==> UTF32: 0.031557019s
230-
UTF32 ==> UTFW : 0.032523089s
231-
UTFW ==> UTF8 : 0.141759171s
232-
UTFW ==> UTF16: 0.078305338s
233-
UTFW ==> UTF32: 0.034137096s
234-
UTFW ==> UTFW : 0.031711982s
235-
codecvt_utf8_utf16<char16_t>:
236-
UTF16 ==> UTF8 : 0.205740508s (+14.88%)
237-
UTF8 ==> UTF16: 0.272519609s (+63.38%)
238-
codecvt_utf8<wchar_t>:
239-
UTFW ==> UTF8 : 0.158999648s (+12.16%)
240-
UTF8 ==> UTFW : 0.340384930s (+196.02%)
241-
242-
*** No errors detected
243-
```
244-
245-
#### MacOS High Sierra v10.13.6 (Clang v6.0.0)
246-
```cpp
247-
Running 489 test cases...
248-
sizeof wchar_t: 4
249-
UTFW: UTF32
250-
Resolution: 2793647583
251-
UTF8 ==> UTF8 : 0.111039205s
252-
UTF8 ==> UTF16: 0.143631552s
253-
UTF8 ==> UTF32: 0.105463425s
254-
UTF8 ==> UTFW : 0.105106640s
255-
UTF16 ==> UTF8 : 0.158074631s
256-
UTF16 ==> UTF16: 0.055528284s
257-
UTF16 ==> UTF32: 0.063203264s
258-
UTF16 ==> UTFW : 0.063167823s
259-
UTF32 ==> UTF8 : 0.123977591s
260-
UTF32 ==> UTF16: 0.061630976s
261-
UTF32 ==> UTF32: 0.027633560s
262-
UTF32 ==> UTFW : 0.029324893s
263-
UTFW ==> UTF8 : 0.123948012s
264-
UTFW ==> UTF16: 0.064873256s
265-
UTFW ==> UTF32: 0.030606730s
266-
UTFW ==> UTFW : 0.027596372s
267-
codecvt_utf8_utf16<char16_t>:
268-
UTF16 ==> UTF8 : 0.151798551s (-3.97%)
269-
UTF8 ==> UTF16: 0.256203078s (+78.38%)
270-
codecvt_utf8<wchar_t>:
271-
UTFW ==> UTF8 : 0.137034385s (+10.56%)
272-
UTF8 ==> UTFW : 0.360953804s (+243.42%)
273-
274-
*** No errors detected
51+
static_assert(std::is_same<utf_selector<decltype(*u8s)>, utf_selector<decltype(u8)::value_type>>::value, "Fail");
52+
static_assert(
53+
std::is_same<utf_selector_t<decltype(u16)::value_type>, utf_selector_t<decltype(uw)::value_type>>::value !=
54+
std::is_same<utf_selector_t<decltype(u32)::value_type>, utf_selector_t<decltype(uw)::value_type>>::value, "Fail");
27555
```
27656
27757
## Conversion table
27858
![UTF-8/32 table](https://upload.wikimedia.org/wikipedia/commons/3/38/UTF-8_Encoding_Scheme.png)
279-

build.cmd

Lines changed: 44 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,63 @@
11
@echo off
22

3-
set _MsBuild=C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe
3+
set _MsBuild=C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\MSBuild\Current\Bin\MSBuild.exe
4+
set _CMake=C:\Program Files\CMake\bin\cmake.exe
45

5-
set _Arch=
6-
set _Tool=
7-
set _Gen0=
8-
set _Gen1=
9-
set _Conf=Release
6+
call :_build vc142 x86 Release
7+
call :_build vc142 x86 Debug
8+
call :_build vc142 x64 Release
9+
call :_build vc142 x64 Debug
1010

11-
:_loop
12-
if "%1"=="" goto :_run
11+
call :_build vc141 x86 Release
12+
call :_build vc141 x86 Debug
13+
call :_build vc141 x64 Release
14+
call :_build vc141 x64 Debug
1315

14-
if %1==vc120 set _Tool=%1&& set _Gen0=Visual Studio 12 2013&& goto :_next
15-
if %1==vc140 set _Tool=%1&& set _Gen0=Visual Studio 14 2015&& goto :_next
16-
if %1==vc141 set _Tool=%1&& set _Gen0=Visual Studio 15 2017&& goto :_next
16+
call :_build vc140 x86 Release
17+
call :_build vc140 x86 Debug
18+
call :_build vc140 x64 Release
19+
call :_build vc140 x64 Debug
1720

18-
if %1==x86 set _Arch=%1&& set _Gen1=&& goto :_next
19-
if %1==x64 set _Arch=%1&& set _Gen1= Win64&& goto :_next
21+
call :_build vc120 x86 Release
22+
call :_build vc120 x86 Debug
23+
call :_build vc120 x64 Release
24+
call :_build vc120 x64 Debug
2025

21-
if %1==Release set _Conf=%1&& goto :_next
22-
if %1==Debug set _Conf=%1&& goto :_next
26+
echo Ok
27+
goto :_end
2328

24-
echo "Invalid argument %1"
25-
exit /b 1
29+
:_build
2630

27-
:_next
28-
shift
29-
goto :_loop
31+
set _Tool=%1
32+
set _Arch=%2
33+
set _Conf=%3
3034

31-
:_run
32-
if "%_Tool%"=="" echo Toolset was not defined&& exit /b 1
33-
if "%_Arch%"=="" echo Architecture was not defined&& exit /b 1
34-
if "%_Conf%"=="" echo Configuration was not defined&& exit /b 1
35+
echo Toolset: %_Tool%
36+
echo Architecture: %_Arch%
37+
echo Configuration: %_Conf%
3538

3639
set _Dir=obj.%_Tool%.%_Arch%
37-
set _Gen=%_Gen0%%_Gen1%
3840

39-
echo CMake binary directory: %_Dir%
40-
echo CMake generator: %_Gen%
41+
echo Subdirectory: %_Dir%
42+
43+
if "%_Tool%"=="vc120" set _Gen=Visual Studio 12 2013
44+
if "%_Tool%"=="vc140" set _Gen=Visual Studio 14 2015
45+
if "%_Tool%"=="vc141" set _Gen=Visual Studio 15 2017
46+
if "%_Tool%"=="vc142" set _Gen=Visual Studio 16 2019
47+
if "%_Gen%"=="" echo Unknwon toolset&& exit /b 1
48+
49+
echo CMake generator: %_Gen%
50+
51+
if "%_Arch%"=="x86" set _GenArch=Win32
52+
if "%_Arch%"=="x64" set _GenArch=x64
53+
if "%_GenArch%"=="" echo Unknwon architecture&& exit /b 1
54+
55+
echo CMake architecture: %_GenArch%
4156

4257
if not exist "%_Dir%" mkdir "%_Dir%"
4358

4459
pushd "%_Dir%"
45-
cmake -G "%_Gen%" ..
60+
"%_CMake%" -G "%_Gen%" -A %_GenArch% ..
4661
popd
4762
if errorlevel 1 exit /b 1
4863

@@ -51,4 +66,4 @@ pushd "%_Dir%"
5166
if errorlevel 1 exit /b 1
5267
popd
5368

54-
echo Ok
69+
:_end

0 commit comments

Comments
 (0)