Skip to content

Commit 1c73900

Browse files
committed
test: Add tests for uchar.h
Validate the cXrtomb and mbrtocX implementations, checking both valid input and a range of invalid inputs to make sure each function catches errors in the input encodings. Signed-off-by: Keith Packard <[email protected]>
1 parent f6f6875 commit 1c73900

File tree

3 files changed

+375
-0
lines changed

3 files changed

+375
-0
lines changed

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ set(tests
7272
test-strncpy_s
7373
test-strnlen_s
7474
test-sprintf_s
75+
test-uchar
7576
)
7677

7778
set(tests_fail

test/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ plain_tests_common = ['regex', 'ungetc',
4242
'test-raise',
4343
'test-sprintf-percent-n',
4444
'test-ctype',
45+
'test-uchar',
4546
]
4647

4748
math_tests_common = [

test/test-uchar.c

Lines changed: 373 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,373 @@
1+
/*
2+
* SPDX-License-Identifier: BSD-3-Clause
3+
*
4+
* Copyright © 2024 Keith Packard
5+
*
6+
* Redistribution and use in source and binary forms, with or without
7+
* modification, are permitted provided that the following conditions
8+
* are met:
9+
*
10+
* 1. Redistributions of source code must retain the above copyright
11+
* notice, this list of conditions and the following disclaimer.
12+
*
13+
* 2. Redistributions in binary form must reproduce the above
14+
* copyright notice, this list of conditions and the following
15+
* disclaimer in the documentation and/or other materials provided
16+
* with the distribution.
17+
*
18+
* 3. Neither the name of the copyright holder nor the names of its
19+
* contributors may be used to endorse or promote products derived
20+
* from this software without specific prior written permission.
21+
*
22+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25+
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26+
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
27+
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30+
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31+
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33+
* OF THE POSSIBILITY OF SUCH DAMAGE.
34+
*/
35+
36+
#define _GNU_SOURCE
37+
#include <uchar.h>
38+
#include <stddef.h>
39+
#include <stdio.h>
40+
#include <string.h>
41+
#include <locale.h>
42+
#include <errno.h>
43+
44+
#define MAX_C8 16
45+
#define MAX_C16 16
46+
#define MAX_C32 16
47+
#define MAX_MB 16
48+
49+
const struct {
50+
char8_t c8[MAX_C8];
51+
char mb[MAX_MB];
52+
unsigned err;
53+
} test_c8[] = {
54+
{ .c8 = {0x61}, .mb = "a" },
55+
{ .c8 = {0x0}, .mb = "" },
56+
#if !defined(__PICOLIBC__) || defined(_MB_CAPABLE)
57+
{ .c8 = "\xF4\x8F\xBF\xBF", .mb = "\xF4\x8F\xBF\xBF" },
58+
59+
{ .c8 = "㌰", .mb = "㌰" },
60+
{ .c8 = "🚀", .mb = "🚀" },
61+
{ .c8 = "\xC2\x80", .mb = "\xC2\x80" },
62+
63+
/* overlong two-byte encoding */
64+
{ .c8= "\xC0\x80", .mb = "\xC0\x80", .err = 0 + 1 },
65+
/* overlong three-byte encoding */
66+
{ .c8 = "\xE0\x9F\xBF", .mb = "\xE0\x9F\xBF", .err = 1 + 1 },
67+
/* overlong four-byte encoding */
68+
{ .c8 = "\xF0\x8F\xBF\xBF", .mb = "\xF0\x8F\xBF\xBF", .err = 1 + 1 },
69+
/* lowest surrogate 0xd800 */
70+
{ .c8 = "\xED\xA0\x80", .mb = "\xED\xA0\x80", .err = 1 + 1 },
71+
/* highest surrogate 0xdfff */
72+
{ .c8 = "\xED\xBF\xBF", .mb = "\xED\xBF\xBF", .err = 1 + 1 },
73+
/* missing second byte */
74+
{ .c8 = "\xC2", .mb = "\xC2", .err = 1 + 1 },
75+
{ .c8 = "\xE0", .mb = "\xE0", .err = 1 + 1 },
76+
{ .c8 = "\xF0", .mb = "\xF0", .err = 1 + 1 },
77+
/* missing third byte */
78+
{ .c8 = "\xE0\xA0", .mb = "\xE0\xA0", .err = 2 + 1 },
79+
{ .c8 = "\xF0\x90", .mb = "\xF0\x90", .err = 2 + 1 },
80+
/* missing fourth byte */
81+
{ .c8 = "\xF0\x90\x80", .mb = "\xF0\x90\x80", .err = 3 + 1 },
82+
/* too big 0x110000 */
83+
{ .c8 = "\xF4\x90\x80\x80", .mb = "\xF4\x90\x80\x80", .err = 1 + 1 },
84+
/* too big 0x140000 */
85+
{ .c8 = "\xF5\x90\x80\x80", .mb = "\xF5\x90\x80\x80", .err = 0 + 1 },
86+
#endif
87+
};
88+
89+
#define NTEST_C8 sizeof(test_c8)/sizeof(test_c8[0])
90+
91+
const struct {
92+
char16_t c16[MAX_C16];
93+
char mb[MAX_MB];
94+
unsigned err;
95+
} test_c16[] = {
96+
{ .c16 = { 0x0061 }, .mb = "a" },
97+
{ .c16 = { 0x0000 }, .mb = "" },
98+
#if !defined(__PICOLIBC__) || defined(_MB_CAPABLE)
99+
{ .c16 = { 0x3330 }, .mb = "㌰" },
100+
{ .c16 = { 0xd83d, 0xde80 }, .mb = "🚀" },
101+
102+
/* Missing low surrogate */
103+
{ .c16 = { 0xd83d, 0x0000 }, .mb = "", .err = 1 + 1 },
104+
105+
/* Missing high surrogate */
106+
{ .c16 = { 0xde80, 0x0000 }, .mb = "ʀ", .err = 0 + 1 },
107+
108+
/* Extra high surrogate */
109+
{ .c16 = { 0xd83d, 0xda80 }, .mb = "🚀", .err = 1 + 1 },
110+
#endif
111+
};
112+
113+
#define NTEST_C16 sizeof(test_c16)/sizeof(test_c16[0])
114+
115+
const struct {
116+
char32_t c32[MAX_C32];
117+
char mb[MAX_MB];
118+
unsigned err;
119+
} test_c32[] = {
120+
{ .c32 = { 0x00000061 }, .mb = "a" },
121+
{ .c32 = { 0x00000000 }, .mb = "" },
122+
#if !defined(__PICOLIBC__) || defined(_MB_CAPABLE)
123+
{ .c32 = { 0x00003330 }, .mb = "㌰" },
124+
{ .c32 = { 0x0001f680 }, .mb = "🚀" },
125+
126+
#ifndef __GLIBC__
127+
/*
128+
* Unicode value out of range.
129+
*
130+
* Glibc doesn't report this as an error, instead it silently
131+
* drops the value (!).
132+
*/
133+
{ .c32 = { 0x00110000 }, .mb = "", .err = 0 + 1 },
134+
#endif
135+
136+
/* High surrogate value */
137+
{ .c32 = { 0x0000d83d }, .mb = "", .err = 0 + 1 },
138+
139+
/* Low surrogate value */
140+
{ .c32 = { 0x0000de80 }, .mb = "", .err = 0 + 1 },
141+
#endif
142+
};
143+
144+
#define NTEST_C32 sizeof(test_c32)/sizeof(test_c32[0])
145+
146+
int main(void)
147+
{
148+
unsigned i;
149+
unsigned j;
150+
int status = 0;
151+
mbstate_t mbstate;
152+
153+
#if !defined(__PICOLIBC__) || defined(_MB_CAPABLE)
154+
if (!setlocale(LC_CTYPE, "C.UTF-8")) {
155+
printf("setlocale(LC_CTYPE, \"C.UTF-8\") failed\n");
156+
return 1;
157+
}
158+
#endif
159+
160+
/* utf-8 tests */
161+
for (i = 0; i < NTEST_C8; i++) {
162+
char mb[MAX_MB];
163+
size_t ret;
164+
size_t off;
165+
166+
off = 0;
167+
memset(mb, 0, sizeof(mb));
168+
memset(&mbstate, 0, sizeof(mbstate));
169+
for (j = 0; ; j++) {
170+
ret = c8rtomb(mb + off, test_c8[i].c8[j], &mbstate);
171+
if (ret == (size_t) -1) {
172+
if (test_c8[i].err != 0 && test_c8[i].err == j + 1)
173+
break;
174+
printf("c8rtomb %d failed at byte %d\n", i, j);
175+
status = 1;
176+
break;
177+
} else if (test_c8[i].err == j + 1) {
178+
printf("c8rtomb %d unexpected success\n", i);
179+
status = 1;
180+
break;
181+
}
182+
off += ret;
183+
if (test_c8[i].c8[j] == '\0')
184+
break;
185+
}
186+
if (test_c8[i].err == 0) {
187+
mb[off] = '\0';
188+
if (strcmp(mb, test_c8[i].mb)) {
189+
printf("c8rtomb %d: expected '%s' got '%s'\n", i, test_c8[i].mb, mb);
190+
status = 1;
191+
}
192+
}
193+
194+
char8_t c8[MAX_C8];
195+
off = 0;
196+
ret = 0;
197+
j = 0;
198+
memset(c8, 0, sizeof(c8));
199+
memset(&mbstate, 0, sizeof(mbstate));
200+
while (test_c8[i].mb[j] != 0 || ret) {
201+
ret = mbrtoc8(c8 + off, &test_c8[i].mb[j], 1, &mbstate);
202+
if (ret == (size_t) -1)
203+
{
204+
if (test_c8[i].err <= j + 1)
205+
break;
206+
printf("mbrtoc8 %d failed at byte %d\n", i, j);
207+
status = 1;
208+
break;
209+
}
210+
switch (ret) {
211+
case (size_t) -2:
212+
j++;
213+
break;
214+
case (size_t) -3:
215+
off++;
216+
break;
217+
default:
218+
j += ret;
219+
off++;
220+
break;
221+
}
222+
}
223+
if (test_c8[i].err == 0) {
224+
for (j = 0; j < MAX_C8; j++) {
225+
if (c8[j] != test_c8[i].c8[j]) {
226+
printf("mbrtoc8 %d[%d]: expected 0x%x got 0x%x\n",
227+
i, j, test_c8[i].c8[j], c8[j]);
228+
status = 1;
229+
}
230+
}
231+
} else if (ret != (size_t) -1) {
232+
#ifdef __PICOLIBC__
233+
printf("mbrtoc8 %d unexpected success\n", i);
234+
status = 1;
235+
#endif
236+
}
237+
}
238+
239+
/* utf-16 tests */
240+
for (i = 0; i < NTEST_C16; i++) {
241+
char mb[MAX_MB];
242+
size_t ret;
243+
size_t off = 0;
244+
memset(mb, 0, sizeof(mb));
245+
memset(&mbstate, 0, sizeof(mbstate));
246+
for (j = 0; test_c16[i].c16[j] != 0; j++) {
247+
ret = c16rtomb(mb + off, test_c16[i].c16[j], &mbstate);
248+
if (ret == (size_t) -1) {
249+
if (test_c16[i].err != 0 && test_c16[i].err == j + 1)
250+
break;
251+
printf("c16rtomb %d failed at byte %d\n", i, j);
252+
status = 1;
253+
break;
254+
} else if (test_c16[i].err == j + 1) {
255+
printf("c16rtomb %d unexpected success\n", i);
256+
status = 1;
257+
break;
258+
}
259+
off += ret;
260+
}
261+
if (test_c16[i].err == 0) {
262+
mb[off] = '\0';
263+
if (strcmp(mb, test_c16[i].mb)) {
264+
printf("test %d: expected '%s' got '%s'\n", i, test_c16[i].mb, mb);
265+
status = 1;
266+
}
267+
}
268+
269+
char16_t c16[MAX_C16];
270+
off = 0;
271+
ret = 0;
272+
j = 0;
273+
memset(c16, 0, sizeof(c16));
274+
memset(&mbstate, 0, sizeof(mbstate));
275+
if (test_c16[i].err == 0) {
276+
while (test_c16[i].mb[j] != 0 || ret) {
277+
ret = mbrtoc16(c16 + off, &test_c16[i].mb[j], 1, &mbstate);
278+
if (ret == (size_t) -1)
279+
{
280+
printf("mbrtoc16 %d failed at byte %d\n", i, j);
281+
status = 1;
282+
break;
283+
}
284+
switch (ret) {
285+
case (size_t) -2:
286+
j++;
287+
break;
288+
case (size_t) -3:
289+
off++;
290+
break;
291+
default:
292+
j += ret;
293+
off++;
294+
break;
295+
}
296+
}
297+
for (j = 0; j < MAX_C16; j++) {
298+
if (c16[j] != test_c16[i].c16[j]) {
299+
printf("mbrtoc16 %d[%d]: expected 0x%x got 0x%x\n",
300+
i, j, test_c16[i].c16[j], c16[j]);
301+
status = 1;
302+
}
303+
}
304+
}
305+
}
306+
for (i = 0; i < NTEST_C32; i++) {
307+
char mb[MAX_MB];
308+
size_t ret;
309+
size_t off = 0;
310+
memset(mb, 0, sizeof(mb));
311+
memset(&mbstate, 0, sizeof(mbstate));
312+
for (j = 0; test_c32[i].c32[j] != 0; j++) {
313+
ret = c32rtomb(mb + off, test_c32[i].c32[j], &mbstate);
314+
if (ret == (size_t) -1) {
315+
if (test_c32[i].err != 0 && test_c32[i].err == j + 1)
316+
break;
317+
printf("c32rtomb %d failed at byte %d\n", i, j);
318+
status = 1;
319+
break;
320+
} else if (test_c32[i].err == j + 1) {
321+
printf("c32rtomb %d unexpected success c32[%u] is %#08lx\n", i, j, (unsigned long) test_c32[i].c32[j]);
322+
status = 1;
323+
break;
324+
}
325+
off += ret;
326+
}
327+
if (test_c32[i].err == 0) {
328+
mb[off] = '\0';
329+
if (strcmp(mb, test_c32[i].mb)) {
330+
printf("test %d: expected '%s' got '%s'\n", i, test_c32[i].mb, mb);
331+
status = 1;
332+
}
333+
}
334+
335+
char32_t c32[MAX_C32];
336+
off = 0;
337+
ret = 0;
338+
j = 0;
339+
memset(c32, 0, sizeof(c32));
340+
memset(&mbstate, 0, sizeof(mbstate));
341+
if (test_c32[i].err == 0) {
342+
while (test_c32[i].mb[j] != 0 || ret) {
343+
ret = mbrtoc32(c32 + off, &test_c32[i].mb[j], 1, &mbstate);
344+
if (ret == (size_t) -1)
345+
{
346+
printf("mbrtoc32 %d failed at byte %d\n", i, j);
347+
status = 1;
348+
break;
349+
}
350+
switch (ret) {
351+
case (size_t) -2:
352+
j++;
353+
break;
354+
case (size_t) -3:
355+
off++;
356+
break;
357+
default:
358+
j += ret;
359+
off++;
360+
break;
361+
}
362+
}
363+
for (j = 0; j < MAX_C32; j++) {
364+
if (c32[j] != test_c32[i].c32[j]) {
365+
printf("mbrtoc32 %d[%d]: expected 0x%08lx got 0x%08lx\n",
366+
i, j, (unsigned long) test_c32[i].c32[j], (unsigned long) c32[j]);
367+
status = 1;
368+
}
369+
}
370+
}
371+
}
372+
return status;
373+
}

0 commit comments

Comments
 (0)