Skip to content

Commit f6f6875

Browse files
committed
uchar: Add uchar implementation
This provides the C23 uchar.h and associated functions for translating between UTF-8, UTF-16, UTF-32 and C multibyte encodings. Signed-off-by: Keith Packard <[email protected]>
1 parent e8a719d commit f6f6875

File tree

16 files changed

+849
-66
lines changed

16 files changed

+849
-66
lines changed

newlib/libc/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,6 @@ add_subdirectory(stdlib)
5151
add_subdirectory(string)
5252
add_subdirectory(time)
5353
add_subdirectory(tinystdio)
54+
add_subdirectory(uchar)
5455
add_subdirectory(xdr)
5556
add_subdirectory(locale)

newlib/libc/include/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ picolibc_headers(""
9292
termios.h
9393
threads.h
9494
time.h
95+
uchar.h
9596
unctrl.h
9697
unistd.h
9798
utime.h

newlib/libc/include/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ inc_headers = [
9191
'termios.h',
9292
'threads.h',
9393
'time.h',
94+
'uchar.h',
9495
'unctrl.h',
9596
'unistd.h',
9697
'utime.h',

newlib/libc/include/sys/_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ typedef struct
209209
{
210210
wint_t __wch;
211211
unsigned char __wchb[4];
212+
__uint32_t __ucs;
212213
} __value; /* Value so far. */
213214
} _mbstate_t;
214215
#endif

newlib/libc/include/uchar.h

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
* SPDX-License-Identifier: BSD-3-Clause
3+
*
4+
* Copyright © 2024 Keith Packard
5+
*
6+
* Redistribution and use in source and binary forms, with or without
7+
* modification, are permitted provided that the following conditions
8+
* are met:
9+
*
10+
* 1. Redistributions of source code must retain the above copyright
11+
* notice, this list of conditions and the following disclaimer.
12+
*
13+
* 2. Redistributions in binary form must reproduce the above
14+
* copyright notice, this list of conditions and the following
15+
* disclaimer in the documentation and/or other materials provided
16+
* with the distribution.
17+
*
18+
* 3. Neither the name of the copyright holder nor the names of its
19+
* contributors may be used to endorse or promote products derived
20+
* from this software without specific prior written permission.
21+
*
22+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25+
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26+
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
27+
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30+
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31+
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33+
* OF THE POSSIBILITY OF SUCH DAMAGE.
34+
*/
35+
36+
#ifndef _UCHAR_H_
#define _UCHAR_H_

#include <sys/cdefs.h>
#include <machine/_default_types.h>
#include <sys/_types.h>
#define __need_size_t
#include <stddef.h>

/*
 * Version macro for this header. C23 spells it with two trailing
 * underscores.
 */
#define __STDC_VERSION_UCHAR_H__ 202311L

_BEGIN_STD_C

#ifndef _MBSTATE_DECLARED
typedef _mbstate_t mbstate_t;
#define _MBSTATE_DECLARED
#endif

/*
 * char8_t is a built-in type in C++ whenever __cpp_char8_t is defined;
 * a typedef of the same name would not compile there.
 */
#ifndef __cpp_char8_t
typedef unsigned char char8_t;
#endif

/*
 * char16_t and char32_t are keywords starting with C++11, so they are
 * only provided as typedefs for C and for older C++ dialects.
 */
#if !defined(__cplusplus) || __cplusplus < 201103L
typedef __uint_least16_t char16_t;
typedef __uint_least32_t char32_t;
#endif

/* Multibyte <-> UTF-8 code unit conversion */
size_t mbrtoc8(char8_t * __restrict pc8, const char * __restrict s, size_t n,
               _mbstate_t * __restrict ps);

size_t c8rtomb(char * __restrict s, char8_t c8, _mbstate_t * __restrict ps);

/* Multibyte <-> UTF-16 code unit conversion */
size_t mbrtoc16(char16_t * __restrict pc16, const char * __restrict s, size_t n,
                _mbstate_t * __restrict ps);

size_t c16rtomb(char * __restrict s, char16_t c16, _mbstate_t * __restrict ps);

/* Multibyte <-> UTF-32 code unit conversion */
size_t mbrtoc32(char32_t * __restrict pc32, const char * __restrict s, size_t n,
                _mbstate_t * __restrict ps);

size_t c32rtomb(char * __restrict s, char32_t c32, _mbstate_t * __restrict ps);

_END_STD_C

#endif /* _UCHAR_H_ */

newlib/libc/meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535

3636
libdirs = ['argz', 'ctype', 'errno', 'iconv', 'misc',
3737
'posix', 'search', 'signal', 'ssp', 'stdlib',
38-
'string', 'time', 'xdr', 'locale']
38+
'string', 'time', 'xdr', 'locale', 'uchar']
3939

4040
if enable_picolib
4141
libdirs += 'picolib'

newlib/libc/uchar/CMakeLists.txt

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#
2+
# SPDX-License-Identifier: BSD-3-Clause
3+
#
4+
# Copyright © 2024 Keith Packard
5+
#
6+
# Redistribution and use in source and binary forms, with or without
7+
# modification, are permitted provided that the following conditions
8+
# are met:
9+
#
10+
# 1. Redistributions of source code must retain the above copyright
11+
# notice, this list of conditions and the following disclaimer.
12+
#
13+
# 2. Redistributions in binary form must reproduce the above
14+
# copyright notice, this list of conditions and the following
15+
# disclaimer in the documentation and/or other materials provided
16+
# with the distribution.
17+
#
18+
# 3. Neither the name of the copyright holder nor the names of its
19+
# contributors may be used to endorse or promote products derived
20+
# from this software without specific prior written permission.
21+
#
22+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23+
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24+
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25+
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26+
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
27+
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29+
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30+
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31+
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33+
# OF THE POSSIBILITY OF SUCH DAMAGE.
34+
#
35+
# Source files of the uchar directory; the names match the conversion
# functions declared in uchar.h.
picolibc_sources(
36+
c16rtomb.c
37+
c32rtomb.c
38+
c8rtomb.c
39+
mbrtoc16.c
40+
mbrtoc32.c
41+
mbrtoc8.c
42+
)

newlib/libc/uchar/c16rtomb.c

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
* SPDX-License-Identifier: BSD-3-Clause
3+
*
4+
* Copyright © 2024 Keith Packard
5+
*
6+
* Redistribution and use in source and binary forms, with or without
7+
* modification, are permitted provided that the following conditions
8+
* are met:
9+
*
10+
* 1. Redistributions of source code must retain the above copyright
11+
* notice, this list of conditions and the following disclaimer.
12+
*
13+
* 2. Redistributions in binary form must reproduce the above
14+
* copyright notice, this list of conditions and the following
15+
* disclaimer in the documentation and/or other materials provided
16+
* with the distribution.
17+
*
18+
* 3. Neither the name of the copyright holder nor the names of its
19+
* contributors may be used to endorse or promote products derived
20+
* from this software without specific prior written permission.
21+
*
22+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25+
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26+
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
27+
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30+
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31+
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33+
* OF THE POSSIBILITY OF SUCH DAMAGE.
34+
*/
35+
36+
#include "uchar-local.h"
37+
38+
size_t
39+
c16rtomb (char *s, char16_t c16, mbstate_t *ps)
40+
{
41+
static NEWLIB_THREAD_LOCAL mbstate_t local_state;
42+
43+
if (ps == NULL)
44+
ps = &local_state;
45+
46+
#if __SIZEOF_WCHAR_T__ == 2
47+
return wcrtomb(s, (wchar_t) c16, ps);
48+
#elif __SIZEOF_WCHAR_T__ == 4
49+
char32_t c32;
50+
51+
/* High surrogate */
52+
if (char16_is_high_surrogate(c16)) {
53+
if (ps->__count != 0) {
54+
errno = EILSEQ;
55+
return (size_t) -1;
56+
}
57+
ps->__value.__ucs = ((char32_t) (c16 & 0x3ff) << 10) | 0x10000;
58+
ps->__count = -1;
59+
return 0;
60+
} else if (char16_is_low_surrogate(c16)) {
61+
if (ps->__count == -1) {
62+
c32 = ps->__value.__ucs | (c16 & 0x3ff);
63+
ps->__count = 0;
64+
} else {
65+
errno = EILSEQ;
66+
return (size_t) -1;
67+
}
68+
} else {
69+
c32 = c16;
70+
}
71+
return wcrtomb(s, (wchar_t) c32, ps);
72+
#else
73+
#error wchar_t size unknown
74+
#endif
75+
}

newlib/libc/uchar/c32rtomb.c

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* SPDX-License-Identifier: BSD-3-Clause
3+
*
4+
* Copyright © 2024 Keith Packard
5+
*
6+
* Redistribution and use in source and binary forms, with or without
7+
* modification, are permitted provided that the following conditions
8+
* are met:
9+
*
10+
* 1. Redistributions of source code must retain the above copyright
11+
* notice, this list of conditions and the following disclaimer.
12+
*
13+
* 2. Redistributions in binary form must reproduce the above
14+
* copyright notice, this list of conditions and the following
15+
* disclaimer in the documentation and/or other materials provided
16+
* with the distribution.
17+
*
18+
* 3. Neither the name of the copyright holder nor the names of its
19+
* contributors may be used to endorse or promote products derived
20+
* from this software without specific prior written permission.
21+
*
22+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25+
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26+
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
27+
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30+
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31+
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33+
* OF THE POSSIBILITY OF SUCH DAMAGE.
34+
*/
35+
36+
#include "uchar-local.h"
37+
38+
size_t
39+
c32rtomb (char *s, char32_t c32, mbstate_t *ps)
40+
{
41+
static NEWLIB_THREAD_LOCAL mbstate_t local_state;
42+
43+
if (ps == NULL)
44+
ps = &local_state;
45+
46+
if (!char32_is_valid(c32)) {
47+
errno = EILSEQ;
48+
return (size_t) -1;
49+
}
50+
#if __SIZEOF_WCHAR_T__ == 2
51+
if (char32_needs_surrogates(c32)) {
52+
const wchar_t wc[2] = {
53+
[0] = ((c32 - 0x10000) >> 10) + HIGH_SURROGATE_FIRST,
54+
[1] = (c32 & 0x3ff) + LOW_SURROGATE_FIRST,
55+
};
56+
const wchar_t *wcp = wc;
57+
return wcsnrtombs(s, &wcp, 2, SIZE_MAX, ps);
58+
}
59+
#endif
60+
return wcrtomb(s, (wchar_t) c32, ps);
61+
}

0 commit comments

Comments
 (0)