Skip to content

Commit ca8d8de

Browse files
authored
Add missing unicode module
Implement `unicode:characters_to_list/1,2` and `unicode:characters_to_binary/1,2,3` using new `interop_iolist_fold`. This fixes `io_lib:format/2` with `t` modifier. Rename `interop_iolist_fold` to `interop_chardata_fold` because it really processes `iodata` and now `chardata` as it works for unicode. Signed-off-by: Paul Guyot <[email protected]>
1 parent ce0be6c commit ca8d8de

File tree

17 files changed

+789
-35
lines changed

17 files changed

+789
-35
lines changed

.clang-format-ignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
# We have a number of existing files that are quite "re-format unfriendly"
88
# Let's ignore all of them
99
src/libAtomVM/bif.c
10-
src/libAtomVM/bitstring.c
1110
src/libAtomVM/bitstring.h
1211
src/libAtomVM/debug.h
1312
src/libAtomVM/defaultatoms.c

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
3838
- Added support for Erlang `gpio:close/1` and Elixir `GPIO.close/1` for ESP32
3939
- Added support for the Erlang `gen_event` module
4040
- Added `start_link` support for the `network` module
41-
- Added support for `erlang:monotomic_time/1`
41+
- Added support for `erlang:monotonic_time/1`
4242
- Added `start_link` support for the `gen_statem` module
4343
- Added support for serializing floats in erlang external term encoding
4444
- Added support for the `SMALL_BIG_EXT` erlang external term encoding
@@ -56,6 +56,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
5656
- Added `esp:partition_list/0` function
5757
- Added `esp:nvs_fetch_binary/2` and `nvs_put_binary/3` functions (`esp:nvs_set_binary` and
5858
functions that default to `?ATOMVM_NVS_NS` are deprecated now).
59+
- Added most format possibilities to `io:format/2` and `io_lib:format/2`
60+
- Added `unicode` module with `characters_to_list/1,2` and `characters_to_binary/1,2,3` functions
5961

6062
### Fixed
6163
- Fixed issue with formatting integers with io:format() on STM32 platform

libs/estdlib/src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ set(ERLANG_MODULES
4242
proplists
4343
string
4444
timer
45+
unicode
4546
erlang
4647
)
4748

libs/estdlib/src/unicode.erl

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
%
2+
% This file is part of AtomVM.
3+
%
4+
% Copyright 2023 Paul Guyot <[email protected]>
5+
%
6+
% Licensed under the Apache License, Version 2.0 (the "License");
7+
% you may not use this file except in compliance with the License.
8+
% You may obtain a copy of the License at
9+
%
10+
% http://www.apache.org/licenses/LICENSE-2.0
11+
%
12+
% Unless required by applicable law or agreed to in writing, software
13+
% distributed under the License is distributed on an "AS IS" BASIS,
14+
% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
% See the License for the specific language governing permissions and
16+
% limitations under the License.
17+
%
18+
% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
19+
%
20+
21+
%%-----------------------------------------------------------------------------
22+
%% @doc An implementation of the Erlang/OTP unicode interface.
23+
%%
24+
%% This module implements a strict subset of the Erlang/OTP unicode
25+
%% interface.
26+
%% @end
27+
%%-----------------------------------------------------------------------------
28+
-module(unicode).
29+
30+
-export([
31+
characters_to_list/1,
32+
characters_to_list/2,
33+
characters_to_binary/1,
34+
characters_to_binary/2,
35+
characters_to_binary/3
36+
]).
37+
38+
%% A UTF-8 encoded binary.
39+
-type unicode_binary() :: binary().
40+
41+
%% Latin-1 encoded data
42+
-type latin1_chardata() :: iodata().
43+
44+
%% Unicode or UTF-8 encoded data
45+
-type chardata() :: charlist() | unicode_binary().
46+
-type charlist() :: maybe_improper_list(
47+
char() | unicode_binary() | charlist(), unicode_binary() | []
48+
).
49+
50+
-type encoding() :: utf8 | latin1.
51+
52+
-export_type([
53+
unicode_binary/0,
54+
latin1_chardata/0,
55+
chardata/0,
56+
charlist/0,
57+
encoding/0
58+
]).
59+
60+
%% @doc Convert UTF-8 data to a list of Unicode characters.
61+
%% <p>If conversion fails, the function returns a tuple with three elements:</p>
62+
%% <ul>
63+
%% <li>First element is <code>error</code> or <code>incomplete</code>. <code>incomplete</code> means the conversion failed because of an incomplete unicode transform at the very end of data.</li>
64+
%% <li>Second element is what has been converted so far.</li>
65+
%% <li>Third element is the remaining data to be converted, for debugging purposes. This remaining data can differ from what Erlang/OTP returns.</li>
66+
%% </ul>
67+
%% @param Data data to convert to Unicode
68+
%% @return a list of characters or a tuple if conversion failed.
69+
-spec characters_to_list(Data :: chardata() | latin1_chardata()) ->
70+
list() | {error, list(), chardata() | latin1_chardata()} | {incomplete, list(), binary()}.
71+
characters_to_list(_Data) ->
72+
erlang:nif_error(undefined).
73+
74+
%% @doc Convert UTF-8 or Latin1 data to a list of Unicode characters.
75+
%% @see characters_to_list/1
76+
%% @param Data data to convert
77+
%% @param Encoding encoding of data to convert
78+
%% @return a list of characters or a tuple if conversion failed.
79+
-spec characters_to_list(Data :: chardata() | latin1_chardata(), Encoding :: encoding()) ->
80+
list()
81+
| {error, list(), chardata() | latin1_chardata()}
82+
| {incomplete, list(), chardata() | latin1_chardata()}.
83+
characters_to_list(_Data, _Encoding) ->
84+
erlang:nif_error(undefined).
85+
86+
%% @doc Convert character data to a UTF-8 binary
87+
%% @equiv characters_to_binary(Data, utf8, utf8)
88+
%% @param Data data to convert to UTF8
89+
%% @return a UTF-8 binary or a tuple if conversion failed.
90+
-spec characters_to_binary(Data :: chardata() | latin1_chardata()) ->
91+
unicode_binary()
92+
| {error, list(), chardata() | latin1_chardata()}
93+
| {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
94+
characters_to_binary(_Data) ->
95+
erlang:nif_error(undefined).
96+
97+
%% @doc Convert character data in a given encoding to a UTF-8 binary
98+
%% @equiv characters_to_binary(Data, InEncoding, utf8)
99+
%% @param Data data to convert to UTF8
100+
%% @param InEncoding encoding of data
101+
%% @return a UTF-8 binary or a tuple if conversion failed.
102+
-spec characters_to_binary(Data :: chardata() | latin1_chardata(), InEncoding :: encoding()) ->
103+
unicode_binary()
104+
| {error, list(), chardata() | latin1_chardata()}
105+
| {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
106+
characters_to_binary(_Data, _InEncoding) ->
107+
erlang:nif_error(undefined).
108+
109+
%% @doc Convert character data in a given encoding to a binary in a given encoding.
110+
%% <p>If conversion fails, the function returns a tuple with three elements:</p>
111+
%% <ul>
112+
%% <li>First element is <code>error</code> or <code>incomplete</code>. <code>incomplete</code> means the conversion failed because of an incomplete unicode transform at the very end of data.</li>
113+
%% <li>Second element is what has been converted so far.</li>
114+
%% <li>Third element is the remaining data to be converted, for debugging purposes. This remaining data can differ from what Erlang/OTP returns.</li>
115+
%% </ul>
116+
%% <p>Also, Erlang/OTP's implementation may error with <code>badarg</code> for parameters
117+
%% for which this function merely returns an error tuple.</p>
118+
%% @param Data data to convert
119+
%% @param InEncoding encoding of input data
120+
%% @param OutEncoding output encoding
121+
%% @return an encoded binary or a tuple if conversion failed.
122+
-spec characters_to_binary(
123+
Data :: chardata() | latin1_chardata(), InEncoding :: encoding(), OutEncoding :: encoding()
124+
) ->
125+
unicode_binary()
126+
| {error, list(), chardata() | latin1_chardata()}
127+
| {incomplete, unicode_binary(), chardata() | latin1_chardata()}.
128+
characters_to_binary(_Data, _InEncoding, _OutEncoding) ->
129+
erlang:nif_error(undefined).

src/libAtomVM/bitstring.c

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,7 @@
2323

2424
static inline uint64_t from_le64(uint64_t value)
2525
{
26-
return ((((value) & 0xFF) << 56) | (((value) & 0xFF00) << 40) | (((value) & 0xFF0000) << 24) | \
27-
(((value) & 0xFF000000) << 8) | (((value) & 0xFF00000000) >> 8) | (((value) & 0xFF0000000000) >> 24) | \
28-
(((value) & 0xFF000000000000) >> 40) | (((value) & 0xFF00000000000000) >> 56));
26+
return ((((value) &0xFF) << 56) | (((value) &0xFF00) << 40) | (((value) &0xFF0000) << 24) | (((value) &0xFF000000) << 8) | (((value) &0xFF00000000) >> 8) | (((value) &0xFF0000000000) >> 24) | (((value) &0xFF000000000000) >> 40) | (((value) &0xFF00000000000000) >> 56));
2927
}
3028

3129
bool bitstring_extract_any_integer(const uint8_t *src, size_t offset, avm_int_t n,
@@ -140,12 +138,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size)
140138
return true;
141139
}
142140

143-
bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size)
141+
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
144142
{
145143
if (len == 0) {
146-
return false;
144+
return UnicodeTransformDecodeFail;
147145
} else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
148-
int32_t v = 0;
146+
uint32_t v = 0;
149147
v |= (buf[0] & 0x07) << 18;
150148
v |= (buf[1] & 0x3F) << 12;
151149
v |= (buf[2] & 0x3F) << 6;
@@ -156,9 +154,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
156154
}
157155
*c = v;
158156
*out_size = 4;
159-
return true;
157+
return UnicodeTransformDecodeSuccess;
160158
} else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
161-
int32_t v = 0;
159+
uint32_t v = 0;
162160
v |= (buf[0] & 0x0F) << 12;
163161
v |= (buf[1] & 0x3F) << 6;
164162
v |= (buf[2] & 0x3F);
@@ -168,9 +166,9 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
168166
}
169167
*c = v;
170168
*out_size = 3;
171-
return true;
169+
return UnicodeTransformDecodeSuccess;
172170
} else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
173-
int32_t v = 0;
171+
uint32_t v = 0;
174172
v |= (buf[0] & 0x1F) << 6;
175173
v |= (buf[1] & 0x3F);
176174
// overlong encoding
@@ -179,16 +177,28 @@ bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *o
179177
}
180178
*c = v;
181179
*out_size = 2;
182-
return true;
180+
return UnicodeTransformDecodeSuccess;
183181
} else if ((*buf & 0x80) == 0) {
184-
int32_t v = 0;
182+
uint32_t v = 0;
185183
v |= (buf[0] & 0x7F);
186184
*c = v;
187185
*out_size = 1;
188-
return true;
186+
return UnicodeTransformDecodeSuccess;
187+
} else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
188+
return UnicodeTransformDecodeIncomplete;
189+
} else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) {
190+
return UnicodeTransformDecodeIncomplete;
191+
} else if (len == 1 && (buf[0] & 0xF8) == 0xF0) {
192+
return UnicodeTransformDecodeIncomplete;
193+
} else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) {
194+
return UnicodeTransformDecodeIncomplete;
195+
} else if (len == 1 && (buf[0] & 0xF0) == 0xE0) {
196+
return UnicodeTransformDecodeIncomplete;
197+
} else if (len == 1 && (buf[0] & 0xE0) == 0xC0) {
198+
return UnicodeTransformDecodeIncomplete;
189199
}
190200

191-
return false;
201+
return UnicodeTransformDecodeFail;
192202
}
193203

194204
// UTF-16 encoding, when U in U+010000 to U+10FFFF:
@@ -321,7 +331,7 @@ bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit
321331
v |= (buf[3] & 0xFF) << 24;
322332
v |= (buf[2] & 0xFF) << 16;
323333
v |= (buf[1] & 0xFF) << 8;
324-
v |= buf[0] & 0xFF;
334+
v |= buf[0] & 0xFF;
325335
if (is_invalid_codepoint(v)) {
326336
return false;
327337
}
@@ -332,7 +342,7 @@ bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum Bit
332342
v |= (buf[0] & 0xFF) << 24;
333343
v |= (buf[1] & 0xFF) << 16;
334344
v |= (buf[2] & 0xFF) << 8;
335-
v |= buf[3] & 0xFF;
345+
v |= buf[3] & 0xFF;
336346
if (is_invalid_codepoint(v)) {
337347
return false;
338348
}

src/libAtomVM/bitstring.h

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,13 @@ enum BitstringFlags
9999
#endif
100100
};
101101

102+
enum UnicodeTransformDecodeResult
103+
{
104+
UnicodeTransformDecodeSuccess,
105+
UnicodeTransformDecodeFail,
106+
UnicodeTransformDecodeIncomplete
107+
};
108+
102109
union maybe_unsigned_int8
103110
{
104111
uint8_t u;
@@ -320,10 +327,12 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size);
320327
* @param len the length (in bytes) of the bytes in buf
321328
* @param c int value to decode to or NULL to only compute the size.
322329
* @param out_size the size in bytes, on output (if not NULL)
323-
* @return \c true if decoding was successful, \c false if character starting at buf is not a valid
324-
* unicode character
330+
* @return \c UnicodeTransformDecodeSuccess if decoding was successful,
331+
* \c UnicodeTransformDecodeFail if character starting at buf is not a valid
332+
* unicode character or \c UnicodeTransformDecodeIncomplete if character
333+
* starting at buf is a valid but incomplete transformation
325334
*/
326-
bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size);
335+
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
327336

328337
/**
329338
* @brief Encode a character to UTF-16.
@@ -428,11 +437,11 @@ static inline bool bitstring_insert_utf8(term dst_bin, size_t offset, avm_int_t
428437
* @return \c true if decoding was successful, \c false if src_bin at offset is not a valid
429438
* unicode character
430439
*/
431-
static inline bool bitstring_match_utf8(term src_bin, size_t offset, int32_t *c, size_t *out_size)
440+
static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c, size_t *out_size)
432441
{
433442
size_t byte_offset = offset >> 3; // divide by 8
434443
const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
435-
return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size);
444+
return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
436445
}
437446

438447
/**

src/libAtomVM/defaultatoms.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ static const char *const attributes_atom = "\xA" "attributes";
143143
static const char *const compile_atom = "\x7" "compile";
144144
static const char *const exports_atom = "\x7" "exports";
145145

146+
static const char *const incomplete_atom = "\xA" "incomplete";
147+
146148
void defaultatoms_init(GlobalContext *glb)
147149
{
148150
int ok = 1;
@@ -270,6 +272,8 @@ void defaultatoms_init(GlobalContext *glb)
270272
ok &= globalcontext_insert_atom(glb, compile_atom) == COMPILE_ATOM_INDEX;
271273
ok &= globalcontext_insert_atom(glb, exports_atom) == EXPORTS_ATOM_INDEX;
272274

275+
ok &= globalcontext_insert_atom(glb, incomplete_atom) == INCOMPLETE_ATOM_INDEX;
276+
273277
if (!ok) {
274278
AVM_ABORT();
275279
}

src/libAtomVM/defaultatoms.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,9 @@ extern "C" {
152152
#define COMPILE_ATOM_INDEX 97
153153
#define EXPORTS_ATOM_INDEX 98
154154

155-
#define PLATFORM_ATOMS_BASE_INDEX 99
155+
#define INCOMPLETE_ATOM_INDEX 99
156+
157+
#define PLATFORM_ATOMS_BASE_INDEX 100
156158

157159
#define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
158160
#define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
@@ -279,6 +281,8 @@ extern "C" {
279281
#define COMPILE_ATOM TERM_FROM_ATOM_INDEX(COMPILE_ATOM_INDEX)
280282
#define EXPORTS_ATOM TERM_FROM_ATOM_INDEX(EXPORTS_ATOM_INDEX)
281283

284+
#define INCOMPLETE_ATOM TERM_FROM_ATOM_INDEX(INCOMPLETE_ATOM_INDEX)
285+
282286
void defaultatoms_init(GlobalContext *glb);
283287

284288
void platform_defaultatoms_init(GlobalContext *glb);

0 commit comments

Comments
 (0)