Skip to content

Commit 79b8d0f

Browse files
committed
Merge pull request atomvm#491 from fadushin/bs_utf_ops
Add support for bs_(get|skip)_utf(8|16|32) This PR adds support for the following opcodes: ``` (138) bs_get_utf8 (139) bs_skip_utf8 (140) bs_get_utf16 (141) bs_skip_utf16 (142) bs_get_utf32 (143) bs_skip_utf32 ``` These changes are made under both the "Apache 2.0" and the "GNU Lesser General Public License 2.1 or later" license terms (dual license). SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
2 parents 88d4511 + 61aaf87 commit 79b8d0f

File tree

6 files changed

+770
-3
lines changed

6 files changed

+770
-3
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
3434
- Added functions `esp:sleep_enable_ext0_wakeup/2` and `esp:sleep_enable_ext1_wakeup/2`.
3535
- Added support for FP opcodes 94-102 thus removing the need for `AVM_DISABLE_FP=On` with OTP-22+
3636
- Added support for stacktraces
37+
- Added support for `utf-8`, `utf-16`, and `utf-32` bit syntax modifiers (put and match)
38+
3739

3840
### Fixed
3941
- Fixed issue with formatting integers with io:format() on STM32 platform

src/libAtomVM/bitstring.c

Lines changed: 166 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,31 @@ bool bitstring_insert_any_integer(uint8_t *dst, avm_int_t offset, avm_int64_t va
8282
return true;
8383
}
8484

85+
/**
 * Tell whether a value cannot be used as a Unicode code point.
 *
 * Rejected values are: anything negative, anything above U+10FFFF, and the
 * surrogate range U+D800..U+DFFF.
 */
static bool is_invalid_codepoint(int32_t v)
{
    // outside of the Unicode code space
    if (v < 0 || v > 0x10FFFF) {
        return true;
    }
    // inside the code space: only the surrogate range is rejected
    const bool is_surrogate = (0xD800 <= v) && (v <= 0xDFFF);
    return is_surrogate;
}
89+
90+
//
91+
// UTF-8 encoding
92+
// https://en.wikipedia.org/wiki/UTF-8
93+
// +----------+----------+----------+----------+----------+
94+
// | code pt | buf[0] | buf[1] | buf[2] | buf[3] |
95+
// +----------+----------+----------+----------+----------+
96+
// | U+0000 | 0xxxxxxx | | | |
97+
// +----------+----------+----------+----------+----------+
98+
// | U+0080 | 110xxxxx | 10xxxxxx | | |
99+
// +----------+----------+----------+----------+----------+
100+
// | U+0800 | 1110xxxx | 10xxxxxx | 10xxxxxx | |
101+
// +----------+----------+----------+----------+----------+
102+
// | U+10000 | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx |
103+
// +----------+----------+----------+----------+----------+
104+
//
105+
85106
bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size)
86107
{
87108
size_t sz = 0;
88-
if (c < 0 || c > 0x10FFFF) {
109+
if (is_invalid_codepoint(c)) {
89110
return false;
90111
}
91112
if (c < 0x80) {
@@ -119,10 +140,67 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size)
119140
return true;
120141
}
121142

143+
/**
 * Decode one UTF-8 encoded character found at the start of buf.
 *
 * The lead byte selects the sequence length (1 to 4 bytes); every following
 * byte must be a continuation byte (10xxxxxx). Overlong encodings, surrogates
 * (U+D800..U+DFFF) and values above U+10FFFF are rejected.
 *
 * On success the code point is stored in *c and, when out_size is not NULL,
 * the number of consumed bytes is stored in *out_size. On failure neither
 * output is written.
 *
 * Returns true if a valid character was decoded, false otherwise (including
 * when len is 0 or the sequence is truncated).
 */
bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size)
{
    if (len == 0) {
        return false;
    }

    int32_t v = 0;
    size_t sz = 0;

    if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
        v |= (buf[0] & 0x07) << 18;
        v |= (buf[1] & 0x3F) << 12;
        v |= (buf[2] & 0x3F) << 6;
        v |= (buf[3] & 0x3F);
        // overlong encoding or invalid codepoint; U+10000 itself is the first
        // value that legitimately needs four bytes, so it must be accepted
        if (v < 0x10000 || v > 0x10FFFF) {
            return false;
        }
        sz = 4;
    } else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
        v |= (buf[0] & 0x0F) << 12;
        v |= (buf[1] & 0x3F) << 6;
        v |= (buf[2] & 0x3F);
        // overlong encoding or surrogate
        if (v < 0x800 || (v >= 0xD800 && v <= 0xDFFF)) {
            return false;
        }
        sz = 3;
    } else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
        v |= (buf[0] & 0x1F) << 6;
        v |= (buf[1] & 0x3F);
        // overlong encoding
        if (v < 0x80) {
            return false;
        }
        sz = 2;
    } else if ((buf[0] & 0x80) == 0) {
        v = buf[0] & 0x7F;
        sz = 1;
    } else {
        // stray continuation byte, invalid lead byte or truncated sequence
        return false;
    }

    *c = v;
    // the size is optional output
    if (out_size) {
        *out_size = sz;
    }
    return true;
}
193+
194+
// UTF-16 encoding, when U in U+010000 to U+10FFFF:
195+
//
196+
// U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
197+
// W1 = 110110yyyyyyyyyy // 0xD800 + yyyyyyyyyy
198+
// W2 = 110111xxxxxxxxxx // 0xDC00 + xxxxxxxxxx
199+
122200
bool bitstring_utf16_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags, size_t *out_size)
123201
{
124202
size_t sz = 0;
125-
if (c < 0 || c > 0x10FFFF) {
203+
if (is_invalid_codepoint(c)) {
126204
return false;
127205
}
128206
if (c < 0x10000) {
@@ -158,10 +236,66 @@ bool bitstring_utf16_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_fl
158236
return true;
159237
}
160238

239+
bool bitstring_utf16_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size, enum BitstringFlags bs_flags)
240+
{
241+
if (len == 0) {
242+
return false;
243+
} else if (bs_flags & LittleEndianIntegerMask) {
244+
if (len >= 4 && ((buf[1] & 0xFC) == 0xD8) && ((buf[3] & 0xFC) == 0xDC)) {
245+
int32_t v = 0;
246+
v |= (buf[1] & 0x03) << 18;
247+
v |= (buf[0] & 0xFF) << 10;
248+
v |= (buf[3] & 0x03) << 8;
249+
v |= (buf[2] & 0xFF);
250+
v += 0x10000;
251+
if (is_invalid_codepoint(v)) {
252+
return false;
253+
}
254+
*c = v;
255+
*out_size = 4;
256+
return true;
257+
} else if (len >= 2) {
258+
int32_t v = 0;
259+
v = READ_16LE_UNALIGNED(buf);
260+
if (is_invalid_codepoint(v)) {
261+
return false;
262+
}
263+
*c = v;
264+
*out_size = 2;
265+
return true;
266+
}
267+
} else {
268+
if (len >= 4 && ((buf[0] & 0xFC) == 0xD8) && ((buf[2] & 0xFC) == 0xDC)) {
269+
int32_t v = 0;
270+
v |= (buf[0] & 0x03) << 18;
271+
v |= (buf[1] & 0xFF) << 10;
272+
v |= (buf[2] & 0x03) << 8;
273+
v |= (buf[3] & 0xFF);
274+
v += 0x10000;
275+
if (is_invalid_codepoint(v)) {
276+
return false;
277+
}
278+
*c = v;
279+
*out_size = 4;
280+
return true;
281+
} else if (len >= 2) {
282+
int32_t v = 0;
283+
v = READ_16_UNALIGNED(buf);
284+
if (is_invalid_codepoint(v)) {
285+
return false;
286+
}
287+
*c = v;
288+
*out_size = 2;
289+
return true;
290+
}
291+
}
292+
return false;
293+
}
294+
161295
bool bitstring_utf32_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags)
162296
{
163297
UNUSED(bs_flags);
164-
if (c < 0 || c > 0x10FFFF) {
298+
if (is_invalid_codepoint(c)) {
165299
return false;
166300
}
167301
if (bs_flags & LittleEndianIntegerMask) {
@@ -177,3 +311,32 @@ bool bitstring_utf32_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_fl
177311
}
178312
return true;
179313
}
314+
315+
bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum BitstringFlags bs_flags)
316+
{
317+
if (len < 4) {
318+
return false;
319+
} else if (bs_flags & LittleEndianIntegerMask) {
320+
int32_t v = 0;
321+
v |= (buf[3] & 0xFF) << 24;
322+
v |= (buf[2] & 0xFF) << 16;
323+
v |= (buf[1] & 0xFF) << 8;
324+
v |= buf[0] & 0xFF;
325+
if (is_invalid_codepoint(v)) {
326+
return false;
327+
}
328+
*c = v;
329+
return true;
330+
} else {
331+
int32_t v = 0;
332+
v |= (buf[0] & 0xFF) << 24;
333+
v |= (buf[1] & 0xFF) << 16;
334+
v |= (buf[2] & 0xFF) << 8;
335+
v |= buf[3] & 0xFF;
336+
if (is_invalid_codepoint(v)) {
337+
return false;
338+
}
339+
*c = v;
340+
return true;
341+
}
342+
}

src/libAtomVM/bitstring.h

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,19 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int
313313
*/
314314
bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size);
315315

316+
/**
317+
* @brief Decode a character from UTF-8.
318+
*
319+
* @param c int value to decode to
320+
* @param buf the buffer from which to decode the character; it is read directly, so it
321+
* must not be NULL when len is non-zero
322+
* @param len the length (in bytes) of the bytes in buf
323+
* @param out_size the size in bytes, on output (if not NULL)
324+
* @return \c true if decoding was successful, \c false if character starting at buf is not a valid
325+
* unicode character
326+
*/
327+
bool bitstring_utf8_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size);
328+
316329
/**
317330
* @brief Encode a character to UTF-16.
318331
*
@@ -326,6 +339,20 @@ bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size);
326339
*/
327340
bool bitstring_utf16_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags, size_t *out_size);
328341

342+
/**
343+
* @brief Decode a character from UTF-16.
344+
*
345+
* @param c int value to decode to
346+
* @param buf the buffer from which to decode the character; it is read directly, so it
347+
* must not be NULL when len is non-zero
348+
* @param len the length (in bytes) of the bytes in buf
349+
* @param bs_flags flags to decode the character (undefined/little/big/native)
350+
* @param out_size the size in bytes, on output (if not NULL)
351+
* @return \c true if decoding was successful, \c false if character starting at buf is not a valid
352+
* unicode character
353+
*/
354+
bool bitstring_utf16_decode(const uint8_t *buf, size_t len, int32_t *c, size_t *out_size, enum BitstringFlags bs_flags);
355+
329356
/**
330357
* @brief Encode a character to UTF-32.
331358
*
@@ -337,6 +364,19 @@ bool bitstring_utf16_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_fl
337364
*/
338365
bool bitstring_utf32_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags);
339366

367+
/**
368+
* @brief Decode a character from UTF-32.
369+
*
370+
* @param c int value to decode to
371+
* @param buf the buffer from which to decode the character; it is read directly, so it
372+
* must not be NULL when len is non-zero
373+
* @param len the length (in bytes) of the bytes in buf
374+
* @param bs_flags flags to decode the character (undefined/little/big/native)
375+
* @return \c true if decoding was successful, \c false if character starting at buf is not a valid
376+
* unicode character
377+
*/
378+
bool bitstring_utf32_decode(const uint8_t *buf, size_t len, int32_t *c, enum BitstringFlags bs_flags);
379+
340380
/**
341381
* @brief Compute the size of a character when UTF-8 encoded.
342382
*
@@ -379,6 +419,23 @@ static inline bool bitstring_insert_utf8(term dst_bin, size_t offset, avm_int_t
379419
return bitstring_utf8_encode(c, dst, out_size);
380420
}
381421

422+
/**
423+
* @brief Match a character in UTF-8 format
424+
*
425+
* @param src_bin binary to match against
426+
* @param offset offset, in bits, to where to start to match the character
427+
* @param c int to decode to
428+
* @param out_size the size in bytes, on output
429+
* @return \c true if encoding was successful, \c false if src_bin at offset is not a valid
430+
* unicode character
431+
*/
432+
static inline bool bitstring_match_utf8(term src_bin, size_t offset, int32_t *c, size_t *out_size)
433+
{
434+
size_t byte_offset = offset >> 3; // divide by 8
435+
const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
436+
return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size);
437+
}
438+
382439
/**
383440
* @brief Insert a character in UTF-16 format
384441
*
@@ -397,6 +454,24 @@ static inline bool bitstring_insert_utf16(term dst_bin, size_t offset, avm_int_t
397454
return bitstring_utf16_encode(c, dst, bs_flags, out_size);
398455
}
399456

457+
/**
458+
* @brief Match a character in UTF-16 format
459+
*
460+
* @param src_bin binary to match against
461+
* @param offset offset, in bits, to where to start to match the character
462+
* @param c int to decode to
463+
* @param bs_flags flags to decode the character (undefined/little/big/native)
464+
* @param out_size the size in bytes, on output
465+
* @return \c true if encoding was successful, \c false if src_bin at offset is not a valid
466+
* unicode character
467+
*/
468+
static inline bool bitstring_match_utf16(term src_bin, size_t offset, int32_t *c, size_t *out_size, enum BitstringFlags bs_flags)
469+
{
470+
size_t byte_offset = offset >> 3; // divide by 8
471+
const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
472+
return bitstring_utf16_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size, bs_flags);
473+
}
474+
400475
/**
401476
* @brief Insert a character in UTF-32 format
402477
*
@@ -414,6 +489,23 @@ static inline bool bitstring_insert_utf32(term dst_bin, size_t offset, avm_int_t
414489
return bitstring_utf32_encode(c, dst, bs_flags);
415490
}
416491

492+
/**
493+
* @brief Match a character in UTF-32 format
494+
*
495+
* @param src_bin binary to match against
496+
* @param offset offset, in bits, to where to start to match the character
497+
* @param c int to decode to
498+
* @param bs_flags flags to decode the character (undefined/little/big/native)
499+
* @return \c true if encoding was successful, \c false if src_bin at offset is not a valid
500+
* unicode character
501+
*/
502+
static inline bool bitstring_match_utf32(term src_bin, size_t offset, int32_t *c, enum BitstringFlags bs_flags)
503+
{
504+
size_t byte_offset = offset >> 3; // divide by 8
505+
const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
506+
return bitstring_utf32_decode(src, term_binary_size(src_bin) - byte_offset, c, bs_flags);
507+
}
508+
417509
#ifdef __cplusplus
418510
}
419511
#endif

src/libAtomVM/opcodes.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,12 @@
126126
#define OP_BS_PRIVATE_APPEND 135
127127
#define OP_TRIM 136
128128
#define OP_BS_INIT_BITS 137
129+
// Opcode numbers 138-143, named after the bs_get_utf8 / bs_skip_utf8 / bs_get_utf16 /
// bs_skip_utf16 / bs_get_utf32 / bs_skip_utf32 opcodes listed in this commit's description.
#define OP_BS_GET_UTF8 138
130+
#define OP_BS_SKIP_UTF8 139
131+
#define OP_BS_GET_UTF16 140
132+
#define OP_BS_SKIP_UTF16 141
133+
#define OP_BS_GET_UTF32 142
134+
#define OP_BS_SKIP_UTF32 143
129135
#define OP_BS_UTF8_SIZE 144
130136
#define OP_BS_PUT_UTF8 145
131137
#define OP_BS_UTF16_SIZE 146

0 commit comments

Comments
 (0)