Skip to content

Commit 320d476

Browse files
committed
Add bitstring UTF opcodes
This commit adds support for the following opcodes - bs_utf8_size - bs_put_utf8 - bs_utf16_size - bs_put_utf16 - bs_put_utf32 Signed-off-by: Paul Guyot <[email protected]>
1 parent d9040e9 commit 320d476

File tree

7 files changed

+528
-0
lines changed

7 files changed

+528
-0
lines changed

src/libAtomVM/bitstring.c

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,99 @@ bool bitstring_insert_any_integer(uint8_t *dst, avm_int_t offset, avm_int64_t va
7676
}
7777
return true;
7878
}
79+
80+
/*
 * Encode code point `c` as UTF-8.
 *
 * buf      destination (up to 4 bytes are written), or NULL to only compute
 *          the encoded size.
 * out_size receives the number of bytes of the encoding; may be NULL.
 *
 * Returns true on success, false when `c` is not a Unicode scalar value:
 * negative, above 0x10FFFF, or a surrogate (0xD800-0xDFFF). Nothing is
 * written to buf or out_size on failure.
 */
bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size)
{
    // Surrogates are reserved for UTF-16 and have no well-formed UTF-8 encoding
    if (c < 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
        return false;
    }

    size_t sz;
    if (c < 0x80) {
        // 0xxxxxxx
        if (buf) {
            buf[0] = (uint8_t) c;
        }
        sz = 1;
    } else if (c < 0x800) {
        // 110xxxxx 10xxxxxx
        if (buf) {
            buf[0] = (uint8_t) ((c >> 6) | 0xC0);
            buf[1] = (uint8_t) ((c & 0x3F) | 0x80);
        }
        sz = 2;
    } else if (c < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        if (buf) {
            buf[0] = (uint8_t) ((c >> 12) | 0xE0);
            buf[1] = (uint8_t) (((c >> 6) & 0x3F) | 0x80);
            buf[2] = (uint8_t) ((c & 0x3F) | 0x80);
        }
        sz = 3;
    } else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if (buf) {
            buf[0] = (uint8_t) ((c >> 18) | 0xF0);
            buf[1] = (uint8_t) (((c >> 12) & 0x3F) | 0x80);
            buf[2] = (uint8_t) (((c >> 6) & 0x3F) | 0x80);
            buf[3] = (uint8_t) ((c & 0x3F) | 0x80);
        }
        sz = 4;
    }

    // out_size is documented as optional, so do not dereference a NULL pointer
    if (out_size) {
        *out_size = sz;
    }
    return true;
}
116+
117+
/*
 * Encode code point `c` as UTF-16.
 *
 * buf      destination (2 or 4 bytes are written), or NULL to only compute
 *          the encoded size.
 * bs_flags byte order selector: little-endian when LittleEndianIntegerMask is
 *          set, big-endian otherwise. Only consulted when buf is not NULL.
 * out_size receives the number of bytes of the encoding; may be NULL.
 *
 * Returns true on success, false when `c` is not a Unicode scalar value:
 * negative, above 0x10FFFF, or a surrogate (0xD800-0xDFFF). Nothing is
 * written to buf or out_size on failure.
 */
bool bitstring_utf16_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags, size_t *out_size)
{
    // A lone surrogate emitted as a single unit would be ill-formed UTF-16
    if (c < 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
        return false;
    }

    size_t sz;
    if (c < 0x10000) {
        // Basic Multilingual Plane: one 16-bit unit holding the code point itself
        if (buf) {
            uint8_t hi = (uint8_t) (c >> 8);
            uint8_t lo = (uint8_t) (c & 0xFF);
            if (bs_flags & LittleEndianIntegerMask) {
                buf[0] = lo;
                buf[1] = hi;
            } else {
                buf[0] = hi;
                buf[1] = lo;
            }
        }
        sz = 2;
    } else {
        // Supplementary planes: the 20-bit offset from 0x10000 is split into a
        // high surrogate (0xD800 | top 10 bits) and a low surrogate
        // (0xDC00 | bottom 10 bits)
        if (buf) {
            avm_int_t v = c - 0x10000;
            uint16_t high = (uint16_t) (0xD800 | (v >> 10));
            uint16_t low = (uint16_t) (0xDC00 | (v & 0x3FF));
            if (bs_flags & LittleEndianIntegerMask) {
                buf[0] = (uint8_t) (high & 0xFF);
                buf[1] = (uint8_t) (high >> 8);
                buf[2] = (uint8_t) (low & 0xFF);
                buf[3] = (uint8_t) (low >> 8);
            } else {
                buf[0] = (uint8_t) (high >> 8);
                buf[1] = (uint8_t) (high & 0xFF);
                buf[2] = (uint8_t) (low >> 8);
                buf[3] = (uint8_t) (low & 0xFF);
            }
        }
        sz = 4;
    }

    // out_size is documented as optional, so do not dereference a NULL pointer
    if (out_size) {
        *out_size = sz;
    }
    return true;
}
155+
156+
/*
 * Encode code point `c` as UTF-32 (always 4 bytes).
 *
 * buf      destination; it is written unconditionally on success, so it must
 *          not be NULL.
 * bs_flags byte order selector: little-endian when LittleEndianIntegerMask is
 *          set, big-endian otherwise.
 *
 * Returns true on success, false when `c` is not a Unicode scalar value:
 * negative, above 0x10FFFF, or a surrogate (0xD800-0xDFFF). Nothing is
 * written on failure.
 */
bool bitstring_utf32_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags)
{
    // Surrogates are not valid in UTF-32 either
    if (c < 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
        return false;
    }

    uint8_t b3 = (uint8_t) ((c >> 24) & 0xFF);
    uint8_t b2 = (uint8_t) ((c >> 16) & 0xFF);
    uint8_t b1 = (uint8_t) ((c >> 8) & 0xFF);
    uint8_t b0 = (uint8_t) (c & 0xFF);

    if (bs_flags & LittleEndianIntegerMask) {
        buf[0] = b0;
        buf[1] = b1;
        buf[2] = b2;
        buf[3] = b3;
    } else {
        buf[0] = b3;
        buf[1] = b2;
        buf[2] = b1;
        buf[3] = b0;
    }
    return true;
}

src/libAtomVM/bitstring.h

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,119 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int
301301
return bitstring_insert_any_integer((uint8_t *) term_binary_data(dst_bin), offset, value, n, bs_flags);
302302
}
303303

304+
/**
305+
* @brief Encode a character to UTF-8.
306+
*
307+
* @param c character to encode
308+
* @param buf the buffer to encode the character to or NULL to only compute the
309+
* size.
310+
* @param out_size the size in bytes, on output (if not NULL)
311+
* @return \c true if encoding was successful, \c false if c is not a valid
312+
* unicode character
313+
*/
314+
bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size);
315+
316+
/**
317+
* @brief Encode a character to UTF-16.
318+
*
319+
* @param c character to encode
320+
* @param buf the buffer to encode the character to or NULL to only compute the
321+
* size.
322+
* @param bs_flags flags to encode the character (undefined/little/big/native)
323+
* @param out_size the size in bytes, on output (if not NULL)
324+
* @return \c true if encoding was successful, \c false if c is not a valid
325+
* unicode character
326+
*/
327+
bool bitstring_utf16_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags, size_t *out_size);
328+
329+
/**
330+
* @brief Encode a character to UTF-32.
331+
*
332+
* @param c character to encode
333+
* @param buf the buffer to encode the character
334+
* @param bs_flags flags to encode the character (undefined/little/big/native)
335+
* @return \c true if encoding was successful, \c false if c is not a valid
336+
* unicode character
337+
*/
338+
bool bitstring_utf32_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags);
339+
340+
/**
341+
* @brief Compute the size of a character when UTF-8 encoded.
342+
*
343+
* @param c character to encode
344+
* @param out_size the size in bytes, on output
345+
* @return \c true if encoding was successful, \c false if c is not a valid
346+
* unicode character
347+
*/
348+
static inline bool bitstring_utf8_size(avm_int_t c, size_t *out_size)
{
    // A NULL destination makes the encoder compute the size without writing
    uint8_t *const no_output = NULL;
    const bool valid = bitstring_utf8_encode(c, no_output, out_size);
    return valid;
}
352+
353+
/**
354+
* @brief Compute the size of a unicode character when UTF-16 encoded.
355+
*
356+
* @param c character to encode
357+
* @param out_size the size in bytes, on output
358+
* @return \c true if encoding was successful, \c false if c is not a valid
359+
* unicode character
360+
*/
361+
static inline bool bitstring_utf16_size(avm_int_t c, size_t *out_size)
{
    // A NULL destination makes the encoder compute the size without writing;
    // the flags value 0 is passed through unchanged
    uint8_t *const no_output = NULL;
    const bool valid = bitstring_utf16_encode(c, no_output, 0, out_size);
    return valid;
}
364+
365+
/**
366+
* @brief Insert a character in UTF-8 format
367+
*
368+
* @param dst_bin binary to insert to
369+
* @param offset offset, in bits, to where to insert the character
370+
* @param c character to encode
371+
* @param out_size the size in bytes, on output
372+
* @return \c true if encoding was successful, \c false if c is not a valid
373+
* unicode character
374+
*/
375+
static inline bool bitstring_insert_utf8(term dst_bin, size_t offset, avm_int_t c, size_t *out_size)
{
    // No bounds check is done here: the destination is expected to have been
    // sized beforehand (per the original note, by a bs_utf8_size instruction).
    // offset is in bits; shifting right by 3 converts it to a byte index and
    // discards any sub-byte remainder.
    const size_t byte_index = offset >> 3;
    uint8_t *base = (uint8_t *) term_binary_data(dst_bin);
    return bitstring_utf8_encode(c, base + byte_index, out_size);
}
381+
382+
/**
383+
* @brief Insert a character in UTF-16 format
384+
*
385+
* @param dst_bin binary to insert to
386+
* @param offset offset, in bits, to where to insert the character
387+
* @param c character to encode
388+
* @param bs_flags flags to encode the character (undefined/little/big/native)
389+
* @param out_size the size in bytes, on output
390+
* @return \c true if encoding was successful, \c false if c is not a valid
391+
* unicode character
392+
*/
393+
static inline bool bitstring_insert_utf16(term dst_bin, size_t offset, avm_int_t c, enum BitstringFlags bs_flags, size_t *out_size)
{
    // No bounds check is done here: the destination is expected to have been
    // sized beforehand (presumably by a bs_utf16_size instruction; confirm
    // against the opcode handler).
    // offset is in bits; shifting right by 3 converts it to a byte index and
    // discards any sub-byte remainder.
    const size_t byte_index = offset >> 3;
    uint8_t *base = (uint8_t *) term_binary_data(dst_bin);
    return bitstring_utf16_encode(c, base + byte_index, bs_flags, out_size);
}
399+
400+
/**
401+
* @brief Insert a character in UTF-32 format
402+
*
403+
* @param dst_bin binary to insert to
404+
* @param offset offset, in bits, to where to insert the character
405+
* @param c character to encode
406+
* @param bs_flags flags to encode the character (undefined/little/big/native)
407+
* @note there is no out_size parameter: a successful UTF-32 encoding is always 4 bytes
408+
* @return \c true if encoding was successful, \c false if c is not a valid
409+
* unicode character
410+
*/
411+
static inline bool bitstring_insert_utf32(term dst_bin, size_t offset, avm_int_t c, enum BitstringFlags bs_flags)
{
    // offset is in bits; shifting right by 3 converts it to a byte index and
    // discards any sub-byte remainder. No bounds check is done here.
    const size_t byte_index = offset >> 3;
    uint8_t *base = (uint8_t *) term_binary_data(dst_bin);
    return bitstring_utf32_encode(c, base + byte_index, bs_flags);
}
416+
304417
#ifdef __cplusplus
305418
}
306419
#endif

src/libAtomVM/opcodes.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,11 @@
115115
#define OP_BS_APPEND 134
116116
#define OP_TRIM 136
117117
#define OP_BS_INIT_BITS 137
118+
/* Bitstring UTF opcodes: size computation and put for UTF-8 and UTF-16, put only for UTF-32. */
#define OP_BS_UTF8_SIZE 144
119+
#define OP_BS_PUT_UTF8 145
120+
#define OP_BS_UTF16_SIZE 146
121+
#define OP_BS_PUT_UTF16 147
122+
#define OP_BS_PUT_UTF32 148
118123
#define OP_RECV_MARK 150
119124
#define OP_RECV_SET 151
120125
#define OP_GC_BIF3 152

0 commit comments

Comments
 (0)