Skip to content

Commit a093b56

Browse files
committed
utils: Add utf8_is_valid function
Add 'utf8_is_valid' function to check if a given string is utf8 encoded. Signed-off-by: James Roy <[email protected]>
1 parent 2f23313 commit a093b56

File tree

3 files changed

+75
-4
lines changed

3 files changed

+75
-4
lines changed

include/zephyr/sys/util.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,16 @@ char *utf8_trunc(char *utf8_str);
685685
*/
686686
char *utf8_lcpy(char *dst, const char *src, size_t n);
687687

688+
/**
689+
* @brief Checks if the given string @p str is UTF-8 encoded.
690+
*
691+
* @param str Target string
692+
* @param maxlen The maximum number of bytes of @p str to check; checking also stops at the first NUL byte
693+
*
694+
* @return true if @p str is UTF-8 encoded, or false otherwise (including when @p str is NULL).
695+
*/
696+
bool utf8_is_valid(const unsigned char *str, size_t maxlen);
697+
688698
#define __z_log2d(x) (32 - __builtin_clz(x) - 1)
689699
#define __z_log2q(x) (64 - __builtin_clzll(x) - 1)
690700
#define __z_log2(x) (sizeof(__typeof__(x)) > 4 ? __z_log2q(x) : __z_log2d(x))

lib/utils/utf8.c

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,17 @@
77
#include <stdint.h>
88
#include <string.h>
99
#include <zephyr/sys/__assert.h>
10+
#include <zephyr/sys/util.h>
1011

1112
#define ASCII_CHAR 0x7F
1213
#define SEQUENCE_FIRST_MASK 0xC0
1314
#define SEQUENCE_LEN_2_BYTE 0xC0
14-
#define SEQUENCE_LEN_3_BYTE 0xE0
15-
#define SEQUENCE_LEN_4_BYTE 0xF0
15+
#define SEQUENCE_MIN_LEN_2_BYTE (SEQUENCE_LEN_2_BYTE + 2)
16+
#define SEQUENCE_MAX_LEN_2_BYTE 0xDF
17+
#define SEQUENCE_MIN_LEN_3_BYTE 0xE0
18+
#define SEQUENCE_MAX_LEN_3_BYTE 0xEF
19+
#define SEQUENCE_MIN_LEN_4_BYTE 0xF0
20+
#define SEQUENCE_MAX_LEN_4_BYTE 0xF4
1621

1722
char *utf8_trunc(char *utf8_str)
1823
{
@@ -46,11 +51,11 @@ char *utf8_trunc(char *utf8_str)
4651
* matches the number of bytes we searched for the starting byte
4752
*/
4853
seq_start_byte = *last_byte_p;
49-
if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) {
54+
if ((seq_start_byte & SEQUENCE_MIN_LEN_4_BYTE) == SEQUENCE_MIN_LEN_4_BYTE) {
5055
if (bytes_truncated == 4) {
5156
return utf8_str;
5257
}
53-
} else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) {
58+
} else if ((seq_start_byte & SEQUENCE_MIN_LEN_3_BYTE) == SEQUENCE_MIN_LEN_3_BYTE) {
5459
if (bytes_truncated == 3) {
5560
return utf8_str;
5661
}
@@ -79,3 +84,38 @@ char *utf8_lcpy(char *dst, const char *src, size_t n)
7984

8085
return dst;
8186
}
87+
88+
/**
 * Check that a byte string is well-formed UTF-8.
 *
 * Scanning stops at the first NUL byte or after @p len bytes, whichever
 * comes first. Every multi-byte sequence must lie completely inside that
 * window and must be well-formed: overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF) and code points above U+10FFFF are rejected.
 *
 * @param str Bytes to check; may be NULL.
 * @param len Maximum number of bytes of @p str to examine.
 *
 * @return true if every examined sequence is well-formed, false otherwise
 *         (including when @p str is NULL).
 */
bool utf8_is_valid(const unsigned char *str, size_t len)
{
	size_t i = 0;

	/* A NULL pointer is never a valid string */
	if (str == NULL) {
		return false;
	}

	while (i < len && str[i] != '\0') {
		const unsigned char lead = str[i];
		/* Allowed range for the byte following the lead byte */
		unsigned char second_min = 0x80;
		unsigned char second_max = 0xBF;
		size_t nbyte;

		if (lead <= 0x7F) {
			/* Plain ASCII */
			i++;
			continue;
		}

		if (lead >= 0xC2 && lead <= 0xDF) {
			/* 0xC0 and 0xC1 could only start overlong encodings */
			nbyte = 2;
		} else if (lead >= 0xE0 && lead <= 0xEF) {
			nbyte = 3;
			if (lead == 0xE0) {
				/* Below 0xA0 would be an overlong encoding */
				second_min = 0xA0;
			} else if (lead == 0xED) {
				/* Above 0x9F would encode a UTF-16 surrogate */
				second_max = 0x9F;
			}
		} else if (lead >= 0xF0 && lead <= 0xF4) {
			nbyte = 4;
			if (lead == 0xF0) {
				/* Below 0x90 would be an overlong encoding */
				second_min = 0x90;
			} else if (lead == 0xF4) {
				/* Above 0x8F would exceed U+10FFFF */
				second_max = 0x8F;
			}
		} else {
			/* Stray continuation byte or impossible lead byte */
			return false;
		}

		/* i < len holds here, so the subtraction cannot wrap */
		if (nbyte > len - i) {
			return false;
		}

		/*
		 * Bytes are checked in order and the first bad one aborts, so a
		 * NUL terminator inside a sequence is rejected before anything
		 * beyond it is read.
		 */
		if (str[i + 1] < second_min || str[i + 1] > second_max) {
			return false;
		}
		for (size_t k = 2; k < nbyte; k++) {
			if ((str[i + k] & 0xC0) != 0x80) {
				return false;
			}
		}

		i += nbyte;
	}

	return true;
}

tests/unit/util/main.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,27 @@ ZTEST(util, test_utf8_lcpy_truncated)
867867
zassert_str_equal(dest_str, expected_result, "Failed to copy");
868868
}
869869

870+
ZTEST(util, test_utf8_is_valid)
871+
{
872+
/* Test whether the verification function meets the requirements */
873+
zassert_true(utf8_is_valid("κόσμε", 11));
874+
zassert_true(utf8_is_valid("\x00", 1), "1 byte (U-00000000)");
875+
zassert_true(utf8_is_valid("\xc2\x80", 2)), "2 bytes (U-00000080)";
876+
zassert_true(utf8_is_valid("\xef\xbf\xbf", 3), "(U-0000FFFF)");
877+
zassert_true(utf8_is_valid("\xed\x9f\xbf", 3), "U-0000D7FF");
878+
zassert_true(utf8_is_valid("\xef\xbf\xbf", 3), "Replacement Character U+FFFF");
879+
zassert_true(utf8_is_valid("\xef\xbf\xbe", 3), "Byte Order Mark (BOM) U+FFFE");
880+
zassert_false(utf8_is_valid("\x80", 1), "First continuation byte 0x80");
881+
zassert_false(utf8_is_valid("\xc0", 1), "2-bytes U+0000, last byte missing");
882+
zassert_false(utf8_is_valid("\xfe", 1), "impossible byte");
883+
zassert_false(utf8_is_valid("\xfe\xfe\xff\xff", 4), "several impossible bytes");
884+
zassert_false(utf8_is_valid("\xc0\x7f", 2), "no continuation byte");
885+
zassert_false(utf8_is_valid("\xc0\xaf", 2), "Overlong U+002F");
886+
zassert_false(utf8_is_valid("\xc1\xbf", 2), "Overlong U-0000007F");
887+
zassert_false(utf8_is_valid("\xc0\x80", 2), "2 bytes overlong U+0000");
888+
zassert_false(utf8_is_valid(NULL, 1), "NULL str argument");
889+
}
890+
870891
ZTEST(util, test_utf8_lcpy_not_truncated)
871892
{
872893
/* dest_str size is based on storing 3 * € plus the null terminator */

0 commit comments

Comments
 (0)