utils: Add utf8_is_valid function

rruuaanng · rruuaanng · commit a093b5672030 · 2024-11-16T20:49:21.000+08:00
Add the 'utf8_is_valid' function to check whether a given string is valid UTF-8.

Signed-off-by: James Roy <rruuaanng@outlook.com>
diff --git a/include/zephyr/sys/util.h b/include/zephyr/sys/util.h
@@ -685,6 +685,16 @@ char *utf8_trunc(char *utf8_str);
  */
 char *utf8_lcpy(char *dst, const char *src, size_t n);
 
+/**
+ * @brief Checks if the given string @p str is UTF-8 encoded.
+ *
+ * @param str Target string (a NULL pointer is reported as not valid)
+ * @param maxlen Maximum number of bytes of @p str to check (stops early at NUL)
+ *
+ * @return true if @p str is UTF-8 encoded, or false otherwise.
+ */
+bool utf8_is_valid(const unsigned char *str, size_t maxlen);
+
 #define __z_log2d(x) (32 - __builtin_clz(x) - 1)
 #define __z_log2q(x) (64 - __builtin_clzll(x) - 1)
 #define __z_log2(x) (sizeof(__typeof__(x)) > 4 ? __z_log2q(x) : __z_log2d(x))
diff --git a/lib/utils/utf8.c b/lib/utils/utf8.c
@@ -7,12 +7,17 @@
 #include <stdint.h>
 #include <string.h>
 #include <zephyr/sys/__assert.h>
+#include <zephyr/sys/util.h>
 
 #define ASCII_CHAR 0x7F
 #define SEQUENCE_FIRST_MASK 0xC0
 #define SEQUENCE_LEN_2_BYTE 0xC0
-#define SEQUENCE_LEN_3_BYTE 0xE0
-#define SEQUENCE_LEN_4_BYTE 0xF0
+#define SEQUENCE_MIN_LEN_2_BYTE (SEQUENCE_LEN_2_BYTE + 2) /* 0xC2: 0xC0/0xC1 are always overlong */
+#define SEQUENCE_MAX_LEN_2_BYTE 0xDF /* last lead byte of a 2-byte sequence */
+#define SEQUENCE_MIN_LEN_3_BYTE 0xE0 /* also the lead-byte mask tested in utf8_trunc() */
+#define SEQUENCE_MAX_LEN_3_BYTE 0xEF /* last lead byte of a 3-byte sequence */
+#define SEQUENCE_MIN_LEN_4_BYTE 0xF0 /* also the lead-byte mask tested in utf8_trunc() */
+#define SEQUENCE_MAX_LEN_4_BYTE 0xF4 /* higher lead bytes would exceed U+10FFFF */
 
 char *utf8_trunc(char *utf8_str)
 {
@@ -46,11 +51,11 @@ char *utf8_trunc(char *utf8_str)
 	 * matches the number of bytes we searched for the starting byte
 	 */
 	seq_start_byte = *last_byte_p;
-	if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) {
+	if ((seq_start_byte & SEQUENCE_MIN_LEN_4_BYTE) == SEQUENCE_MIN_LEN_4_BYTE) {
 		if (bytes_truncated == 4) {
 			return utf8_str;
 		}
-	} else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) {
+	} else if ((seq_start_byte & SEQUENCE_MIN_LEN_3_BYTE) == SEQUENCE_MIN_LEN_3_BYTE) {
 		if (bytes_truncated == 3) {
 			return utf8_str;
 		}
@@ -79,3 +84,38 @@ char *utf8_lcpy(char *dst, const char *src, size_t n)
 
 	return dst;
 }
+
+/*
+ * Check that @p str holds well-formed UTF-8.
+ *
+ * Scans at most @p len bytes and stops early at the first NUL byte. Every
+ * multi-byte sequence must fit entirely within @p len and consist of a valid
+ * lead byte followed by the required continuation bytes (RFC 3629). This
+ * rejects stray continuation bytes, truncated sequences, overlong forms,
+ * UTF-16 surrogates and code points above U+10FFFF.
+ *
+ * Returns true if the scanned bytes are valid UTF-8, false otherwise
+ * (including when @p str is NULL).
+ */
+bool utf8_is_valid(const unsigned char *str, size_t len)
+{
+	size_t i = 0;
+
+	/* A NULL pointer is never a valid string */
+	if (str == NULL) {
+		return false;
+	}
+
+	while (i < len && str[i] != '\0') {
+		const unsigned char lead = str[i];
+		/* Allowed range of the byte that directly follows the lead byte */
+		unsigned char second_min = 0x80;
+		unsigned char second_max = 0xBF;
+		size_t nbyte;
+
+		if (lead <= ASCII_CHAR) {
+			i++;
+			continue;
+		}
+
+		if (lead >= SEQUENCE_MIN_LEN_2_BYTE && lead <= SEQUENCE_MAX_LEN_2_BYTE) {
+			nbyte = 2;
+		} else if (lead >= SEQUENCE_MIN_LEN_3_BYTE && lead <= SEQUENCE_MAX_LEN_3_BYTE) {
+			nbyte = 3;
+			if (lead == 0xE0) {
+				/* E0 80..9F would be an overlong 3-byte form */
+				second_min = 0xA0;
+			} else if (lead == 0xED) {
+				/* ED A0..BF would encode a UTF-16 surrogate */
+				second_max = 0x9F;
+			}
+		} else if (lead >= SEQUENCE_MIN_LEN_4_BYTE && lead <= SEQUENCE_MAX_LEN_4_BYTE) {
+			nbyte = 4;
+			if (lead == 0xF0) {
+				/* F0 80..8F would be an overlong 4-byte form */
+				second_min = 0x90;
+			} else if (lead == 0xF4) {
+				/* F4 90..BF would exceed U+10FFFF */
+				second_max = 0x8F;
+			}
+		} else {
+			/* Stray continuation byte or impossible lead byte */
+			return false;
+		}
+
+		/* i < len holds here, so the subtraction cannot wrap */
+		if (nbyte > len - i) {
+			return false;
+		}
+
+		/*
+		 * Bytes are checked in order and the first mismatch returns, so a
+		 * NUL terminator inside a truncated sequence is never stepped over.
+		 */
+		if (str[i + 1] < second_min || str[i + 1] > second_max) {
+			return false;
+		}
+		for (size_t k = 2; k < nbyte; k++) {
+			/* Remaining bytes must look like 10xxxxxx */
+			if ((str[i + k] & SEQUENCE_FIRST_MASK) != 0x80) {
+				return false;
+			}
+		}
+
+		i += nbyte;
+	}
+	return true;
+}
diff --git a/tests/unit/util/main.c b/tests/unit/util/main.c
@@ -867,6 +867,27 @@ ZTEST(util, test_utf8_lcpy_truncated)
 	zassert_str_equal(dest_str, expected_result, "Failed to copy");
 }
 
+ZTEST(util, test_utf8_is_valid)
+{
+	/* Well-formed input must be accepted */
+	zassert_true(utf8_is_valid("κόσμε", 11), "Multi-byte Greek text");
+	zassert_true(utf8_is_valid("\x00", 1), "1 byte (U-00000000)");
+	zassert_true(utf8_is_valid("\xc2\x80", 2), "2 bytes (U-00000080)");
+	zassert_true(utf8_is_valid("\xef\xbf\xbf", 3), "(U-0000FFFF)");
+	zassert_true(utf8_is_valid("\xed\x9f\xbf", 3), "U-0000D7FF");
+	zassert_true(utf8_is_valid("\xef\xbf\xbd", 3), "Replacement Character U+FFFD");
+	zassert_true(utf8_is_valid("\xef\xbf\xbe", 3), "Noncharacter U+FFFE");
+	zassert_true(utf8_is_valid("\xef\xbb\xbf", 3), "Byte Order Mark (BOM) U+FEFF");
+
+	/* Malformed input must be rejected */
+	zassert_false(utf8_is_valid("\x80", 1), "First continuation byte 0x80");
+	zassert_false(utf8_is_valid("\xc0", 1), "2-bytes U+0000, last byte missing");
+	zassert_false(utf8_is_valid("\xfe", 1), "impossible byte");
+	zassert_false(utf8_is_valid("\xfe\xfe\xff\xff", 4), "several impossible bytes");
+	zassert_false(utf8_is_valid("\xc0\x7f", 2), "no continuation byte");
+	zassert_false(utf8_is_valid("\xc0\xaf", 2), "Overlong U+002F");
+	zassert_false(utf8_is_valid("\xc1\xbf", 2), "Overlong U-0000007F");
+	zassert_false(utf8_is_valid("\xc0\x80", 2), "2 bytes overlong U+0000");
+	zassert_false(utf8_is_valid(NULL, 1), "NULL str argument");
+}
+
 ZTEST(util, test_utf8_lcpy_not_truncated)
 {
 	/* dest_str size is based on storing 3 * € plus the null terminator  */