@@ -1096,6 +1096,65 @@ static int flb_utils_write_str_escaped(char *buf, int *off, size_t size, const c
10961096 return FLB_TRUE ;
10971097}
10981098
/*
 * Validate the single UTF-8 encoded character that starts at str[0].
 *
 * str      : pointer to the first (lead) byte of the candidate sequence.
 * max_len  : number of bytes that may be read starting at str.
 *
 * Returns the length of the sequence in bytes (1 to 4) when it is well
 * formed, or 0 when it is not. A sequence is rejected when:
 *
 *  - no bytes are available (str is null or max_len < 1),
 *  - the lead byte is a stray continuation byte or is above 0xF4,
 *  - it is an overlong encoding (0xC0/0xC1 lead, 0xE0 followed by < 0xA0,
 *    0xF0 followed by < 0x90),
 *  - it encodes a UTF-16 surrogate (0xED followed by >= 0xA0),
 *  - it encodes a value above U+10FFFF (0xF4 followed by > 0x8F),
 *  - fewer than the required number of bytes are available,
 *  - any trailing byte is not of the form 10xxxxxx.
 */
static inline int flb_utf8_validate_char(const unsigned char *str, int max_len)
{
    unsigned char lead;
    /* Allowed range for the second byte; narrowed below for special leads */
    unsigned char second_min = 0x80;
    unsigned char second_max = 0xBF;
    int len;
    int i;

    /* Bounds must be checked before the first byte is read */
    if (!str || max_len < 1) {
        return 0;
    }

    lead = str[0];

    /* 1-byte sequence (ASCII) */
    if (lead <= 0x7F) {
        return 1;
    }
    /* 2-byte sequence */
    else if ((lead & 0xE0) == 0xC0) {
        if (lead < 0xC2) {
            return 0; /* Overlong encoding */
        }
        len = 2;
    }
    /* 3-byte sequence */
    else if ((lead & 0xF0) == 0xE0) {
        if (lead == 0xE0) {
            second_min = 0xA0; /* Anything lower is overlong */
        }
        else if (lead == 0xED) {
            second_max = 0x9F; /* Anything higher is a surrogate */
        }
        len = 3;
    }
    /* 4-byte sequence */
    else if ((lead & 0xF8) == 0xF0) {
        if (lead > 0xF4) {
            return 0; /* Outside of Unicode range */
        }
        if (lead == 0xF0) {
            second_min = 0x90; /* Anything lower is overlong */
        }
        else if (lead == 0xF4) {
            second_max = 0x8F; /* Anything higher is above U+10FFFF */
        }
        len = 4;
    }
    else {
        return 0; /* Invalid starting byte */
    }

    if (max_len < len) {
        return 0; /* Truncated sequence */
    }

    /*
     * The second byte is checked against its (possibly narrowed) range;
     * the default range 0x80..0xBF is the plain continuation-byte check.
     */
    if (str[1] < second_min || str[1] > second_max) {
        return 0;
    }

    /* Remaining bytes only need to be continuation bytes */
    for (i = 2; i < len; i++) {
        if ((str[i] & 0xC0) != 0x80) {
            return 0; /* Invalid continuation byte */
        }
    }

    return len;
}
1157+
10991158/* Safely copies raw UTF-8 strings, only escaping essential characters.
11001159 * This version correctly implements the repeating SIMD fast path for performance.
11011160 */
@@ -1180,7 +1239,7 @@ static int flb_utils_write_str_raw(char *buf, int *off, size_t size,
11801239 available -- ;
11811240 }
11821241 else { /* Multibyte UTF-8 sequence */
1183- utf_len = flb_utf8_len ( & str [i ]);
1242+ utf_len = flb_utf8_validate_char (( const unsigned char * ) & str [i ], str_len - i );
11841243
11851244 if (utf_len == 0 || i + utf_len > str_len ) { /* Invalid/truncated */
11861245 if (available < 3 ) {
0 commit comments