@@ -4657,7 +4657,8 @@ static_inline bool read_num(u8 **ptr, u8 **pre, yyjson_read_flag flg,
46574657 *============================================================================*/
46584658
46594659/** Read unicode escape sequence. */
4660- static_inline bool read_uni_esc (u8 * * src_ptr , u8 * * dst_ptr , const char * * msg ) {
4660+ static_inline bool read_uni_esc (u8 * * src_ptr , u8 * * dst_ptr ,
4661+ const char * * msg , yyjson_read_flag flg ) {
46614662#define return_err (_end , _msg ) *msg = _msg; *src_ptr = _end; return false
46624663
46634664 u8 * src = * src_ptr ;
@@ -4667,6 +4668,15 @@ static_inline bool read_uni_esc(u8 **src_ptr, u8 **dst_ptr, const char **msg) {
46674668
46684669 src += 2 ; /* skip `\u` */
/* hex_load_4 reported failure for the four characters that follow the
 * two-byte escape prefix skipped just above.
 *
 * With YYJSON_READ_REPLACE_INVALID_UNICODE set: skip however many of the
 * next (at most four) characters are hex digits, store EF BF BD (the UTF-8
 * encoding of U+FFFD), publish the advanced src/dst through the out
 * pointers and return true.
 * Otherwise: return_err stores the message, sets *src_ptr to src - 2 (the
 * start of the escape) and returns false.
 *
 * NOTE(review): the replace path writes 3 bytes but may consume only 2
 * (the prefix alone, when cnt stays 0). If a caller decodes in place with
 * dst == src on entry (read_str_id appears to do so before copy_escape),
 * the third store lands on input that has not been read yet - confirm. */
46694670 if (unlikely (!hex_load_4 (src , & hi ))) {
4671+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
4672+ usize cnt = 0 ;
4673+ while (cnt < 4 && char_is_hex (src [cnt ])) cnt ++ ;
4674+ src += cnt ;
4675+ * dst ++ = 0xEF ; * dst ++ = 0xBF ; * dst ++ = 0xBD ;
4676+ * src_ptr = src ;
4677+ * dst_ptr = dst ;
4678+ return true;
4679+ }
46704680 return_err (src - 2 , "invalid escaped sequence in string" );
46714681 }
46724682 src += 4 ; /* skip hex */
@@ -4682,18 +4692,83 @@ static_inline bool read_uni_esc(u8 **src_ptr, u8 **dst_ptr, const char **msg) {
46824692 } else {
46834693 * dst ++ = (u8 )hi ;
46844694 }
4685- } else {
4695+ } else if (( hi & 0xFC00 ) == 0xD800 ) {
46864696 /* a non-BMP character, represented as a surrogate pair */
4687- if (unlikely ((hi & 0xFC00 ) != 0xD800 )) {
4688- return_err (src - 6 , "invalid high surrogate in string" );
4689- }
46904697 if (unlikely (!byte_match_2 (src , "\\u" ))) {
4698+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
4699+ * dst ++ = 0xEF ; * dst ++ = 0xBF ; * dst ++ = 0xBD ;
4700+ * src_ptr = src ;
4701+ * dst_ptr = dst ;
4702+ return true;
4703+ }
4704+ if (has_allow (INVALID_SURROGATE )) {
4705+ if (hi >= 0x800 ) {
4706+ * dst ++ = (u8 )(0xE0 | (hi >> 12 ));
4707+ * dst ++ = (u8 )(0x80 | ((hi >> 6 ) & 0x3F ));
4708+ * dst ++ = (u8 )(0x80 | (hi & 0x3F ));
4709+ } else if (hi >= 0x80 ) {
4710+ * dst ++ = (u8 )(0xC0 | (hi >> 6 ));
4711+ * dst ++ = (u8 )(0x80 | (hi & 0x3F ));
4712+ } else {
4713+ * dst ++ = (u8 )hi ;
4714+ }
4715+ * src_ptr = src ;
4716+ * dst_ptr = dst ;
4717+ return true;
4718+ }
46914719 return_err (src - 6 , "no low surrogate in string" );
46924720 }
46934721 if (unlikely (!hex_load_4 (src + 2 , & lo ))) {
4722+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
4723+ usize cnt = 0 ;
4724+ src += 2 ; /* skip \u */
4725+ while (cnt < 4 && char_is_hex (src [cnt ])) cnt ++ ;
4726+ src += cnt ;
4727+ * dst ++ = 0xEF ; * dst ++ = 0xBF ; * dst ++ = 0xBD ;
4728+ * src_ptr = src ;
4729+ * dst_ptr = dst ;
4730+ return true;
4731+ }
4732+ if (has_allow (INVALID_SURROGATE )) {
4733+ if (hi >= 0x800 ) {
4734+ * dst ++ = (u8 )(0xE0 | (hi >> 12 ));
4735+ * dst ++ = (u8 )(0x80 | ((hi >> 6 ) & 0x3F ));
4736+ * dst ++ = (u8 )(0x80 | (hi & 0x3F ));
4737+ } else if (hi >= 0x80 ) {
4738+ * dst ++ = (u8 )(0xC0 | (hi >> 6 ));
4739+ * dst ++ = (u8 )(0x80 | (hi & 0x3F ));
4740+ } else {
4741+ * dst ++ = (u8 )hi ;
4742+ }
4743+ * src_ptr = src ;
4744+ * dst_ptr = dst ;
4745+ return true;
4746+ }
46944747 return_err (src - 6 , "invalid escape in string" );
46954748 }
/* The second escape was loaded into lo, but (lo & 0xFC00) != 0xDC00, so it
 * is not a low surrogate. At this point src still addresses the second
 * escape (the earlier checks in this branch read it at src and src + 2).
 *
 * 1. Replace mode: advance src by 6, i.e. past the whole second escape,
 *    and store one EF BF BD (U+FFFD) for both escapes.
 *    NOTE(review): the second escape is discarded here even if it encodes
 *    an ordinary non-surrogate code unit - confirm that dropping it is
 *    intended rather than leaving it to be decoded on its own.
 * 2. INVALID_SURROGATE allowed: encode hi by itself and return with src
 *    unchanged, so the second escape is still pending for the caller.
 *    NOTE(review): this branch is entered only when
 *    (hi & 0xFC00) == 0xD800, so hi >= 0x800 always holds and the two
 *    shorter encodings below look unreachable.
 * 3. Otherwise: fail, pointing *src_ptr at src - 6 (the first escape). */
46964749 if (unlikely ((lo & 0xFC00 ) != 0xDC00 )) {
4750+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
4751+ src += 6 ;
4752+ * dst ++ = 0xEF ; * dst ++ = 0xBF ; * dst ++ = 0xBD ;
4753+ * src_ptr = src ;
4754+ * dst_ptr = dst ;
4755+ return true;
4756+ }
4757+ if (has_allow (INVALID_SURROGATE )) {
4758+ if (hi >= 0x800 ) {
4759+ * dst ++ = (u8 )(0xE0 | (hi >> 12 ));
4760+ * dst ++ = (u8 )(0x80 | ((hi >> 6 ) & 0x3F ));
4761+ * dst ++ = (u8 )(0x80 | (hi & 0x3F ));
4762+ } else if (hi >= 0x80 ) {
4763+ * dst ++ = (u8 )(0xC0 | (hi >> 6 ));
4764+ * dst ++ = (u8 )(0x80 | (hi & 0x3F ));
4765+ } else {
4766+ * dst ++ = (u8 )hi ;
4767+ }
4768+ * src_ptr = src ;
4769+ * dst_ptr = dst ;
4770+ return true;
4771+ }
46974772 return_err (src - 6 , "invalid low surrogate in string" );
46984773 }
46994774 uni = ((((u32 )hi - 0xD800 ) << 10 ) |
@@ -4703,6 +4778,26 @@ static_inline bool read_uni_esc(u8 **src_ptr, u8 **dst_ptr, const char **msg) {
47034778 * dst ++ = (u8 )(0x80 | ((uni >> 6 ) & 0x3F ));
47044779 * dst ++ = (u8 )(0x80 | (uni & 0x3F ));
47054780 src += 6 ;
4781+ } else { /* low surrogate without preceding high surrogate */
4782+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
4783+ * dst ++ = 0xEF ; * dst ++ = 0xBF ; * dst ++ = 0xBD ;
4784+ * src_ptr = src ;
4785+ * dst_ptr = dst ;
4786+ return true;
4787+ }
4788+ if (!has_allow (INVALID_SURROGATE )) {
4789+ return_err (src - 6 , "invalid low surrogate in string" );
4790+ }
4791+ if (hi >= 0x800 ) {
4792+ * dst ++ = (u8 )(0xE0 | (hi >> 12 ));
4793+ * dst ++ = (u8 )(0x80 | ((hi >> 6 ) & 0x3F ));
4794+ * dst ++ = (u8 )(0x80 | (hi & 0x3F ));
4795+ } else if (hi >= 0x80 ) {
4796+ * dst ++ = (u8 )(0xC0 | (hi >> 6 ));
4797+ * dst ++ = (u8 )(0x80 | (hi & 0x3F ));
4798+ } else {
4799+ * dst ++ = (u8 )hi ;
4800+ }
47064801 }
47074802 * src_ptr = src ;
47084803 * dst_ptr = dst ;
@@ -4855,6 +4950,12 @@ static_inline bool read_str_opt(u8 quo, u8 **ptr, u8 *eof, yyjson_read_flag flg,
48554950 }
48564951#endif
/* src is still equal to pos, i.e. the code above this excerpt did not
 * advance src (presumably no valid UTF-8 sequence was consumed - the
 * surrounding loop is not visible here, confirm).
 *
 * Replace mode: reset dst to src, store EF BF BD (U+FFFD in UTF-8), step
 * src forward by one byte and jump to copy_utf8.
 * ALLOW_INVALID_UNICODE: step over the byte without rewriting it.
 * Otherwise: fail with the UTF-8 error message at src.
 *
 * NOTE(review): after dst = src the three stores cover src[0], src[1] and
 * src[2], while src is advanced by only one byte. The two bytes that src
 * addresses next have therefore been overwritten with BF BD before they
 * are read. This looks like an in-place overlap defect - confirm. */
48574952 if (unlikely (pos == src )) {
4953+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
4954+ dst = src ;
4955+ * dst ++ = 0xEF ; * dst ++ = 0xBF ; * dst ++ = 0xBD ;
4956+ ++ src ;
4957+ goto copy_utf8 ;
4958+ }
48584959 if (has_allow (INVALID_UNICODE )) ++ src ;
48594960 else return_err (src , "invalid UTF-8 encoding in string" );
48604961 }
@@ -4876,7 +4977,7 @@ static_inline bool read_str_opt(u8 quo, u8 **ptr, u8 *eof, yyjson_read_flag flg,
48764977 case 't' : * dst ++ = '\t' ; src ++ ; break ;
48774978 case 'u' :
48784979 src -- ;
4879- if (!read_uni_esc (& src , & dst , msg )) return_err (src , * msg );
4980+ if (!read_uni_esc (& src , & dst , msg , flg )) return_err (src , * msg );
48804981 break ;
48814982 default : {
48824983 if (has_allow (EXT_ESCAPE )) {
@@ -4935,11 +5036,17 @@ static_inline bool read_str_opt(u8 quo, u8 **ptr, u8 *eof, yyjson_read_flag flg,
49355036 if (con ) con [0 ] = con [1 ] = NULL ;
49365037 return true;
49375038 } else {
4938- if (!has_allow (INVALID_UNICODE )) {
4939- return_err (src , "unexpected control character in string" );
5039+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
5040+ if (src >= eof ) return_err (src , "unclosed string" );
5041+ * dst ++ = 0xEF ; * dst ++ = 0xBF ; * dst ++ = 0xBD ;
5042+ src ++ ;
5043+ } else {
5044+ if (!has_allow (INVALID_UNICODE )) {
5045+ return_err (src , "unexpected control character in string" );
5046+ }
5047+ if (src >= eof ) return_err (src , "unclosed string" );
5048+ * dst ++ = * src ++ ;
49405049 }
4941- if (src >= eof ) return_err (src , "unclosed string" );
4942- * dst ++ = * src ++ ;
49435050 }
49445051
49455052copy_ascii :
@@ -5027,6 +5134,11 @@ static_inline bool read_str_opt(u8 quo, u8 **ptr, u8 *eof, yyjson_read_flag flg,
50275134 }
50285135#endif
50295136 if (unlikely (pos == src )) {
/* Replace mode: store EF BF BD (U+FFFD in UTF-8) through dst, skip exactly
 * one input byte and resume at copy_utf8.
 * NOTE(review): three bytes are written for one byte consumed. If dst
 * trails src by fewer than two bytes at this point, the stores reach input
 * that has not been read yet - confirm the dst/src distance guaranteed by
 * the code outside this excerpt. */
5137+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
5138+ * dst ++ = 0xEF ; * dst ++ = 0xBF ; * dst ++ = 0xBD ;
5139+ ++ src ;
5140+ goto copy_utf8 ;
5141+ }
50305142 if (!has_allow (INVALID_UNICODE )) {
50315143 return_err (src , MSG_ERR_UTF8 );
50325144 }
@@ -5131,7 +5243,7 @@ static_noinline bool read_str_id(u8 **ptr, u8 *eof, yyjson_read_flag flg,
51315243 dst = src ;
51325244copy_escape :
51335245 if (byte_match_2 (src , "\\u" )) {
5134- if (!read_uni_esc (& src , & dst , msg )) return_err (src , * msg );
5246+ if (!read_uni_esc (& src , & dst , msg , flg )) return_err (src , * msg );
51355247 } else {
51365248 if (!char_is_id_next (* src )) return_suc (dst , src );
51375249 return_err (src , "unexpected character in key" );
@@ -5183,10 +5295,17 @@ static_noinline bool read_str_id(u8 **ptr, u8 *eof, yyjson_read_flag flg,
51835295 dst += 4 ; src += 4 ;
51845296 } else {
51855297#if !YYJSON_DISABLE_UTF8_VALIDATION
5186- if (!has_allow (INVALID_UNICODE )) return_err (src , MSG_ERR_UTF8 );
5298+ if (!has_allow (INVALID_UNICODE ) &&
5299+ !has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 ))
5300+ return_err (src , MSG_ERR_UTF8 );
51875301#endif
5188- * dst = * src ;
5189- dst += 1 ; src += 1 ;
5302+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
5303+ * dst ++ = 0xEF ; * dst ++ = 0xBF ; * dst ++ = 0xBD ;
5304+ src += 1 ;
5305+ } else {
5306+ * dst = * src ;
5307+ dst += 1 ; src += 1 ;
5308+ }
51905309 }
51915310 }
51925311 if (char_is_id_ascii (* src )) goto copy_ascii ;
@@ -6206,6 +6325,13 @@ yyjson_doc *yyjson_read_opts(char *dat, usize len,
62066325 }
62076326 memset (eof , 0 , YYJSON_PADDING_SIZE );
62086327
6328+ /* replacement has highest precedence: tolerate and replace all invalid
6329+ sequences so that the final output is always valid UTF-8 */
6330+ if (has_rflag (flg , YYJSON_READ_REPLACE_INVALID_UNICODE , 1 )) {
6331+ flg |= YYJSON_READ_ALLOW_INVALID_UNICODE ;
6332+ flg |= YYJSON_READ_ALLOW_INVALID_SURROGATE ;
6333+ }
6334+
62096335 if (has_allow (BOM )) {
62106336 if (len >= 3 && is_utf8_bom (cur )) cur += 3 ;
62116337 }
@@ -6488,6 +6614,8 @@ yyjson_incr_state *yyjson_incr_new(char *buf, size_t buf_len,
/* Clear the listed reader flags from flg before it is used further in
 * this function. The last two lines additionally clear
 * ALLOW_INVALID_SURROGATE and REPLACE_INVALID_UNICODE, so a caller that
 * passes either flag gets it dropped here.
 * NOTE(review): presumably these options are unsupported by the
 * incremental reader - confirm, and consider documenting it for callers
 * since the flags are discarded without any error. */
64886614 flg &= ~YYJSON_READ_JSON5 ;
64896615 flg &= ~YYJSON_READ_ALLOW_BOM ;
64906616 flg &= ~YYJSON_READ_ALLOW_INVALID_UNICODE ;
6617+ flg &= ~YYJSON_READ_ALLOW_INVALID_SURROGATE ;
6618+ flg &= ~YYJSON_READ_REPLACE_INVALID_UNICODE ;
64916619
64926620 if (unlikely (!buf )) return NULL ;
64936621 if (unlikely (buf_len >= USIZE_MAX - YYJSON_PADDING_SIZE )) return NULL ;
0 commit comments