Skip to content

Commit baff322

Browse files
committed
lib: yyjson: add Unicode replacement and surrogate tolerance flags
Upstream PR: ibireme/yyjson#227 Signed-off-by: Eduardo Silva <[email protected]>
1 parent 64eb411 commit baff322

File tree

3 files changed

+199
-22
lines changed

3 files changed

+199
-22
lines changed

lib/yyjson-0.12.0/src/yyjson.c

Lines changed: 142 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4657,7 +4657,8 @@ static_inline bool read_num(u8 **ptr, u8 **pre, yyjson_read_flag flg,
46574657
*============================================================================*/
46584658

46594659
/** Read unicode escape sequence. */
4660-
static_inline bool read_uni_esc(u8 **src_ptr, u8 **dst_ptr, const char **msg) {
4660+
static_inline bool read_uni_esc(u8 **src_ptr, u8 **dst_ptr,
4661+
const char **msg, yyjson_read_flag flg) {
46614662
#define return_err(_end, _msg) *msg = _msg; *src_ptr = _end; return false
46624663

46634664
u8 *src = *src_ptr;
@@ -4667,6 +4668,15 @@ static_inline bool read_uni_esc(u8 **src_ptr, u8 **dst_ptr, const char **msg) {
46674668

46684669
src += 2; /* skip `\u` */
46694670
if (unlikely(!hex_load_4(src, &hi))) {
4671+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
4672+
usize cnt = 0;
4673+
while (cnt < 4 && char_is_hex(src[cnt])) cnt++;
4674+
src += cnt;
4675+
*dst++ = 0xEF; *dst++ = 0xBF; *dst++ = 0xBD;
4676+
*src_ptr = src;
4677+
*dst_ptr = dst;
4678+
return true;
4679+
}
46704680
return_err(src - 2, "invalid escaped sequence in string");
46714681
}
46724682
src += 4; /* skip hex */
@@ -4682,18 +4692,83 @@ static_inline bool read_uni_esc(u8 **src_ptr, u8 **dst_ptr, const char **msg) {
46824692
} else {
46834693
*dst++ = (u8)hi;
46844694
}
4685-
} else {
4695+
} else if ((hi & 0xFC00) == 0xD800) {
46864696
/* a non-BMP character, represented as a surrogate pair */
4687-
if (unlikely((hi & 0xFC00) != 0xD800)) {
4688-
return_err(src - 6, "invalid high surrogate in string");
4689-
}
46904697
if (unlikely(!byte_match_2(src, "\\u"))) {
4698+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
4699+
*dst++ = 0xEF; *dst++ = 0xBF; *dst++ = 0xBD;
4700+
*src_ptr = src;
4701+
*dst_ptr = dst;
4702+
return true;
4703+
}
4704+
if (has_allow(INVALID_SURROGATE)) {
4705+
if (hi >= 0x800) {
4706+
*dst++ = (u8)(0xE0 | (hi >> 12));
4707+
*dst++ = (u8)(0x80 | ((hi >> 6) & 0x3F));
4708+
*dst++ = (u8)(0x80 | (hi & 0x3F));
4709+
} else if (hi >= 0x80) {
4710+
*dst++ = (u8)(0xC0 | (hi >> 6));
4711+
*dst++ = (u8)(0x80 | (hi & 0x3F));
4712+
} else {
4713+
*dst++ = (u8)hi;
4714+
}
4715+
*src_ptr = src;
4716+
*dst_ptr = dst;
4717+
return true;
4718+
}
46914719
return_err(src - 6, "no low surrogate in string");
46924720
}
46934721
if (unlikely(!hex_load_4(src + 2, &lo))) {
4722+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
4723+
usize cnt = 0;
4724+
src += 2; /* skip \u */
4725+
while (cnt < 4 && char_is_hex(src[cnt])) cnt++;
4726+
src += cnt;
4727+
*dst++ = 0xEF; *dst++ = 0xBF; *dst++ = 0xBD;
4728+
*src_ptr = src;
4729+
*dst_ptr = dst;
4730+
return true;
4731+
}
4732+
if (has_allow(INVALID_SURROGATE)) {
4733+
if (hi >= 0x800) {
4734+
*dst++ = (u8)(0xE0 | (hi >> 12));
4735+
*dst++ = (u8)(0x80 | ((hi >> 6) & 0x3F));
4736+
*dst++ = (u8)(0x80 | (hi & 0x3F));
4737+
} else if (hi >= 0x80) {
4738+
*dst++ = (u8)(0xC0 | (hi >> 6));
4739+
*dst++ = (u8)(0x80 | (hi & 0x3F));
4740+
} else {
4741+
*dst++ = (u8)hi;
4742+
}
4743+
*src_ptr = src;
4744+
*dst_ptr = dst;
4745+
return true;
4746+
}
46944747
return_err(src - 6, "invalid escape in string");
46954748
}
46964749
if (unlikely((lo & 0xFC00) != 0xDC00)) {
4750+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
4751+
src += 6;
4752+
*dst++ = 0xEF; *dst++ = 0xBF; *dst++ = 0xBD;
4753+
*src_ptr = src;
4754+
*dst_ptr = dst;
4755+
return true;
4756+
}
4757+
if (has_allow(INVALID_SURROGATE)) {
4758+
if (hi >= 0x800) {
4759+
*dst++ = (u8)(0xE0 | (hi >> 12));
4760+
*dst++ = (u8)(0x80 | ((hi >> 6) & 0x3F));
4761+
*dst++ = (u8)(0x80 | (hi & 0x3F));
4762+
} else if (hi >= 0x80) {
4763+
*dst++ = (u8)(0xC0 | (hi >> 6));
4764+
*dst++ = (u8)(0x80 | (hi & 0x3F));
4765+
} else {
4766+
*dst++ = (u8)hi;
4767+
}
4768+
*src_ptr = src;
4769+
*dst_ptr = dst;
4770+
return true;
4771+
}
46974772
return_err(src - 6, "invalid low surrogate in string");
46984773
}
46994774
uni = ((((u32)hi - 0xD800) << 10) |
@@ -4703,6 +4778,26 @@ static_inline bool read_uni_esc(u8 **src_ptr, u8 **dst_ptr, const char **msg) {
47034778
*dst++ = (u8)(0x80 | ((uni >> 6) & 0x3F));
47044779
*dst++ = (u8)(0x80 | (uni & 0x3F));
47054780
src += 6;
4781+
} else { /* low surrogate without preceding high surrogate */
4782+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
4783+
*dst++ = 0xEF; *dst++ = 0xBF; *dst++ = 0xBD;
4784+
*src_ptr = src;
4785+
*dst_ptr = dst;
4786+
return true;
4787+
}
4788+
if (!has_allow(INVALID_SURROGATE)) {
4789+
return_err(src - 6, "invalid low surrogate in string");
4790+
}
4791+
if (hi >= 0x800) {
4792+
*dst++ = (u8)(0xE0 | (hi >> 12));
4793+
*dst++ = (u8)(0x80 | ((hi >> 6) & 0x3F));
4794+
*dst++ = (u8)(0x80 | (hi & 0x3F));
4795+
} else if (hi >= 0x80) {
4796+
*dst++ = (u8)(0xC0 | (hi >> 6));
4797+
*dst++ = (u8)(0x80 | (hi & 0x3F));
4798+
} else {
4799+
*dst++ = (u8)hi;
4800+
}
47064801
}
47074802
*src_ptr = src;
47084803
*dst_ptr = dst;
@@ -4855,6 +4950,12 @@ static_inline bool read_str_opt(u8 quo, u8 **ptr, u8 *eof, yyjson_read_flag flg,
48554950
}
48564951
#endif
48574952
if (unlikely(pos == src)) {
4953+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
4954+
dst = src;
4955+
*dst++ = 0xEF; *dst++ = 0xBF; *dst++ = 0xBD;
4956+
++src;
4957+
goto copy_utf8;
4958+
}
48584959
if (has_allow(INVALID_UNICODE)) ++src;
48594960
else return_err(src, "invalid UTF-8 encoding in string");
48604961
}
@@ -4876,7 +4977,7 @@ static_inline bool read_str_opt(u8 quo, u8 **ptr, u8 *eof, yyjson_read_flag flg,
48764977
case 't': *dst++ = '\t'; src++; break;
48774978
case 'u':
48784979
src--;
4879-
if (!read_uni_esc(&src, &dst, msg)) return_err(src, *msg);
4980+
if (!read_uni_esc(&src, &dst, msg, flg)) return_err(src, *msg);
48804981
break;
48814982
default: {
48824983
if (has_allow(EXT_ESCAPE)) {
@@ -4935,11 +5036,17 @@ static_inline bool read_str_opt(u8 quo, u8 **ptr, u8 *eof, yyjson_read_flag flg,
49355036
if (con) con[0] = con[1] = NULL;
49365037
return true;
49375038
} else {
4938-
if (!has_allow(INVALID_UNICODE)) {
4939-
return_err(src, "unexpected control character in string");
5039+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
5040+
if (src >= eof) return_err(src, "unclosed string");
5041+
*dst++ = 0xEF; *dst++ = 0xBF; *dst++ = 0xBD;
5042+
src++;
5043+
} else {
5044+
if (!has_allow(INVALID_UNICODE)) {
5045+
return_err(src, "unexpected control character in string");
5046+
}
5047+
if (src >= eof) return_err(src, "unclosed string");
5048+
*dst++ = *src++;
49405049
}
4941-
if (src >= eof) return_err(src, "unclosed string");
4942-
*dst++ = *src++;
49435050
}
49445051

49455052
copy_ascii:
@@ -5027,6 +5134,11 @@ static_inline bool read_str_opt(u8 quo, u8 **ptr, u8 *eof, yyjson_read_flag flg,
50275134
}
50285135
#endif
50295136
if (unlikely(pos == src)) {
5137+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
5138+
*dst++ = 0xEF; *dst++ = 0xBF; *dst++ = 0xBD;
5139+
++src;
5140+
goto copy_utf8;
5141+
}
50305142
if (!has_allow(INVALID_UNICODE)) {
50315143
return_err(src, MSG_ERR_UTF8);
50325144
}
@@ -5131,7 +5243,7 @@ static_noinline bool read_str_id(u8 **ptr, u8 *eof, yyjson_read_flag flg,
51315243
dst = src;
51325244
copy_escape:
51335245
if (byte_match_2(src, "\\u")) {
5134-
if (!read_uni_esc(&src, &dst, msg)) return_err(src, *msg);
5246+
if (!read_uni_esc(&src, &dst, msg, flg)) return_err(src, *msg);
51355247
} else {
51365248
if (!char_is_id_next(*src)) return_suc(dst, src);
51375249
return_err(src, "unexpected character in key");
@@ -5183,10 +5295,17 @@ static_noinline bool read_str_id(u8 **ptr, u8 *eof, yyjson_read_flag flg,
51835295
dst += 4; src += 4;
51845296
} else {
51855297
#if !YYJSON_DISABLE_UTF8_VALIDATION
5186-
if (!has_allow(INVALID_UNICODE)) return_err(src, MSG_ERR_UTF8);
5298+
if (!has_allow(INVALID_UNICODE) &&
5299+
!has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1))
5300+
return_err(src, MSG_ERR_UTF8);
51875301
#endif
5188-
*dst = *src;
5189-
dst += 1; src += 1;
5302+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
5303+
*dst++ = 0xEF; *dst++ = 0xBF; *dst++ = 0xBD;
5304+
src += 1;
5305+
} else {
5306+
*dst = *src;
5307+
dst += 1; src += 1;
5308+
}
51905309
}
51915310
}
51925311
if (char_is_id_ascii(*src)) goto copy_ascii;
@@ -6206,6 +6325,13 @@ yyjson_doc *yyjson_read_opts(char *dat, usize len,
62066325
}
62076326
memset(eof, 0, YYJSON_PADDING_SIZE);
62086327

6328+
/* replacement has highest precedence: tolerate and replace all invalid
6329+
sequences so that the output is valid UTF-8 (unless validation is disabled) */
6330+
if (has_rflag(flg, YYJSON_READ_REPLACE_INVALID_UNICODE, 1)) {
6331+
flg |= YYJSON_READ_ALLOW_INVALID_UNICODE;
6332+
flg |= YYJSON_READ_ALLOW_INVALID_SURROGATE;
6333+
}
6334+
62096335
if (has_allow(BOM)) {
62106336
if (len >= 3 && is_utf8_bom(cur)) cur += 3;
62116337
}
@@ -6488,6 +6614,8 @@ yyjson_incr_state *yyjson_incr_new(char *buf, size_t buf_len,
64886614
flg &= ~YYJSON_READ_JSON5;
64896615
flg &= ~YYJSON_READ_ALLOW_BOM;
64906616
flg &= ~YYJSON_READ_ALLOW_INVALID_UNICODE;
6617+
flg &= ~YYJSON_READ_ALLOW_INVALID_SURROGATE;
6618+
flg &= ~YYJSON_READ_REPLACE_INVALID_UNICODE;
64916619

64926620
if (unlikely(!buf)) return NULL;
64936621
if (unlikely(buf_len >= USIZE_MAX - YYJSON_PADDING_SIZE)) return NULL;

lib/yyjson-0.12.0/src/yyjson.h

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -760,14 +760,13 @@ static const yyjson_read_flag YYJSON_READ_ALLOW_INF_AND_NAN = 1 << 4;
760760
inf/nan literal is also read as raw with `ALLOW_INF_AND_NAN` flag. */
761761
static const yyjson_read_flag YYJSON_READ_NUMBER_AS_RAW = 1 << 5;
762762

763-
/** Allow reading invalid unicode when parsing string values (non-standard).
764-
Invalid characters will be allowed to appear in the string values, but
765-
invalid escape sequences will still be reported as errors.
766-
This flag does not affect the performance of correctly encoded strings.
767-
768-
@warning Strings in JSON values may contain incorrect encoding when this
769-
option is used, you need to handle these strings carefully to avoid security
770-
risks. */
763+
/** Allow reading raw invalid UTF-8 bytes in strings (non-standard).
764+
This flag only affects raw bytes; `\uXXXX` escapes still require valid
765+
surrogate pairs unless `YYJSON_READ_ALLOW_INVALID_SURROGATE` is also set.
766+
Invalid escape sequences will still be reported as errors.
767+
768+
@warning Strings may contain ill-formed UTF-8 when this option is used;
769+
you need to handle these strings carefully to avoid security risks. */
771770
static const yyjson_read_flag YYJSON_READ_ALLOW_INVALID_UNICODE = 1 << 6;
772771

773772
/** Read big numbers as raw strings. These big numbers include integers that
@@ -810,6 +809,22 @@ static const yyjson_read_flag YYJSON_READ_ALLOW_SINGLE_QUOTED_STR = 1 << 12;
810809
non-whitespace character with code point above `U+007F`. */
811810
static const yyjson_read_flag YYJSON_READ_ALLOW_UNQUOTED_KEY = 1 << 13;
812811

812+
/** Replace invalid Unicode code units with the replacement character `U+FFFD`
813+
when parsing string values (non-standard).
814+
This flag implicitly enables `YYJSON_READ_ALLOW_INVALID_UNICODE` and
815+
`YYJSON_READ_ALLOW_INVALID_SURROGATE`, so malformed input is tolerated
816+
and replaced without needing those flags.
817+
Note: when compiled with `YYJSON_DISABLE_UTF8_VALIDATION=ON`, invalid
818+
UTF-8 byte sequences are not detected and therefore not replaced. */
819+
static const yyjson_read_flag YYJSON_READ_REPLACE_INVALID_UNICODE = 1 << 14;
820+
821+
/** Allow unpaired surrogate code units in `\uXXXX` escapes (non-standard).
822+
Raw invalid UTF-8 bytes are unaffected; use
823+
`YYJSON_READ_ALLOW_INVALID_UNICODE` for that. When combined with
824+
`YYJSON_READ_REPLACE_INVALID_UNICODE`, the surrogates are replaced with
825+
`U+FFFD`. */
826+
static const yyjson_read_flag YYJSON_READ_ALLOW_INVALID_SURROGATE = 1 << 15;
827+
813828
/** Allow JSON5 format, see: [https://json5.org].
814829
This flag supports all JSON5 features with some additional extensions:
815830
- Accepts more escape sequences than JSON5 (e.g. `\a`, `\e`).

lib/yyjson-0.12.0/test/test_string.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1450,6 +1450,39 @@ static void test_unquoted_key(void) {
14501450
}, YYJSON_READ_ALLOW_INVALID_UNICODE);
14511451
}
14521452

1453+
/*----------------------------------------------------------------------------*/
1454+
/* MARK: - Invalid Unicode Flags */
1455+
/*----------------------------------------------------------------------------*/
1456+
1457+
static void test_invalid_unicode_flags(void) {
1458+
#if !YYJSON_DISABLE_READER
1459+
/* unpaired surrogate */
1460+
char inv_sur_hi[8 + YYJSON_PADDING_SIZE];
1461+
memcpy(inv_sur_hi, "\"\\uD83D\"", 8);
1462+
memset(inv_sur_hi + 8, 0, YYJSON_PADDING_SIZE);
1463+
yyjson_doc *doc = yyjson_read(inv_sur_hi, 8, 0);
1464+
yy_assert(!doc);
1465+
1466+
doc = yyjson_read(inv_sur_hi, 8, YYJSON_READ_ALLOW_INVALID_SURROGATE);
1467+
yy_assert(doc);
1468+
yyjson_val *val = yyjson_doc_get_root(doc);
1469+
const char *str = yyjson_get_str(val);
1470+
yy_assert(str && (u8)str[0] == 0xED && (u8)str[1] == 0xA0 &&
1471+
(u8)str[2] == 0xBD && str[3] == '\0');
1472+
yyjson_doc_free(doc);
1473+
1474+
doc = yyjson_read(inv_sur_hi, 8,
1475+
YYJSON_READ_ALLOW_INVALID_SURROGATE |
1476+
YYJSON_READ_REPLACE_INVALID_UNICODE);
1477+
yy_assert(doc);
1478+
val = yyjson_doc_get_root(doc);
1479+
str = yyjson_get_str(val);
1480+
yy_assert(str && (u8)str[0] == 0xEF && (u8)str[1] == 0xBF &&
1481+
(u8)str[2] == 0xBD && str[3] == '\0');
1482+
yyjson_doc_free(doc);
1483+
#endif
1484+
}
1485+
14531486

14541487

14551488
/*==============================================================================
@@ -1460,5 +1493,6 @@ yy_test_case(test_string) {
14601493
test_read_write();
14611494
test_extended_escape();
14621495
test_single_quoted_string();
1496+
test_invalid_unicode_flags();
14631497
test_unquoted_key();
14641498
}

0 commit comments

Comments
 (0)