Skip to content

Commit bc48fb8

Browse files
committed
wip
1 parent d4297de commit bc48fb8

File tree

3 files changed

+94
-65
lines changed

3 files changed

+94
-65
lines changed

ext/json/json_encoder.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,7 @@ typedef enum php_json_simd_result {
544544
static zend_always_inline php_json_simd_result php_json_process_simd_block(
545545
smart_str *buf,
546546
const __m128i sse_escape_mask,
547-
const char **s,
547+
const char **restrict s,
548548
size_t *restrict pos,
549549
size_t *restrict len,
550550
int options
@@ -666,6 +666,7 @@ zend_result php_json_escape_string(
666666

667667
php_json_simd_result result = PHP_JSON_SLOW;
668668
#ifdef JSON_USE_SIMD
669+
// TODO: html.c change (incl. UNEXPECTED) & maybe manually inline this again?
669670
result = php_json_process_simd_block(buf, sse_escape_mask, &s, &pos, &len, options);
670671
if (UNEXPECTED(result == PHP_JSON_STOP)) {
671672
break;
@@ -678,11 +679,10 @@ zend_result php_json_escape_string(
678679
len--;
679680
} else {
680681
if (UNEXPECTED(us >= 0x80)) {
681-
zend_result status;
682682
size_t pos_old = pos;
683683
const char *cur = s + pos;
684684
pos = 0;
685-
us = php_next_utf8_char((unsigned char *)cur, len, &pos, &status);
685+
us = php_next_utf8_char_ex((unsigned char *)cur, us, len, &pos);
686686
len -= pos;
687687
pos += pos_old;
688688

ext/standard/html.c

Lines changed: 90 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,16 @@
5353
(all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
5454
} while (0)
5555

56-
#define MB_FAILURE(pos, advance) do { \
56+
#define MB_FAILURE_NO_STATUS(pos, advance) do { \
5757
*cursor = pos + (advance); \
58-
*status = FAILURE; \
5958
return 0; \
6059
} while (0)
6160

61+
#define MB_FAILURE(pos, advance) do { \
62+
*status = FAILURE; \
63+
MB_FAILURE_NO_STATUS(pos, advance); \
64+
} while (0)
65+
6266
#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))
6367

6468
/* valid as single byte character or leading byte */
@@ -85,6 +89,85 @@ static char *get_default_charset(void) {
8589
}
8690
/* }}} */
8791

92+
/* Decode one multi-byte UTF-8 sequence starting at str[*cursor].
 *
 * str      input buffer
 * c        lead byte of the sequence; it is used instead of re-reading
 *          str[*cursor], so the caller is presumably expected to pass
 *          c == str[*cursor] (both visible callers do) and c >= 0x80
 * str_len  total length of str, used for the bounds checks
 * cursor   in: offset of the lead byte; out: offset just past the bytes
 *          that were consumed (on success and on failure)
 *
 * Returns the decoded code point on success. On any malformed, truncated,
 * non-shortest-form, surrogate or out-of-range sequence it returns 0 via
 * MB_FAILURE_NO_STATUS, after advancing *cursor past the rejected bytes.
 * There is no status out-parameter: every successful decode here yields a
 * value >= 0x80, so a return of 0 is the failure signal. */
PHPAPI unsigned int php_next_utf8_char_ex(
93+
const unsigned char *str,
94+
unsigned char c,
95+
size_t str_len,
96+
size_t *cursor)
97+
{
98+
size_t pos = *cursor;
99+
unsigned int this_char = 0;
100+
101+
/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
102+
* "In a reported illegal byte sequence, do not include any
103+
* non-initial byte that encodes a valid character or is a leading
104+
* byte for a valid sequence." */
105+
106+
/* Single-byte (c < 0x80) input is handled by the callers before they get here. */
ZEND_ASSERT(c >= 0x80);
107+
108+
/* 0x80..0xc1 is rejected as a lead byte: consume only this byte. */
if (UNEXPECTED(c < 0xc2)) {
109+
MB_FAILURE_NO_STATUS(pos, 1);
110+
} else if (c < 0xe0) {
/* Lead byte 0xc2..0xdf: two-byte sequence. */
111+
if (UNEXPECTED(!CHECK_LEN(pos, 2)))
112+
MB_FAILURE_NO_STATUS(pos, 1);
113+
114+
/* If the second byte is not a trail byte, leave it unconsumed when
 * utf8_lead() holds for it, so it can be decoded on its own next. */
if (UNEXPECTED(!utf8_trail(str[pos + 1]))) {
115+
MB_FAILURE_NO_STATUS(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
116+
}
117+
this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
118+
if (UNEXPECTED(this_char < 0x80)) { /* non-shortest form */
119+
MB_FAILURE_NO_STATUS(pos, 2);
120+
}
121+
pos += 2;
122+
} else if (c < 0xf0) {
/* Lead byte 0xe0..0xef: three-byte sequence. */
123+
size_t avail = str_len - pos;
124+
125+
/* The avail check comes first so the trail-byte reads are short-circuited
 * when the buffer is too short. */
if (UNEXPECTED(avail < 3 ||
126+
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]))) {
/* Consume up to, but not including, the first byte that is missing
 * or for which utf8_lead() holds. */
127+
if (avail < 2 || utf8_lead(str[pos + 1]))
128+
MB_FAILURE_NO_STATUS(pos, 1);
129+
else if (avail < 3 || utf8_lead(str[pos + 2]))
130+
MB_FAILURE_NO_STATUS(pos, 2);
131+
else
132+
MB_FAILURE_NO_STATUS(pos, 3);
133+
}
134+
135+
this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
136+
if (UNEXPECTED(this_char < 0x800)) { /* non-shortest form */
137+
MB_FAILURE_NO_STATUS(pos, 3);
138+
} else if (UNEXPECTED(this_char >= 0xd800 && this_char <= 0xdfff)) { /* surrogate */
139+
MB_FAILURE_NO_STATUS(pos, 3);
140+
}
141+
pos += 3;
142+
} else if (c < 0xf5) {
/* Lead byte 0xf0..0xf4: four-byte sequence. */
143+
size_t avail = str_len - pos;
144+
145+
/* Same scheme as the three-byte case, extended by one trail byte. */
if (UNEXPECTED(avail < 4 ||
146+
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
147+
!utf8_trail(str[pos + 3]))) {
148+
if (avail < 2 || utf8_lead(str[pos + 1]))
149+
MB_FAILURE_NO_STATUS(pos, 1);
150+
else if (avail < 3 || utf8_lead(str[pos + 2]))
151+
MB_FAILURE_NO_STATUS(pos, 2);
152+
else if (avail < 4 || utf8_lead(str[pos + 3]))
153+
MB_FAILURE_NO_STATUS(pos, 3);
154+
else
155+
MB_FAILURE_NO_STATUS(pos, 4);
156+
}
157+
158+
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
159+
if (UNEXPECTED(this_char < 0x10000 || this_char > 0x10FFFF)) { /* non-shortest form or outside range */
160+
MB_FAILURE_NO_STATUS(pos, 4);
161+
}
162+
pos += 4;
163+
} else {
/* 0xf5..0xff is rejected as a lead byte: consume only this byte. */
164+
MB_FAILURE_NO_STATUS(pos, 1);
165+
}
166+
167+
/* Success: report the offset just past the decoded sequence. */
*cursor = pos;
168+
return this_char;
169+
}
170+
88171
/* {{{ get_next_char */
89172
static inline unsigned int get_next_char(
90173
enum entity_charset charset,
@@ -105,72 +188,17 @@ static inline unsigned int get_next_char(
105188
switch (charset) {
106189
case cs_utf_8:
107190
{
108-
/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
109-
* "In a reported illegal byte sequence, do not include any
110-
* non-initial byte that encodes a valid character or is a leading
111-
* byte for a valid sequence." */
112191
unsigned char c;
113192
c = str[pos];
114193
if (c < 0x80) {
115194
this_char = c;
116195
pos++;
117-
} else if (c < 0xc2) {
118-
MB_FAILURE(pos, 1);
119-
} else if (c < 0xe0) {
120-
if (!CHECK_LEN(pos, 2))
121-
MB_FAILURE(pos, 1);
122-
123-
if (!utf8_trail(str[pos + 1])) {
124-
MB_FAILURE(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
125-
}
126-
this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
127-
if (this_char < 0x80) { /* non-shortest form */
128-
MB_FAILURE(pos, 2);
129-
}
130-
pos += 2;
131-
} else if (c < 0xf0) {
132-
size_t avail = str_len - pos;
133-
134-
if (avail < 3 ||
135-
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2])) {
136-
if (avail < 2 || utf8_lead(str[pos + 1]))
137-
MB_FAILURE(pos, 1);
138-
else if (avail < 3 || utf8_lead(str[pos + 2]))
139-
MB_FAILURE(pos, 2);
140-
else
141-
MB_FAILURE(pos, 3);
142-
}
143-
144-
this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
145-
if (this_char < 0x800) { /* non-shortest form */
146-
MB_FAILURE(pos, 3);
147-
} else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
148-
MB_FAILURE(pos, 3);
149-
}
150-
pos += 3;
151-
} else if (c < 0xf5) {
152-
size_t avail = str_len - pos;
153-
154-
if (avail < 4 ||
155-
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
156-
!utf8_trail(str[pos + 3])) {
157-
if (avail < 2 || utf8_lead(str[pos + 1]))
158-
MB_FAILURE(pos, 1);
159-
else if (avail < 3 || utf8_lead(str[pos + 2]))
160-
MB_FAILURE(pos, 2);
161-
else if (avail < 4 || utf8_lead(str[pos + 3]))
162-
MB_FAILURE(pos, 3);
163-
else
164-
MB_FAILURE(pos, 4);
165-
}
166-
167-
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
168-
if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */
169-
MB_FAILURE(pos, 4);
170-
}
171-
pos += 4;
172196
} else {
173-
MB_FAILURE(pos, 1);
197+
this_char = php_next_utf8_char_ex(str, c, str_len, cursor);
198+
if (UNEXPECTED(this_char == 0)) {
199+
*status = FAILURE;
200+
}
201+
return this_char;
174202
}
175203
}
176204
break;

ext/standard/html.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,5 +48,6 @@ PHPAPI zend_string *php_escape_html_entities(const unsigned char *old, size_t ol
4848
PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t oldlen, int all, int flags, const char *hint_charset, bool double_encode, bool quiet);
4949
PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int flags, const char *hint_charset);
5050
PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, zend_result *status);
51+
PHPAPI unsigned int php_next_utf8_char_ex(const unsigned char *str, unsigned char c, size_t str_len, size_t *cursor);
5152

5253
#endif /* HTML_H */

0 commit comments

Comments
 (0)