5353	(all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
5454} while (0)
5555
/* Report an ill-formed multi-byte sequence without touching any status flag:
 * store pos + advance through the enclosing function's `cursor` pointer and
 * make that function return 0. Both arguments are parenthesised so that an
 * expression passed as `pos` cannot change the meaning of the addition. */
#define MB_FAILURE_NO_STATUS(pos, advance) do { \
	*cursor = (pos) + (advance); \
	return 0; \
} while (0)

/* Same as MB_FAILURE_NO_STATUS, but first stores FAILURE through the
 * enclosing function's `status` pointer. */
#define MB_FAILURE(pos, advance) do { \
	*status = FAILURE; \
	MB_FAILURE_NO_STATUS(pos, advance); \
} while (0)

/* Non-zero when at least chars_need bytes remain starting at offset pos.
 * Reads `str_len` from the enclosing scope. */
#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))
6367
6468/* valid as single byte character or leading byte */ 
@@ -85,6 +89,85 @@ static char *get_default_charset(void) {
8589}
8690/* }}} */ 
8791
92+ PHPAPI  unsignedint  php_next_utf8_char_ex (
93+ 		const  unsigned char   * str ,
94+ 		unsigned char   c ,
95+ 		size_t  str_len ,
96+ 		size_t  * cursor )
97+ {
98+ 	size_t  pos  =  * cursor ;
99+ 	unsigned int   this_char  =  0 ;
100+ 
101+ 	/* We'll follow strategy 2. from section 3.6.1 of UTR #36: 
102+ 	 * "In a reported illegal byte sequence, do not include any 
103+ 	 *  non-initial byte that encodes a valid character or is a leading 
104+ 	 *  byte for a valid sequence." */ 
105+ 
106+ 	ZEND_ASSERT (c  >= 0x80 );
107+ 
108+ 	if  (UNEXPECTED (c  <  0xc2 )) {
109+ 		MB_FAILURE_NO_STATUS (pos , 1 );
110+ 	} else  if  (c  <  0xe0 ) {
111+ 		if  (UNEXPECTED (!CHECK_LEN (pos , 2 )))
112+ 			MB_FAILURE_NO_STATUS (pos , 1 );
113+ 
114+ 		if  (UNEXPECTED (!utf8_trail (str [pos  +  1 ]))) {
115+ 			MB_FAILURE_NO_STATUS (pos , utf8_lead (str [pos  +  1 ]) ? 1  : 2 );
116+ 		}
117+ 		this_char  =  ((c  &  0x1f ) << 6 ) | (str [pos  +  1 ] &  0x3f );
118+ 		if  (UNEXPECTED (this_char  <  0x80 )) { /* non-shortest form */ 
119+ 			MB_FAILURE_NO_STATUS (pos , 2 );
120+ 		}
121+ 		pos  +=  2 ;
122+ 	} else  if  (c  <  0xf0 ) {
123+ 		size_t  avail  =  str_len  -  pos ;
124+ 
125+ 		if  (UNEXPECTED (avail  <  3  || 
126+ 				!utf8_trail (str [pos  +  1 ]) ||  !utf8_trail (str [pos  +  2 ]))) {
127+ 			if  (avail  <  2  ||  utf8_lead (str [pos  +  1 ]))
128+ 				MB_FAILURE_NO_STATUS (pos , 1 );
129+ 			else  if  (avail  <  3  ||  utf8_lead (str [pos  +  2 ]))
130+ 				MB_FAILURE_NO_STATUS (pos , 2 );
131+ 			else 
132+ 				MB_FAILURE_NO_STATUS (pos , 3 );
133+ 		}
134+ 
135+ 		this_char  =  ((c  &  0x0f ) << 12 ) | ((str [pos  +  1 ] &  0x3f ) << 6 ) | (str [pos  +  2 ] &  0x3f );
136+ 		if  (UNEXPECTED (this_char  <  0x800 )) { /* non-shortest form */ 
137+ 			MB_FAILURE_NO_STATUS (pos , 3 );
138+ 		} else  if  (UNEXPECTED (this_char  >= 0xd800  &&  this_char  <= 0xdfff )) { /* surrogate */ 
139+ 			MB_FAILURE_NO_STATUS (pos , 3 );
140+ 		}
141+ 		pos  +=  3 ;
142+ 	} else  if  (c  <  0xf5 ) {
143+ 		size_t  avail  =  str_len  -  pos ;
144+ 
145+ 		if  (UNEXPECTED (avail  <  4  || 
146+ 				!utf8_trail (str [pos  +  1 ]) ||  !utf8_trail (str [pos  +  2 ]) || 
147+ 				!utf8_trail (str [pos  +  3 ]))) {
148+ 			if  (avail  <  2  ||  utf8_lead (str [pos  +  1 ]))
149+ 				MB_FAILURE_NO_STATUS (pos , 1 );
150+ 			else  if  (avail  <  3  ||  utf8_lead (str [pos  +  2 ]))
151+ 				MB_FAILURE_NO_STATUS (pos , 2 );
152+ 			else  if  (avail  <  4  ||  utf8_lead (str [pos  +  3 ]))
153+ 				MB_FAILURE_NO_STATUS (pos , 3 );
154+ 			else 
155+ 				MB_FAILURE_NO_STATUS (pos , 4 );
156+ 		}
157+ 
158+ 		this_char  =  ((c  &  0x07 ) << 18 ) | ((str [pos  +  1 ] &  0x3f ) << 12 ) | ((str [pos  +  2 ] &  0x3f ) << 6 ) | (str [pos  +  3 ] &  0x3f );
159+ 		if  (UNEXPECTED (this_char  <  0x10000  ||  this_char  >  0x10FFFF )) { /* non-shortest form or outside range */ 
160+ 			MB_FAILURE_NO_STATUS (pos , 4 );
161+ 		}
162+ 		pos  +=  4 ;
163+ 	} else  {
164+ 		MB_FAILURE_NO_STATUS (pos , 1 );
165+ 	}
166+ 
167+ 	* cursor  =  pos ;
168+ 	return  this_char ;
169+ }
170+ 
88171/* {{{ get_next_char */ 
89172static  inline  unsigned int   get_next_char (
90173		enum  entity_charset  charset ,
@@ -105,72 +188,17 @@ static inline unsigned int get_next_char(
105188	switch  (charset ) {
106189	case  cs_utf_8 :
107190		{
108- 			/* We'll follow strategy 2. from section 3.6.1 of UTR #36: 
109- 			 * "In a reported illegal byte sequence, do not include any 
110- 			 *  non-initial byte that encodes a valid character or is a leading 
111- 			 *  byte for a valid sequence." */ 
112191			unsigned char   c ;
113192			c  =  str [pos ];
114193			if  (c  <  0x80 ) {
115194				this_char  =  c ;
116195				pos ++ ;
117- 			} else  if  (c  <  0xc2 ) {
118- 				MB_FAILURE (pos , 1 );
119- 			} else  if  (c  <  0xe0 ) {
120- 				if  (!CHECK_LEN (pos , 2 ))
121- 					MB_FAILURE (pos , 1 );
122- 
123- 				if  (!utf8_trail (str [pos  +  1 ])) {
124- 					MB_FAILURE (pos , utf8_lead (str [pos  +  1 ]) ? 1  : 2 );
125- 				}
126- 				this_char  =  ((c  &  0x1f ) << 6 ) | (str [pos  +  1 ] &  0x3f );
127- 				if  (this_char  <  0x80 ) { /* non-shortest form */ 
128- 					MB_FAILURE (pos , 2 );
129- 				}
130- 				pos  +=  2 ;
131- 			} else  if  (c  <  0xf0 ) {
132- 				size_t  avail  =  str_len  -  pos ;
133- 
134- 				if  (avail  <  3  || 
135- 						!utf8_trail (str [pos  +  1 ]) ||  !utf8_trail (str [pos  +  2 ])) {
136- 					if  (avail  <  2  ||  utf8_lead (str [pos  +  1 ]))
137- 						MB_FAILURE (pos , 1 );
138- 					else  if  (avail  <  3  ||  utf8_lead (str [pos  +  2 ]))
139- 						MB_FAILURE (pos , 2 );
140- 					else 
141- 						MB_FAILURE (pos , 3 );
142- 				}
143- 
144- 				this_char  =  ((c  &  0x0f ) << 12 ) | ((str [pos  +  1 ] &  0x3f ) << 6 ) | (str [pos  +  2 ] &  0x3f );
145- 				if  (this_char  <  0x800 ) { /* non-shortest form */ 
146- 					MB_FAILURE (pos , 3 );
147- 				} else  if  (this_char  >= 0xd800  &&  this_char  <= 0xdfff ) { /* surrogate */ 
148- 					MB_FAILURE (pos , 3 );
149- 				}
150- 				pos  +=  3 ;
151- 			} else  if  (c  <  0xf5 ) {
152- 				size_t  avail  =  str_len  -  pos ;
153- 
154- 				if  (avail  <  4  || 
155- 						!utf8_trail (str [pos  +  1 ]) ||  !utf8_trail (str [pos  +  2 ]) || 
156- 						!utf8_trail (str [pos  +  3 ])) {
157- 					if  (avail  <  2  ||  utf8_lead (str [pos  +  1 ]))
158- 						MB_FAILURE (pos , 1 );
159- 					else  if  (avail  <  3  ||  utf8_lead (str [pos  +  2 ]))
160- 						MB_FAILURE (pos , 2 );
161- 					else  if  (avail  <  4  ||  utf8_lead (str [pos  +  3 ]))
162- 						MB_FAILURE (pos , 3 );
163- 					else 
164- 						MB_FAILURE (pos , 4 );
165- 				}
166- 
167- 				this_char  =  ((c  &  0x07 ) << 18 ) | ((str [pos  +  1 ] &  0x3f ) << 12 ) | ((str [pos  +  2 ] &  0x3f ) << 6 ) | (str [pos  +  3 ] &  0x3f );
168- 				if  (this_char  <  0x10000  ||  this_char  >  0x10FFFF ) { /* non-shortest form or outside range */ 
169- 					MB_FAILURE (pos , 4 );
170- 				}
171- 				pos  +=  4 ;
172196			} else  {
173- 				MB_FAILURE (pos , 1 );
197+ 				this_char  =  php_next_utf8_char_ex (str , c , str_len , cursor );
198+ 				if  (UNEXPECTED (this_char  ==  0 )) {
199+ 					* status  =  FAILURE ;
200+ 				}
201+ 				return  this_char ;
174202			}
175203		}
176204		break ;
0 commit comments