@@ -809,149 +809,144 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
809809/* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */
810810#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE (oldlen ) ((oldlen) + (oldlen) / 5 + 2)
811811static void traverse_for_entities (
812- const char * input ,
813- size_t input_len ,
812+ const zend_string * input ,
814813 zend_string * output , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) */
815- int all ,
816- int flags ,
814+ const int all ,
815+ const int flags ,
817816 const entity_ht * inv_map ,
818- enum entity_charset charset )
817+ const enum entity_charset charset )
819818{
820- const char * current_ptr = input ;
821- const char * input_end = input + input_len ; /* terminator address */
822- char * output_ptr = ZSTR_VAL (output );
823- int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
824-
825- assert (* input_end == '\0' );
826-
827- while (current_ptr < input_end ) {
828- const char * ampersand_ptr = memchr (current_ptr , '&' , input_end - current_ptr );
829- if (!ampersand_ptr ) {
830- size_t tail_len = input_end - current_ptr ;
831- if (tail_len > 0 ) {
832- memcpy (output_ptr , current_ptr , tail_len );
833- output_ptr += tail_len ;
834- }
835- break ;
836- }
837-
838- /* Copy everything up to the found '&' */
839- size_t chunk_len = ampersand_ptr - current_ptr ;
840- if (chunk_len > 0 ) {
841- memcpy (output_ptr , current_ptr , chunk_len );
842- output_ptr += chunk_len ;
843- }
844-
845- /* Now current_ptr points to the '&' character. */
846- current_ptr = ampersand_ptr ;
847-
848- /* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */
849- if (current_ptr + 3 >= input_end ) {
850- * output_ptr ++ = * current_ptr ++ ;
851- continue ;
852- }
853-
854- unsigned code = 0 , code2 = 0 ;
855- const char * entity_end_ptr = NULL ;
856- int valid_entity = 1 ;
857-
858- if (current_ptr [1 ] == '#' ) {
859- /* Processing numeric entity */
860- const char * num_start = current_ptr + 2 ;
861- entity_end_ptr = num_start ;
862- if (process_numeric_entity (& entity_end_ptr , & code ) == FAILURE ) {
863- valid_entity = 0 ;
864- }
865- /* If we're in htmlspecialchars_decode, we're only decoding entities
866- * that represent &, <, >, " and '. Is this one of them? */
867- if (valid_entity && !all &&
868- (code > 63U ||
869- stage3_table_be_apos_00000 [code ].data .ent .entity == NULL ))
870- {
871- valid_entity = 0 ;
872- }
873- /* are we allowed to decode this entity in this document type?
874- * HTML 5 is the only that has a character that cannot be used in
875- * a numeric entity but is allowed literally (U+000D). The
876- * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
877- if (valid_entity && (!unicode_cp_is_allowed (code , doctype ) ||
878- (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D )))
879- {
880- valid_entity = 0 ;
881- }
882- } else {
883- /* Processing named entity */
884- const char * name_start = current_ptr + 1 ;
885- /* Search for ';' */
886- const char * semi_colon_ptr = memchr (name_start , ';' , LONGEST_ENTITY_LENGTH + 1 );
887- if (!semi_colon_ptr ) {
888- valid_entity = 0 ;
889- } else {
890- size_t name_len = semi_colon_ptr - name_start ;
891- if (name_len == 0 ) {
892- valid_entity = 0 ;
893- } else {
894- if (resolve_named_entity_html (name_start , name_len , inv_map , & code , & code2 ) == FAILURE ) {
895- if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
896- name_start [0 ] == 'a' && name_start [1 ] == 'p' &&
897- name_start [2 ] == 'o' && name_start [3 ] == 's' )
898- {
899- /* uses html4 inv_map, which doesn't include apos;. This is a
819+ const char * current_ptr = ZSTR_VAL (input );
820+ const char * input_end = current_ptr + input -> len ; /* terminator address */
821+ char * output_ptr = ZSTR_VAL (output );
822+ const int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
823+
824+ while (current_ptr < input_end ) {
825+ const char * ampersand_ptr = memchr (current_ptr , '&' , input_end - current_ptr );
826+ if (!ampersand_ptr ) {
827+ const size_t tail_len = input_end - current_ptr ;
828+ if (tail_len > 0 ) {
829+ memcpy (output_ptr , current_ptr , tail_len );
830+ output_ptr += tail_len ;
831+ }
832+ break ;
833+ }
834+
835+ /* Copy everything up to the found '&' */
836+ const size_t chunk_len = ampersand_ptr - current_ptr ;
837+ if (chunk_len > 0 ) {
838+ memcpy (output_ptr , current_ptr , chunk_len );
839+ output_ptr += chunk_len ;
840+ }
841+
842+ /* Now current_ptr points to the '&' character. */
843+ current_ptr = ampersand_ptr ;
844+
845+ /* If there are fewer than 4 bytes remaining, there isn't enough for an entity – copy the remaining bytes verbatim and stop scanning */
846+ if (input_end - current_ptr < 4 ){
847+ const size_t remaining = input_end - current_ptr ;
848+ memcpy (output_ptr , current_ptr , remaining );
849+ output_ptr += remaining ;
850+ break ;
851+ }
852+
853+ unsigned code = 0 , code2 = 0 ;
854+ const char * entity_end_ptr = NULL ;
855+ bool valid_entity = true;
856+
857+ if (current_ptr [1 ] == '#' ) {
858+ /* Processing numeric entity */
859+ const char * num_start = current_ptr + 2 ;
860+ entity_end_ptr = num_start ;
861+ if (process_numeric_entity (& entity_end_ptr , & code ) == FAILURE ) {
862+ valid_entity = false;
863+ }
864+ if (valid_entity && !all && (code > 63U || stage3_table_be_apos_00000 [code ].data .ent .entity == NULL )) {
865+ /* If we're in htmlspecialchars_decode, we're only decoding entities
866+ * that represent &, <, >, " and '. Is this one of them? */
867+ valid_entity = false;
868+ } else if (valid_entity && (!unicode_cp_is_allowed (code , doctype ) ||
869+ (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D ))) {
870+ /* are we allowed to decode this entity in this document type?
871+ * HTML 5 is the only one that has a character that cannot be used in
872+ * a numeric entity but is allowed literally (U+000D). The
873+ * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
874+ valid_entity = false;
875+ }
876+ } else {
877+ /* Processing named entity */
878+ const char * name_start = current_ptr + 1 ;
879+ /* Search for ';' */
880+ const size_t max_search_len = MIN (LONGEST_ENTITY_LENGTH + 1 , input_end - name_start );
881+ const char * semi_colon_ptr = memchr (name_start , ';' , max_search_len );
882+ if (!semi_colon_ptr ) {
883+ valid_entity = false;
884+ } else {
885+ const size_t name_len = semi_colon_ptr - name_start ;
886+ if (name_len == 0 ) {
887+ valid_entity = false;
888+ } else {
889+ if (resolve_named_entity_html (name_start , name_len , inv_map , & code , & code2 ) == FAILURE ) {
890+ if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
891+ name_start [0 ] == 'a' && name_start [1 ] == 'p' &&
892+ name_start [2 ] == 'o' && name_start [3 ] == 's' )
893+ {
894+ /* uses html4 inv_map, which doesn't include apos;. This is a
900895 * hack to support it */
901- code = (unsigned )'\'' ;
902- } else {
903- valid_entity = 0 ;
904- }
905- }
906- entity_end_ptr = semi_colon_ptr ;
907- }
908- }
909- }
910-
911- /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
912- if (!valid_entity || entity_end_ptr == NULL || * entity_end_ptr != ';' ) {
913- * output_ptr ++ = * current_ptr ++ ;
914- continue ;
915- }
916-
917- /* Check if quotes are allowed for entities representing ' or " */
918- if ( ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
919- (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE ) )))
920- {
921- valid_entity = 0 ;
922- }
923-
924- /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
896+ code = (unsigned )'\'' ;
897+ } else {
898+ valid_entity = false ;
899+ }
900+ }
901+ entity_end_ptr = semi_colon_ptr ;
902+ }
903+ }
904+ }
905+
906+ /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
907+ if (!valid_entity || entity_end_ptr == NULL || * entity_end_ptr != ';' ) {
908+ * output_ptr ++ = * current_ptr ++ ;
909+ continue ;
910+ }
911+
912+ /* Check if quotes are allowed for entities representing ' or " */
913+ if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
914+ (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE )))
915+ {
916+ valid_entity = false ;
917+ }
918+
919+ /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
925920 * the call is needed to ensure the codepoint <= U+00FF) */
926- if (valid_entity && charset != cs_utf_8 ) {
927- /* replace unicode code point */
928- if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
929- valid_entity = 0 ;
930- }
931-
932- if (valid_entity ) {
933- /* Write the parsed entity into the output buffer */
934- output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
935- if (code2 ) {
936- output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
937- }
938- /* Move current_ptr past the semicolon */
939- current_ptr = entity_end_ptr + 1 ;
940- } else {
941- /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
942- if (entity_end_ptr ) {
943- size_t len = entity_end_ptr - current_ptr ;
944- memcpy (output_ptr , current_ptr , len );
945- output_ptr += len ;
946- current_ptr = entity_end_ptr ;
947- } else {
948- * output_ptr ++ = * current_ptr ++ ;
949- }
950- }
951- }
952-
953- * output_ptr = '\0' ;
954- ZSTR_LEN (output ) = (size_t )(output_ptr - ZSTR_VAL (output ));
921+ if (valid_entity && charset != cs_utf_8 ) {
922+ /* replace unicode code point */
923+ if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
924+ valid_entity = false ;
925+ }
926+
927+ if (valid_entity ) {
928+ /* Write the parsed entity into the output buffer */
929+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
930+ if (code2 ) {
931+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
932+ }
933+ /* Move current_ptr past the semicolon */
934+ current_ptr = entity_end_ptr + 1 ;
935+ } else {
936+ /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
937+ if (entity_end_ptr ) {
938+ const size_t len = entity_end_ptr - current_ptr ;
939+ memcpy (output_ptr , current_ptr , len );
940+ output_ptr += len ;
941+ current_ptr = entity_end_ptr ;
942+ } else {
943+ * output_ptr ++ = * current_ptr ++ ;
944+ }
945+ }
946+ }
947+
948+ * output_ptr = '\0' ;
949+ ZSTR_LEN (output ) = (size_t )(output_ptr - ZSTR_VAL (output ));
955950}
956951/* }}} */
957952
@@ -1036,7 +1031,7 @@ PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int fl
10361031 inverse_map = unescape_inverse_map (all , flags );
10371032
10381033 /* replace numeric entities */
1039- traverse_for_entities (ZSTR_VAL ( str ), ZSTR_LEN ( str ) , ret , all , flags , inverse_map , charset );
1034+ traverse_for_entities (str , ret , all , flags , inverse_map , charset );
10401035
10411036 return ret ;
10421037}
0 commit comments