@@ -809,112 +809,148 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
809809/* Output buffer size for an input of oldlen bytes: oldlen plus one fifth of it, plus 2 (one byte for the remainder dropped by the integer division, probably unnecessary, and one for the terminating 0) */
810810#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE (oldlen ) ((oldlen) + (oldlen) / 5 + 2)
/* Copies the input string into the output buffer, replacing every entity that may be decoded (numeric or named, subject to `all`, `flags`, `inv_map` and `charset`) with its octet sequence and copying everything else verbatim; finally NUL-terminates the output and stores its length. */
811811static void traverse_for_entities (
812- const char * old ,
813- size_t oldlen ,
814- zend_string * ret , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) */
815- int all ,
816- int flags ,
812+ const zend_string * input ,
813+ zend_string * output , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(ZSTR_LEN(input)) */
814+ const int all ,
815+ const int flags ,
817816 const entity_ht * inv_map ,
818- enum entity_charset charset )
817+ const enum entity_charset charset )
819818{
820- const char * p ,
821- * lim ;
822- char * q ;
823- int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
824-
825- lim = old + oldlen ; /* terminator address */
826- assert (* lim == '\0' );
827-
828- for (p = old , q = ZSTR_VAL (ret ); p < lim ;) {
829- unsigned code , code2 = 0 ;
830- const char * next = NULL ; /* when set, next > p, otherwise possible inf loop */
831-
832- /* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
833- * ASCII range byte can be part of a multi-byte sequence.
834- * However, they start at 0x40, therefore if we find a 0x26 byte,
835- * we're sure it represents the '&' character. */
819+ const char * current_ptr = ZSTR_VAL (input );
820+ const char * input_end = current_ptr + ZSTR_LEN (input ); /* one past the last input byte (terminator address) */
821+ char * output_ptr = ZSTR_VAL (output );
822+ const int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
823+
824+ while (current_ptr < input_end ) {
825+ const char * ampersand_ptr = memchr (current_ptr , '&' , input_end - current_ptr );
826+ if (!ampersand_ptr ) {
827+ const size_t tail_len = input_end - current_ptr ;
828+ if (tail_len > 0 ) {
829+ memcpy (output_ptr , current_ptr , tail_len );
830+ output_ptr += tail_len ;
831+ }
832+ break ;
833+ }
836834
837- /* assumes there are no single-char entities */
838- if (p [0 ] != '&' || (p + 3 >= lim )) {
839- * (q ++ ) = * (p ++ );
840- continue ;
835+ /* Copy the literal run that precedes the found '&' verbatim */
836+ const size_t chunk_len = ampersand_ptr - current_ptr ;
837+ if (chunk_len > 0 ) {
838+ memcpy (output_ptr , current_ptr , chunk_len );
839+ output_ptr += chunk_len ;
841840 }
842841
843- /* now p[3] is surely valid and is no terminator */
844-
845- /* numerical entity */
846- if (p [1 ] == '#' ) {
847- next = & p [2 ];
848- if (process_numeric_entity (& next , & code ) == FAILURE )
849- goto invalid_code ;
850-
851- /* If we're in htmlspecialchars_decode, we're only decoding entities
852- * that represent &, <, >, " and '. Is this one of them? */
853- if (!all && (code > 63U ||
854- stage3_table_be_apos_00000 [code ].data .ent .entity == NULL ))
855- goto invalid_code ;
856-
857- /* are we allowed to decode this entity in this document type?
858- * HTML 5 is the only one that has a character that cannot be used in
859- * a numeric entity but is allowed literally (U+000D). The
860- * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
861- if (!unicode_cp_is_allowed (code , doctype ) ||
862- (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D ))
863- goto invalid_code ;
864- } else {
865- const char * start ;
866- size_t ent_len ;
842+ /* Now current_ptr points to the '&' character. */
843+ current_ptr = ampersand_ptr ;
867844
868- next = & p [1 ];
869- start = next ;
845+ /* If there are fewer than 4 bytes remaining, there isn't enough for an entity -
846+ * copy the rest of the input (starting at '&') verbatim and stop scanning. */
847+ if (input_end - current_ptr < 4 ) {
848+ const size_t remaining = input_end - current_ptr ;
849+ memcpy (output_ptr , current_ptr , remaining );
850+ output_ptr += remaining ;
851+ break ;
852+ }
870853
871- if ( process_named_entity_html ( & next , & start , & ent_len ) == FAILURE )
872- goto invalid_code ;
854+ unsigned code = 0 , code2 = 0 ;
855+ const char * entity_end_ptr = NULL ;
873856
874- if (resolve_named_entity_html (start , ent_len , inv_map , & code , & code2 ) == FAILURE ) {
875- if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start [0 ] == 'a'
876- && start [1 ] == 'p' && start [2 ] == 'o' && start [3 ] == 's' ) {
877- /* uses html4 inv_map, which doesn't include apos;. This is a
878- * hack to support it */
879- code = (unsigned ) '\'' ;
857+ if (current_ptr [1 ] == '#' ) {
858+ /* Processing numeric entity */
859+ const char * num_start = current_ptr + 2 ;
860+ entity_end_ptr = num_start ;
861+ if (process_numeric_entity (& entity_end_ptr , & code ) == FAILURE ) {
862+ goto invalid_incomplete_entity ;
863+ }
864+ if (!all && (code > 63U || stage3_table_be_apos_00000 [code ].data .ent .entity == NULL )) {
865+ /* If we're in htmlspecialchars_decode, we're only decoding entities
866+ * that represent &, <, >, " and '. Is this one of them? */
867+ goto invalid_incomplete_entity ;
868+ } else if (!unicode_cp_is_allowed (code , doctype ) ||
869+ (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D )) {
870+ /* are we allowed to decode this entity in this document type?
871+ * HTML 5 is the only one that has a character that cannot be used in
872+ * a numeric entity but is allowed literally (U+000D). The
873+ * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
874+ goto invalid_incomplete_entity ;
875+ }
876+ } else {
877+ /* Processing named entity */
878+ const char * name_start = current_ptr + 1 ;
879+ /* Search for the terminating ';' within a bounded window after the '&' */
880+ const size_t max_search_len = MIN (LONGEST_ENTITY_LENGTH + 1 , input_end - name_start );
881+ const char * semi_colon_ptr = memchr (name_start , ';' , max_search_len );
882+ if (!semi_colon_ptr ) {
883+ goto invalid_incomplete_entity ;
884+ } else {
885+ const size_t name_len = semi_colon_ptr - name_start ;
886+ if (name_len == 0 ) {
887+ goto invalid_incomplete_entity ;
880888 } else {
881- goto invalid_code ;
889+ if (resolve_named_entity_html (name_start , name_len , inv_map , & code , & code2 ) == FAILURE ) {
890+ if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
891+ name_start [0 ] == 'a' && name_start [1 ] == 'p' &&
892+ name_start [2 ] == 'o' && name_start [3 ] == 's' )
893+ {
894+ /* uses html4 inv_map, which doesn't include apos;. This is a
895+ * hack to support it */
896+ code = (unsigned )'\'' ;
897+ } else {
898+ goto invalid_incomplete_entity ;
899+ }
900+ }
901+ entity_end_ptr = semi_colon_ptr ;
882902 }
883903 }
884904 }
885905
886- assert (* next == ';' );
906+ /* entity_end_ptr is always set by this point: every failing path above jumps to invalid_incomplete_entity. */
907+ ZEND_ASSERT (entity_end_ptr != NULL );
887908
888- if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
889- (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE )))
890- /* && code2 == '\0' always true for current maps */ )
891- goto invalid_code ;
909+ /* Check if quotes are allowed for entities representing ' or " */
910+ if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
911+ (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE )))
912+ {
913+ goto invalid_complete_entity ;
914+ }
892915
893916 /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
894917 * the call is needed to ensure the codepoint <= U+00FF) */
895918 if (charset != cs_utf_8 ) {
896919 /* replace unicode code point */
897- if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
898- goto invalid_code ; /* not representable in target charset */
920+ if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 ) {
921+ goto invalid_complete_entity ;
922+ }
899923 }
900924
901- q += write_octet_sequence ((unsigned char * )q , charset , code );
925+ /* Write the parsed entity into the output buffer */
926+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
902927 if (code2 ) {
903- q += write_octet_sequence ((unsigned char * )q , charset , code2 );
928+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
904929 }
930+ /* Move current_ptr past the semicolon */
931+ current_ptr = entity_end_ptr + 1 ;
932+ continue ;
905933
906- /* jump over the valid entity; may go beyond size of buffer; np */
907- p = next + 1 ;
934+ invalid_incomplete_entity :
935+ /* Parsing failed or no usable entity end was found: emit the '&' literally and resume scanning at the next byte */
936+ * output_ptr ++ = * current_ptr ++ ;
908937 continue ;
909938
910- invalid_code :
911- for (; p < next ; p ++ ) {
912- * (q ++ ) = * p ;
939+ invalid_complete_entity :
940+ /* The entity parsed but must not be decoded (quote flag or charset mapping): copy it verbatim up to, but not including, entity_end_ptr; scanning resumes there, so the following bytes are copied as ordinary text */
941+ if (entity_end_ptr ) {
942+ const size_t len = entity_end_ptr - current_ptr ;
943+ memcpy (output_ptr , current_ptr , len );
944+ output_ptr += len ;
945+ current_ptr = entity_end_ptr ;
946+ } else { /* defensive fallback; the ZEND_ASSERT above expects entity_end_ptr to be set */
947+ * output_ptr ++ = * current_ptr ++ ;
913948 }
949+ continue ;
914950 }
915951
916- * q = '\0' ;
917- ZSTR_LEN (ret ) = (size_t )(q - ZSTR_VAL (ret ));
952+ * output_ptr = '\0' ;
953+ ZSTR_LEN (output ) = (size_t )(output_ptr - ZSTR_VAL (output ));
918954}
919955/* }}} */
920956
@@ -999,7 +1035,7 @@ PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int fl
9991035 inverse_map = unescape_inverse_map (all , flags );
10001036
10011037 /* replace numeric entities */
1002- traverse_for_entities (ZSTR_VAL ( str ), ZSTR_LEN ( str ) , ret , all , flags , inverse_map , charset );
1038+ traverse_for_entities (str , ret , all , flags , inverse_map , charset );
10031039
10041040 return ret ;
10051041}
0 commit comments