@@ -809,112 +809,149 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
/* Size to allocate for the output buffer handed to traverse_for_entities()
 * when decoding an input of oldlen bytes: oldlen plus one fifth of oldlen,
 * plus 2 -- one byte for the remainder discarded by the integer division
 * (probably unnecessary) and one byte for the terminating 0.
 * The argument is evaluated twice; do not pass expressions with side effects. */
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
811811static void traverse_for_entities (
812- const char * old ,
813- size_t oldlen ,
814- zend_string * ret , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
812+ const char * input ,
813+ size_t input_len ,
814+ zend_string * output , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
815815 int all ,
816816 int flags ,
817817 const entity_ht * inv_map ,
818818 enum entity_charset charset )
819819{
820- const char * p ,
821- * lim ;
822- char * q ;
823- int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
824-
825- lim = old + oldlen ; /* terminator address */
826- assert (* lim == '\0' );
827-
828- for (p = old , q = ZSTR_VAL (ret ); p < lim ;) {
829- unsigned code , code2 = 0 ;
830- const char * next = NULL ; /* when set, next > p, otherwise possible inf loop */
831-
832- /* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
833- * ASCII range byte can be part of a multi-byte sequence.
834- * However, they start at 0x40, therefore if we find a 0x26 byte,
835- * we're sure it represents the '&' character. */
836-
837- /* assumes there are no single-char entities */
838- if (p [0 ] != '&' || (p + 3 >= lim )) {
839- * (q ++ ) = * (p ++ );
840- continue ;
841- }
842-
843- /* now p[3] is surely valid and is no terminator */
844-
845- /* numerical entity */
846- if (p [1 ] == '#' ) {
847- next = & p [2 ];
848- if (process_numeric_entity (& next , & code ) == FAILURE )
849- goto invalid_code ;
850-
851- /* If we're in htmlspecialchars_decode, we're only decoding entities
820+ const char * current_ptr = input ;
821+ const char * input_end = input + input_len ; /* terminator address */
822+ char * output_ptr = ZSTR_VAL (output );
823+ int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
824+
825+ assert (* input_end == '\0' );
826+
827+ while (current_ptr < input_end ) {
828+ const char * ampersand_ptr = memchr (current_ptr , '&' , input_end - current_ptr );
829+ if (!ampersand_ptr ) {
830+ size_t tail_len = input_end - current_ptr ;
831+ if (tail_len > 0 ) {
832+ memcpy (output_ptr , current_ptr , tail_len );
833+ output_ptr += tail_len ;
834+ }
835+ break ;
836+ }
837+
838+ /* Copy everything up to the found '&' */
839+ size_t chunk_len = ampersand_ptr - current_ptr ;
840+ if (chunk_len > 0 ) {
841+ memcpy (output_ptr , current_ptr , chunk_len );
842+ output_ptr += chunk_len ;
843+ }
844+
845+ /* Now current_ptr points to the '&' character. */
846+ current_ptr = ampersand_ptr ;
847+
848+ /* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */
849+ if (current_ptr + 3 >= input_end ) {
850+ * output_ptr ++ = * current_ptr ++ ;
851+ continue ;
852+ }
853+
854+ unsigned code = 0 , code2 = 0 ;
855+ const char * entity_end_ptr = NULL ;
856+ int valid_entity = 1 ;
857+
858+ if (current_ptr [1 ] == '#' ) {
859+ /* Processing numeric entity */
860+ const char * num_start = current_ptr + 2 ;
861+ entity_end_ptr = num_start ;
862+ if (process_numeric_entity (& entity_end_ptr , & code ) == FAILURE ) {
863+ valid_entity = 0 ;
864+ }
865+ /* If we're in htmlspecialchars_decode, we're only decoding entities
852866 * that represent &, <, >, " and '. Is this one of them? */
853- if (!all && (code > 63U ||
854- stage3_table_be_apos_00000 [code ].data .ent .entity == NULL ))
855- goto invalid_code ;
856-
857- /* are we allowed to decode this entity in this document type?
867+ if (valid_entity && !all &&
868+ (code > 63U ||
869+ stage3_table_be_apos_00000 [code ].data .ent .entity == NULL ))
870+ {
871+ valid_entity = 0 ;
872+ }
873+ /* are we allowed to decode this entity in this document type?
858874 * HTML 5 is the only that has a character that cannot be used in
859875 * a numeric entity but is allowed literally (U+000D). The
860876 * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
861- if (!unicode_cp_is_allowed (code , doctype ) ||
862- (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D ))
863- goto invalid_code ;
864- } else {
865- const char * start ;
866- size_t ent_len ;
867-
868- next = & p [1 ];
869- start = next ;
870-
871- if (process_named_entity_html (& next , & start , & ent_len ) == FAILURE )
872- goto invalid_code ;
873-
874- if (resolve_named_entity_html (start , ent_len , inv_map , & code , & code2 ) == FAILURE ) {
875- if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start [0 ] == 'a'
876- && start [1 ] == 'p' && start [2 ] == 'o' && start [3 ] == 's' ) {
877- /* uses html4 inv_map, which doesn't include apos;. This is a
878- * hack to support it */
879- code = (unsigned ) '\'' ;
880- } else {
881- goto invalid_code ;
882- }
883- }
884- }
885-
886- assert (* next == ';' );
887-
888- if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
889- (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE )))
890- /* && code2 == '\0' always true for current maps */ )
891- goto invalid_code ;
892-
893- /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
877+ if (valid_entity && (!unicode_cp_is_allowed (code , doctype ) ||
878+ (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D )))
879+ {
880+ valid_entity = 0 ;
881+ }
882+ } else {
883+ /* Processing named entity */
884+ const char * name_start = current_ptr + 1 ;
885+ /* Search for ';' */
886+ const char * semi_colon_ptr = memchr (name_start , ';' , LONGEST_ENTITY_LENGTH + 1 );
887+ if (!semi_colon_ptr ) {
888+ valid_entity = 0 ;
889+ } else {
890+ size_t name_len = semi_colon_ptr - name_start ;
891+ if (name_len == 0 ) {
892+ valid_entity = 0 ;
893+ } else {
894+ if (resolve_named_entity_html (name_start , name_len , inv_map , & code , & code2 ) == FAILURE ) {
895+ if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
896+ name_start [0 ] == 'a' && name_start [1 ] == 'p' &&
897+ name_start [2 ] == 'o' && name_start [3 ] == 's' )
898+ {
899+ /* uses html4 inv_map, which doesn't include apos;. This is a
900+ * hack to support it */
901+ code = (unsigned )'\'' ;
902+ } else {
903+ valid_entity = 0 ;
904+ }
905+ }
906+ entity_end_ptr = semi_colon_ptr ;
907+ }
908+ }
909+ }
910+
911+ /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
912+ if (!valid_entity || entity_end_ptr == NULL || * entity_end_ptr != ';' ) {
913+ * output_ptr ++ = * current_ptr ++ ;
914+ continue ;
915+ }
916+
917+ /* Check if quotes are allowed for entities representing ' or " */
918+ if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
919+ (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE ))))
920+ {
921+ valid_entity = 0 ;
922+ }
923+
924+ /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
894925 * the call is needed to ensure the codepoint <= U+00FF) */
895- if (charset != cs_utf_8 ) {
896- /* replace unicode code point */
897- if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
898- goto invalid_code ; /* not representable in target charset */
899- }
900-
901- q += write_octet_sequence ((unsigned char * )q , charset , code );
902- if (code2 ) {
903- q += write_octet_sequence ((unsigned char * )q , charset , code2 );
904- }
905-
906- /* jump over the valid entity; may go beyond size of buffer; np */
907- p = next + 1 ;
908- continue ;
909-
910- invalid_code :
911- for (; p < next ; p ++ ) {
912- * (q ++ ) = * p ;
913- }
914- }
915-
916- * q = '\0' ;
917- ZSTR_LEN (ret ) = (size_t )(q - ZSTR_VAL (ret ));
926+ if (valid_entity && charset != cs_utf_8 ) {
927+ /* replace unicode code point */
928+ if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
929+ valid_entity = 0 ;
930+ }
931+
932+ if (valid_entity ) {
933+ /* Write the parsed entity into the output buffer */
934+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
935+ if (code2 ) {
936+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
937+ }
938+ /* Move current_ptr past the semicolon */
939+ current_ptr = entity_end_ptr + 1 ;
940+ } else {
941+ /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
942+ if (entity_end_ptr ) {
943+ size_t len = entity_end_ptr - current_ptr ;
944+ memcpy (output_ptr , current_ptr , len );
945+ output_ptr += len ;
946+ current_ptr = entity_end_ptr ;
947+ } else {
948+ * output_ptr ++ = * current_ptr ++ ;
949+ }
950+ }
951+ }
952+
953+ * output_ptr = '\0' ;
954+ ZSTR_LEN (output ) = (size_t )(output_ptr - ZSTR_VAL (output ));
918955}
919956/* }}} */
920957
0 commit comments