@@ -852,39 +852,38 @@ static void traverse_for_entities(
852852
853853 unsigned code = 0 , code2 = 0 ;
854854 const char * entity_end_ptr = NULL ;
855- bool valid_entity = true;
856855
857856 if (current_ptr [1 ] == '#' ) {
858857 /* Processing numeric entity */
859858 const char * num_start = current_ptr + 2 ;
860859 entity_end_ptr = num_start ;
861860 if (process_numeric_entity (& entity_end_ptr , & code ) == FAILURE ) {
862- valid_entity = false ;
861+ goto invalid_incomplete_entity ;
863862 }
864- if (valid_entity && !all && (code > 63U || stage3_table_be_apos_00000 [code ].data .ent .entity == NULL )) {
863+ if (!all && (code > 63U || stage3_table_be_apos_00000 [code ].data .ent .entity == NULL )) {
865864 /* If we're in htmlspecialchars_decode, we're only decoding entities
866865 * that represent &, <, >, " and '. Is this one of them? */
867- valid_entity = false ;
868- } else if (valid_entity && ( !unicode_cp_is_allowed (code , doctype ) ||
869- (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D ) )) {
866+ goto invalid_incomplete_entity ;
867+ } else if (!unicode_cp_is_allowed (code , doctype ) ||
868+ (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D )) {
870869 /* are we allowed to decode this entity in this document type?
871870 * HTML 5 is the only that has a character that cannot be used in
872871 * a numeric entity but is allowed literally (U+000D). The
873872 * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
874- valid_entity = false ;
873+ goto invalid_incomplete_entity ;
875874 }
876875 } else {
877- /* Processing named entity */
876+ /* Processing named entity */
878877 const char * name_start = current_ptr + 1 ;
879878 /* Search for ';' */
880879 const size_t max_search_len = MIN (LONGEST_ENTITY_LENGTH + 1 , input_end - name_start );
881880 const char * semi_colon_ptr = memchr (name_start , ';' , max_search_len );
882881 if (!semi_colon_ptr ) {
883- valid_entity = false ;
882+ goto invalid_incomplete_entity ;
884883 } else {
885884 const size_t name_len = semi_colon_ptr - name_start ;
886885 if (name_len == 0 ) {
887- valid_entity = false ;
886+ goto invalid_incomplete_entity ;
888887 } else {
889888 if (resolve_named_entity_html (name_start , name_len , inv_map , & code , & code2 ) == FAILURE ) {
890889 if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
@@ -895,7 +894,7 @@ static void traverse_for_entities(
895894 * hack to support it */
896895 code = (unsigned )'\'' ;
897896 } else {
898- valid_entity = false ;
897+ goto invalid_incomplete_entity ;
899898 }
900899 }
901900 entity_end_ptr = semi_colon_ptr ;
@@ -904,45 +903,51 @@ static void traverse_for_entities(
904903 }
905904
906905 /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
907- if (!valid_entity || entity_end_ptr == NULL || * entity_end_ptr != ';' ) {
908- * output_ptr ++ = * current_ptr ++ ;
909- continue ;
906+ if (entity_end_ptr == NULL ) {
907+ goto invalid_incomplete_entity ;
910908 }
911909
912910 /* Check if quotes are allowed for entities representing ' or " */
913911 if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
914912 (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE )))
915913 {
916- valid_entity = false ;
914+ goto invalid_complete_entity ;
917915 }
918916
919917 /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
920918 * the call is needed to ensure the codepoint <= U+00FF) */
921- if (valid_entity && charset != cs_utf_8 ) {
919+ if (charset != cs_utf_8 ) {
922920 /* replace unicode code point */
923- if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
924- valid_entity = false;
921+ if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 ) {
922+ goto invalid_complete_entity ;
923+ }
925924 }
926925
927- if (valid_entity ) {
928- /* Write the parsed entity into the output buffer */
929- output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
930- if (code2 ) {
931- output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
932- }
933- /* Move current_ptr past the semicolon */
934- current_ptr = entity_end_ptr + 1 ;
926+ /* Write the parsed entity into the output buffer */
927+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
928+ if (code2 ) {
929+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
930+ }
931+ /* Move current_ptr past the semicolon */
932+ current_ptr = entity_end_ptr + 1 ;
933+ continue ;
934+
935+ invalid_incomplete_entity :
936+ /* If the entity is invalid at parse stage or entity_end_ptr was never found, copy '&' as normal */
937+ * output_ptr ++ = * current_ptr ++ ;
938+ continue ;
939+
940+ invalid_complete_entity :
941+ /* If the entity became invalid after we found entity_end_ptr */
942+ if (entity_end_ptr ) {
943+ const size_t len = entity_end_ptr - current_ptr ;
944+ memcpy (output_ptr , current_ptr , len );
945+ output_ptr += len ;
946+ current_ptr = entity_end_ptr ;
935947 } else {
936- /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
937- if (entity_end_ptr ) {
938- const size_t len = entity_end_ptr - current_ptr ;
939- memcpy (output_ptr , current_ptr , len );
940- output_ptr += len ;
941- current_ptr = entity_end_ptr ;
942- } else {
943- * output_ptr ++ = * current_ptr ++ ;
944- }
948+ * output_ptr ++ = * current_ptr ++ ;
945949 }
950+ continue ;
946951 }
947952
948953 * output_ptr = '\0' ;
0 commit comments