@@ -822,3 +822,180 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8
822822 UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE );
823823 return retval ;
824824}
825+
826+ /**
827+ * Helper function used by utf8proc_isequal_normalized.
828+ * Reads and sorts the next sequence of combining characters.
829+ * If buf is not large enough, calculates minimum length by processing
830+ * the whole rest of the string instead of just the next combining characters.
831+ */
832+ static void utf8proc_decompose_next_chars (utf8proc_processing_state_t * state , const utf8proc_option_t options ,
833+ utf8proc_custom_func custom_func , void * custom_data
834+ ) {
835+ utf8proc_ssize_t buf_needed = 0 , buf_needed_max = 1 ;
836+ utf8proc_span32_t buf_remaining = state -> buf ;
837+ int last_boundclass = 0 ;
838+ state -> error = 0 ;
839+ while (state -> str .len > 0 ) {
840+ /* read a char from `state->str` and decompose it to `buf_remaining` */
841+ utf8proc_int32_t c ;
842+ utf8proc_ssize_t str_consumed , buf_consumed ;
843+ str_consumed = utf8proc_iterate (state -> str .ptr , state -> str .len , & c );
844+ if (str_consumed < 1 ) {
845+ /* error or end of string */
846+ state -> error = str_consumed ;
847+ return ;
848+ } else if (str_consumed > state -> str .len ) {
849+ /* string ends mid-way */
850+ state -> error = UTF8PROC_ERROR_INVALIDUTF8 ;
851+ return ;
852+ }
853+ if (custom_func ) {
854+ c = custom_func (c , custom_data );
855+ }
856+ /* successfully read from `state->str`, now time to decompose */
857+ if (c < 0x80 ) {
858+ /* fast path for common ASCII case */
859+ last_boundclass = 0 ;
860+ if (state -> error != 0 ) {
861+ /* just looking for the longest combining sequence, this isn't it */
862+ continue ;
863+ }
864+ if (buf_remaining .len_available < 1 ) {
865+ /* not enough space */
866+ buf_remaining .len_available = 0 ;
867+ state -> buf .ptr = buf_remaining .ptr = NULL ;
868+ state -> error = UTF8PROC_ERROR_NOMEM ;
869+ /* now just looking for the longest combining sequence, this isn't it */
870+ continue ;
871+ }
872+ /* success */
873+ buf_consumed = buf_needed = 1 ;
874+ if ((options & UTF8PROC_CASEFOLD ) && 0x41 <= c && c <= 0x5A ) {
875+ * buf_remaining .ptr = c + 0x20 ;
876+ } else {
877+ * buf_remaining .ptr = c ;
878+ }
879+ state -> str .ptr += str_consumed ;
880+ state -> str .len -= str_consumed ;
881+ buf_remaining .ptr += 1 ;
882+ buf_remaining .len_available -= 1 ;
883+ /* ASCII characters are all zero combining class */
884+ break ;
885+ } else {
886+ buf_consumed = utf8proc_decompose_char (c , buf_remaining .ptr , buf_remaining .len_available , options , & last_boundclass );
887+ if (buf_consumed < 0 ) {
888+ /* error */
889+ state -> error = buf_consumed ;
890+ return ;
891+ }
892+ buf_needed += buf_consumed ;
893+ if (state -> error == 0 && buf_consumed > buf_remaining .len_available ) {
894+ /* not enough space */
895+ buf_remaining .len_available = 0 ;
896+ state -> buf .ptr = buf_remaining .ptr = NULL ;
897+ state -> error = UTF8PROC_ERROR_NOMEM ;
898+ }
899+ }
900+ /* success */
901+ state -> str .ptr += str_consumed ;
902+ state -> str .len -= str_consumed ;
903+ if (buf_needed == 0 ) {
904+ /* ignorable sequence - skip and try next */
905+ continue ;
906+ }
907+ if (state -> error == 0 ) {
908+ buf_remaining .ptr += buf_consumed ;
909+ buf_remaining .len_available -= buf_consumed ;
910+ }
911+ /* decomposed chars must be sorted in ascending order of combining class,
912+ which means we need to keep fetching chars until we get to non-combining */
913+ if (buf_consumed == 0 || state -> str .len <= 0 || unsafe_get_property (c )-> combining_class == 0 ) {
914+ /* done decomposing this sequence */
915+ if (state -> error == 0 ) {
916+ /* time to finish up and optionally sort it */
917+ break ;
918+ }
919+ /* else we're trying to find the longest decomposed sequence */
920+ if (buf_needed > buf_needed_max ) {
921+ buf_needed_max = buf_needed ;
922+ }
923+ /* reset for next sequence */
924+ buf_needed = 0 ;
925+ }
926+ }
927+ if (state -> buf .ptr == NULL ) {
928+ state -> buf .len_used = buf_needed_max ;
929+ } else {
930+ state -> buf .len_used = buf_needed ;
931+ }
932+ if (buf_needed > 1 && state -> error == 0 && buf_needed <= state -> buf .len_available ) {
933+ /* sort by combining class (similar code is in utf8proc_decompose_custom implementation) */
934+ utf8proc_ssize_t pos = 0 ;
935+ const utf8proc_ssize_t second_to_last = buf_needed - 1 ;
936+ while (pos < second_to_last ) {
937+ utf8proc_int32_t uc1 , uc2 ;
938+ const utf8proc_property_t * property1 , * property2 ;
939+ uc1 = state -> buf .ptr [pos ];
940+ uc2 = state -> buf .ptr [pos + 1 ];
941+ property1 = unsafe_get_property (uc1 );
942+ property2 = unsafe_get_property (uc2 );
943+ if (property1 -> combining_class > property2 -> combining_class &&
944+ property2 -> combining_class > 0 ) {
945+ state -> buf .ptr [pos ] = uc2 ;
946+ state -> buf .ptr [pos + 1 ] = uc1 ;
947+ if (pos > 0 ) pos -- ; else pos ++ ;
948+ } else {
949+ pos ++ ;
950+ }
951+ }
952+ }
953+ }
954+
955+ static utf8proc_string8_view_t utf8proc_purify_strlen (utf8proc_string8_view_t str ) {
956+ if (str .len < 0 ) {
957+ if (str .ptr == NULL ) {
958+ str .len = 0 ;
959+ }
960+ else for (str .len = 0 ; str .ptr [str .len ] != '\0' ; ++ str .len ) { }
961+ }
962+ return str ;
963+ }
964+
965+ UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized (utf8proc_processing_state_t * a , utf8proc_processing_state_t * b , utf8proc_option_t options ,
966+ utf8proc_custom_func custom_func , void * custom_data
967+ ) {
968+ a -> str = utf8proc_purify_strlen (a -> str );
969+ b -> str = utf8proc_purify_strlen (b -> str );
970+ options = (utf8proc_option_t )((options & ~(unsigned int )UTF8PROC_COMPOSE )|UTF8PROC_DECOMPOSE );
971+ while (1 ) {
972+ const utf8proc_string8_view_t original_a = a -> str ;
973+ const utf8proc_string8_view_t original_b = b -> str ;
974+ if (a -> str .len == 0 || b -> str .len == 0 ) {
975+ /* end of string */
976+ return ;
977+ }
978+ utf8proc_decompose_next_chars (a , options , custom_func , custom_data );
979+ utf8proc_decompose_next_chars (b , options , custom_func , custom_data );
980+ if (a -> error == 0 && b -> error == 0 ) {
981+ utf8proc_ssize_t pos ;
982+ /* success - compare the work buffers for equality */
983+ if (a -> buf .len_used != b -> buf .len_used ) {
984+ /* mismatch found */
985+ return ;
986+ }
987+ for (pos = 0 ; pos < a -> buf .len_used ; ++ pos ) {
988+ if (a -> buf .ptr [pos ] != b -> buf .ptr [pos ]) {
989+ /* mismatch found */
990+ return ;
991+ }
992+ }
993+ /* equal so far */
994+ continue ;
995+ }
996+ /* error - restore unprocessed strings and exit */
997+ a -> str = original_a ;
998+ b -> str = original_b ;
999+ return ;
1000+ }
1001+ }
0 commit comments