@@ -822,3 +822,289 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8
822822 UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE );
823823 return retval ;
824824}
825+
/**
 * Walks two strings in lock-step and compares them as if both had been
 * decomposed and canonically ordered, without building normalized copies.
 *
 * a, b          - processing states; `str` is the view being read and is
 *                 advanced as matching input is accepted. `error` and
 *                 `str_at_error` are cleared on entry and filled in when
 *                 iteration or decomposition fails.
 * options       - decomposition options; UTF8PROC_COMPOSE, UTF8PROC_NULLTERM
 *                 and UTF8PROC_CHARBOUND are masked out and UTF8PROC_DECOMPOSE
 *                 is forced on before use.
 * a_custom_func / b_custom_func - optional (may be NULL) per-code-point
 *                 filters applied to each freshly read code point before it
 *                 is decomposed, called with the matching *_custom_data.
 *
 * The function returns nothing; every outcome (end of both strings, first
 * mismatch, error) is a plain return that leaves its result in the two
 * states. NOTE(review): presumably the caller decides equality by checking
 * `error` and whether both `str` views are exhausted - confirm against
 * the callers.
 */
826+ UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized_custom (utf8proc_processing_state_t * a , utf8proc_processing_state_t * b , utf8proc_option_t options ,
827+ utf8proc_custom_func a_custom_func , void * a_custom_data , utf8proc_custom_func b_custom_func , void * b_custom_data
828+ ) {
/* a non-negative len marks a length-delimited view; the flag (0 or 1) is later used as a multiplier so len is only decremented for such views */
829+ const utf8proc_bool a_len_terminated = (a -> str .len >= 0 );
830+ const utf8proc_bool b_len_terminated = (b -> str .len >= 0 );
831+ /* which source string(s) we need to read more from */
832+ utf8proc_bool a_consume = true;
833+ utf8proc_bool b_consume = true;
834+ /* results of utf8proc_iterate for each string */
835+ utf8proc_int32_t a_codepoint = 0 ;
836+ utf8proc_int32_t b_codepoint = 0 ;
837+ utf8proc_ssize_t a_consumed = 0 ;
838+ utf8proc_ssize_t b_consumed = 0 ;
839+ /* structure to simplify rollback for combining char multipass processing:
   decomposing_current is the live state, decomposing_combining_start is the
   snapshot taken when a combining sequence begins */
840+ const utf8proc_ssize_t decomposed_max_len = 8 ;
841+ struct {
842+ /* results of utf8proc_decompose_char for each string */
843+ utf8proc_int32_t a_decomposed [8 ];
844+ utf8proc_int32_t b_decomposed [8 ];
845+ utf8proc_ssize_t a_decomposed_len ;
846+ utf8proc_ssize_t b_decomposed_len ;
847+ int a_last_boundclass ;
848+ int b_last_boundclass ;
849+ /* index of the next unread entry in each decomposed buffer */
850+ utf8proc_ssize_t a_decomposed_pos ;
851+ utf8proc_ssize_t b_decomposed_pos ;
852+ } decomposing_current = {0 }, decomposing_combining_start = {0 };
853+ /* combining class tracking state for each string */
854+ utf8proc_propval_t a_combining_class = 0 ;
855+ utf8proc_propval_t b_combining_class = 0 ;
856+ utf8proc_ssize_t pos = 0 ;
/* true while a run of combining characters is being compared one class per pass */
857+ utf8proc_bool combining_initialized = false;
/* class compared in the current pass, and the class queued for the following pass (0 = none queued) */
858+ utf8proc_propval_t combining_class_current = 0 ;
859+ utf8proc_propval_t combining_class_next = 0 ;
/* string views saved at the start of the combining sequence, used for rollback */
860+ utf8proc_string8_view_t a_combining_start = a -> str ;
861+ utf8proc_string8_view_t b_combining_start = b -> str ;
/* bitset with one bit per combining class, set once that class has been compared */
862+ utf8proc_uint8_t combining_classes_finished [(UTF8PROC_COMBINING_CLASS_MAX + 1 + CHAR_BIT )/CHAR_BIT ] = {0 };
863+ const utf8proc_ssize_t combining_classes_finished_len = sizeof (combining_classes_finished )/sizeof (combining_classes_finished [0 ]);
864+ /* initialize/clear error state */
865+ a -> error = 0 ;
866+ b -> error = 0 ;
867+ a -> str_at_error .ptr = NULL ;
868+ b -> str_at_error .ptr = NULL ;
869+ a -> str_at_error .len = 0 ;
870+ b -> str_at_error .len = 0 ;
871+ /* force compatible options:
872+ - must use UTF8PROC_DECOMPOSE, not UTF8PROC_COMPOSE.
873+ - we choose when to add UTF8PROC_NULLTERM on a case-by-case basis (not needed currently).
874+ - can't use UTF8PROC_CHARBOUND because it would break `unsafe_get_property`. */
875+ options = (utf8proc_option_t )((options & ~(unsigned int )(UTF8PROC_COMPOSE |UTF8PROC_NULLTERM |UTF8PROC_CHARBOUND ))|UTF8PROC_DECOMPOSE );
876+ /* primary loop: each iteration pulls data from one or both strings */
877+ while (a_consume || b_consume || combining_initialized ) {
878+ /* read a code point from each - utf8proc_iterate handles null termination with negative length on its own */
879+ if (a_consume ) a_consumed = utf8proc_iterate (a -> str .ptr , a -> str .len , & a_codepoint );
880+ if (b_consume ) b_consumed = utf8proc_iterate (b -> str .ptr , b -> str .len , & b_codepoint );
881+ /* check for errors, roll back string views if needed */
882+ if (a_consumed < 0 ) {
883+ a -> error = a_consumed ;
884+ a -> str_at_error = a -> str ;
885+ }
886+ if (b_consumed < 0 ) {
887+ b -> error = b_consumed ;
888+ b -> str_at_error = b -> str ;
889+ }
890+ if (a -> error || b -> error ) {
/* str_at_error keeps the failing position; str itself goes back to the start of the combining sequence */
891+ if (combining_initialized ) {
892+ a -> str = a_combining_start ;
893+ b -> str = b_combining_start ;
894+ }
895+ return ;
896+ }
897+ /* if we reach the end of one string, we may still need to process more
898+ of the other due to ignorable sequences, and the combining class code
899+ needs to make a judgement upon reaching the end of a combining sequence.
900+ so from this point forward code must be guarded against this possibility. */
901+ if (!combining_initialized && a_consumed == 0 && b_consumed == 0 ) {
902+ /* true end of both strings, must be equal */
903+ return ;
904+ }
905+ /* apply each code point filter (only to a code point that was freshly read in this iteration) */
906+ if (a_custom_func && a_consume && a_consumed ) a_codepoint = a_custom_func (a_codepoint , a_custom_data );
907+ if (b_custom_func && b_consume && b_consumed ) b_codepoint = b_custom_func (b_codepoint , b_custom_data );
908+ /* ASCII fast path is only suitable if we consumed both at once and not in combining mode */
909+ if (!combining_initialized && a_consume && b_consume && a_consumed && b_consumed && a_codepoint < 0x80 && b_codepoint < 0x80 ) {
910+ /* fast path for common ASCII case */
911+ if (options & UTF8PROC_CASEFOLD ) {
/* map 'A'..'Z' (0x41..0x5A) onto 'a'..'z' */
912+ if (0x41 <= a_codepoint && a_codepoint <= 0x5A ) a_codepoint += 0x20 ;
913+ if (0x41 <= b_codepoint && b_codepoint <= 0x5A ) b_codepoint += 0x20 ;
914+ }
915+ if (a_codepoint != b_codepoint ) {
916+ /* mismatch detected */
917+ return ;
918+ }
919+ /* equal so far: advance both views past the code point just compared */
920+ a -> str .ptr += a_consumed ;
921+ a -> str .len -= a_consumed * a_len_terminated ;
922+ b -> str .ptr += b_consumed ;
923+ b -> str .len -= b_consumed * b_len_terminated ;
924+ a_consume = true;
925+ b_consume = true;
926+ continue ;
927+ }
928+ /* now time to decompose */
929+ #define UTF8PROC_LAMBDA (ab ) \
930+ if (ab##_consume && ab##_consumed) { \
931+ /* we got a code point, decompose it */ \
932+ decomposing_current .ab##_decomposed_len = utf8proc_decompose_char(ab##_codepoint, decomposing_current.ab##_decomposed, decomposed_max_len, options, &decomposing_current.ab##_last_boundclass); \
933+ decomposing_current.ab##_decomposed_pos = 0; \
934+ ab##_consume = false; \
935+ /* check for errors */ \
936+ if (decomposing_current .ab ##_decomposed_len < 0 ) { \
937+ ab -> error = decomposing_current .ab ##_decomposed_len ; \
938+ ab -> str_at_error = ab -> str ; \
939+ } else if (decomposing_current .ab ##_decomposed_len > decomposed_max_len ) { \
940+ /* should never happen in practice, just for static analysis. */ \
941+ ab -> error = UTF8PROC_ERROR_OVERFLOW ; \
942+ ab -> str_at_error = ab -> str ; \
943+ } else if (decomposing_current .ab ##_decomposed_len == 0 ) { \
944+ /* ignorable sequence, need to consume more */ \
945+ ab -> str .ptr += ab ##_consumed ; \
946+ ab -> str .len -= ab ##_consumed * ab ##_len_terminated ; \
947+ ab ##_consume = true; \
948+ } \
949+ } else { \
950+ ab ##_consume = false; \
951+ }
952+ /* run the above for both strings */
953+ UTF8PROC_LAMBDA (a );
954+ UTF8PROC_LAMBDA (b );
955+ #undef UTF8PROC_LAMBDA
956+ /* check for errors, roll back string views if needed */
957+ if (a -> error || b -> error ) {
958+ if (combining_initialized ) {
959+ a -> str = a_combining_start ;
960+ b -> str = b_combining_start ;
961+ }
962+ return ;
963+ }
964+ /* check for ignorable sequences: a consume flag still set here means that side decomposed to nothing and must read again */
965+ if (a_consume || b_consume ) {
966+ continue ;
967+ }
968+ /* now that ignorable sequences have been handled, check for end of either string */
969+ if (!combining_initialized && (a_consumed == 0 || b_consumed == 0 )) {
970+ /* one or both strings ended, either equal or unequal */
971+ return ;
972+ }
973+ /* at this point both decomposed buffers need to be compared. when the
974+ strings are fully normalized, the decomposed chars are sorted in
975+ order of combining class, which could mean having to sort the entire
976+ decomposed string in the worst case. since we only need to compare
977+ them as-if they are normalized, we can just go one combining class
978+ at a time. we have to be careful around ends of strings to make
979+ sure the string views are properly updated to NOT FURTHER THAN the
980+ first difference in the strings, which may be a large combining seq.
981+ */
982+ while (!a_consume && !b_consume ) {
983+ /* do we need to decompose more? a fully read buffer means the view can move past its source code point */
/* NOTE(review): when a string is exhausted (consumed == 0) while combining_initialized is true, its decomposed_pos stays >= decomposed_len, so more input is requested again on every pass; confirm the outer loop still terminates in that case. */
984+ if (decomposing_current .a_decomposed_pos >= decomposing_current .a_decomposed_len ) {
985+ a_consume = true;
986+ a -> str .ptr += a_consumed ;
987+ a -> str .len -= a_consumed * a_len_terminated ;
988+ }
989+ if (decomposing_current .b_decomposed_pos >= decomposing_current .b_decomposed_len ) {
990+ b_consume = true;
991+ b -> str .ptr += b_consumed ;
992+ b -> str .len -= b_consumed * b_len_terminated ;
993+ }
994+ if (a_consume || b_consume ) {
/* the inner loop condition is now false, so this falls back to the outer loop to read more input */
995+ continue ;
996+ }
997+ /* get the combining class of each current code point */
998+ a_combining_class = unsafe_get_property (decomposing_current .a_decomposed [decomposing_current .a_decomposed_pos ])-> combining_class ;
999+ b_combining_class = unsafe_get_property (decomposing_current .b_decomposed [decomposing_current .b_decomposed_pos ])-> combining_class ;
1000+ /* static analysis guards, always false in practice: the class must index inside the combining_classes_finished bitset */
1001+ if (a_combining_class /CHAR_BIT >= combining_classes_finished_len ) {
1002+ a -> error = UTF8PROC_ERROR_OVERFLOW ;
1003+ a -> str_at_error = a -> str ;
1004+ }
1005+ if (b_combining_class /CHAR_BIT >= combining_classes_finished_len ) {
1006+ b -> error = UTF8PROC_ERROR_OVERFLOW ;
1007+ b -> str_at_error = b -> str ;
1008+ }
1009+ if (a -> error || b -> error ) {
1010+ if (combining_initialized ) {
1011+ a -> str = a_combining_start ;
1012+ b -> str = b_combining_start ;
1013+ }
1014+ return ;
1015+ }
1016+ /* do either have a combining class of 0 (non-combining)? */
1017+ if (a_combining_class == 0 || b_combining_class == 0 ) {
1018+ if (combining_initialized ) {
1019+ /* we've reached the end of the combining sequence */
1020+ if (combining_class_next != 0 ) {
1021+ /* prepare for the next pass: flag the class just compared as finished so it is not queued again, then restart from the saved sequence start */
1022+ utf8proc_uint8_t * elem = & (combining_classes_finished [combining_class_current /CHAR_BIT ]);
1023+ const utf8proc_uint8_t mask = (utf8proc_uint8_t )(1 << (combining_class_current % CHAR_BIT ));
1024+ * elem |= mask ;
1025+ combining_class_current = combining_class_next ;
1026+ combining_class_next = 0 ;
1027+ a -> str = a_combining_start ;
1028+ b -> str = b_combining_start ;
1029+ decomposing_current = decomposing_combining_start ;
/* NOTE(review): setting both consume flags makes the next outer iteration decompose again from the rolled-back views, which resets *_decomposed_pos to 0; confirm this is intended given decomposing_current was just restored. */
1030+ a_consume = true;
1031+ b_consume = true;
1032+ continue ;
1033+ }
1034+ /* else exit combining mode and carry on as normal */
1035+ combining_initialized = false;
1036+ }
1037+ if (a_combining_class != b_combining_class ) {
1038+ /* mismatch detected */
1039+ return ;
1040+ }
1041+ if (decomposing_current .a_decomposed [decomposing_current .a_decomposed_pos ] != decomposing_current .b_decomposed [decomposing_current .b_decomposed_pos ]) {
1042+ /* mismatch detected */
1043+ return ;
1044+ }
1045+ /* equal so far */
1046+ ++ decomposing_current .a_decomposed_pos ;
1047+ ++ decomposing_current .b_decomposed_pos ;
1048+ continue ;
1049+ }
1050+ /* both nonzero combining class, initialize combining mode:
1051+ we go one combining class at a time, comparing the decomposed chars
1052+ of that class in order while consuming more from the input strings
1053+ as needed and noting the next class until we reach a non-combining
1054+ char. then, if there's another combining class, we roll back and
1055+ start from the beginning of the sequence again. */
1056+ if (!combining_initialized ) {
/* the first pass targets a's class; if b's class differs it is queued as the next pass */
1057+ combining_class_current = a_combining_class ;
1058+ combining_class_next = ((a_combining_class == b_combining_class )? 0 : b_combining_class );
/* snapshot the views and decomposition state so later passes and error paths can roll back */
1059+ a_combining_start = a -> str ;
1060+ b_combining_start = b -> str ;
1061+ decomposing_combining_start = decomposing_current ;
1062+ for (pos = 0 ; pos < combining_classes_finished_len ; ++ pos ) {
1063+ combining_classes_finished [pos ] = 0 ;
1064+ }
1065+ combining_initialized = true;
1066+ }
1067+ /* pull more data from one or both until we get both to be current class */
1068+ if (a_combining_class != combining_class_current ) {
1069+ /* is this an unseen class we can target next? */
1070+ if (combining_class_next == 0 ) {
1071+ const utf8proc_uint8_t elem = combining_classes_finished [a_combining_class /CHAR_BIT ];
1072+ const utf8proc_uint8_t mask = (utf8proc_uint8_t )(1 << (a_combining_class % CHAR_BIT ));
1073+ if ((elem & mask ) == 0 ) {
1074+ combining_class_next = a_combining_class ;
1075+ }
1076+ }
/* skip this entry during the current pass */
1077+ ++ decomposing_current .a_decomposed_pos ;
1078+ }
1079+ if (b_combining_class != combining_class_current ) {
1080+ /* is this an unseen class we can target next? */
1081+ if (combining_class_next == 0 ) {
1082+ const utf8proc_uint8_t elem = combining_classes_finished [b_combining_class /CHAR_BIT ];
1083+ const utf8proc_uint8_t mask = (utf8proc_uint8_t )(1 << (b_combining_class % CHAR_BIT ));
1084+ if ((elem & mask ) == 0 ) {
1085+ combining_class_next = b_combining_class ;
1086+ }
1087+ }
/* skip this entry during the current pass */
1088+ ++ decomposing_current .b_decomposed_pos ;
1089+ }
1090+ if (a_combining_class != combining_class_current || b_combining_class != combining_class_current ) {
/* at least one side was skipped above; re-evaluate with the new positions */
1091+ continue ;
1092+ }
1093+ /* both are the current combining class, compare the decomposed buffers */
1094+ if (decomposing_current .a_decomposed [decomposing_current .a_decomposed_pos ] != decomposing_current .b_decomposed [decomposing_current .b_decomposed_pos ]) {
1095+ /* mismatch detected, roll back string views and exit */
1096+ a -> str = a_combining_start ;
1097+ b -> str = b_combining_start ;
1098+ return ;
1099+ }
1100+ /* equal so far */
1101+ ++ decomposing_current .a_decomposed_pos ;
1102+ ++ decomposing_current .b_decomposed_pos ;
1103+ continue ;
1104+ }
1105+ }
1106+ }
1107+
/**
 * Convenience form of utf8proc_isequal_normalized_custom with no per-code-point
 * filters: forwards a, b and options unchanged and passes NULL for both custom
 * functions and both custom data pointers.
 */
1108+ UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized (utf8proc_processing_state_t * a , utf8proc_processing_state_t * b , utf8proc_option_t options ) {
1109+ utf8proc_isequal_normalized_custom (a , b , options , NULL , NULL , NULL , NULL );
1110+ }
0 commit comments