@@ -822,3 +822,327 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8
822822 UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE );
823823 return retval ;
824824}
825+
/*
 * Compare two UTF-8 strings as if both had been decomposed/normalized with
 * `options`, without materializing the normalized strings.
 *
 * a, b           - processing states; `str` is the view still to be compared
 *                  and is advanced in place as code points match. A negative
 *                  `str.len` selects the "not length-terminated" mode, in
 *                  which `str.len` is never decremented.
 * options        - normalization options; COMPOSE, NULLTERM and CHARBOUND are
 *                  cleared and DECOMPOSE is forced before use.
 * a_custom_func,
 * b_custom_func  - optional per-string code point filters (may be NULL), each
 *                  called with its own `*_custom_data` on every freshly read
 *                  code point before decomposition.
 *
 * The function returns nothing; the outcome is left in *a and *b:
 *  - `error` / `str_at_error` are cleared on entry and set when iteration or
 *    decomposition fails (`str_at_error` is the view at the failing position).
 *  - on a mismatch or error inside a combining sequence, `str` of both states
 *    is rolled back to the start of that sequence; otherwise `str` stays at
 *    the position that was being examined when the function returned.
 * NOTE(review): callers presumably decide equality by checking that both
 * views are exhausted and no error is set - confirm against the callers.
 */
826+ UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized_custom (utf8proc_processing_state_t * a , utf8proc_processing_state_t * b , utf8proc_option_t options ,
827+ utf8proc_custom_func a_custom_func , void * a_custom_data , utf8proc_custom_func b_custom_func , void * b_custom_data
828+ ) {
/* 1 when the string has an explicit (non-negative) length, 0 otherwise; used
   below as a multiplier so that `len` is only decremented in the former case */
829+ const utf8proc_bool a_len_terminated = (a -> str .len >= 0 );
830+ const utf8proc_bool b_len_terminated = (b -> str .len >= 0 );
831+ /* which source string(s) we need to read more from */
832+ utf8proc_bool a_consume = true;
833+ utf8proc_bool b_consume = true;
834+ /* structure to simplify rollback for combining char multipass processing */
/* must stay in sync with the size of the `decomposed` array below */
835+ const utf8proc_ssize_t decomposed_max_len = 8 ;
836+ struct {
837+ /* results of utf8proc_iterate */
838+ utf8proc_int32_t codepoint ;
839+ utf8proc_ssize_t consumed ;
840+ /* results of utf8proc_decompose_char */
841+ utf8proc_int32_t decomposed [8 ];
842+ utf8proc_ssize_t decomposed_len ;
843+ int last_boundclass ;
844+ /* combining class tracking state */
845+ utf8proc_ssize_t decomposed_pos ;
846+ utf8proc_propval_t combining_class ;
847+ } a_decomposing_current = {0 },
848+ b_decomposing_current = {0 },
849+ a_decomposing_combining_start = {0 },
850+ b_decomposing_combining_start = {0 },
851+ a_decomposing_combining_end = {0 },
852+ b_decomposing_combining_end = {0 };
/* *_current is the code point being compared right now; *_combining_start and
   *_combining_end are snapshots taken at the start of a combining sequence and
   at the point where a combining class of 0 was seen, respectively */
853+ /* combining class tracking state */
854+ utf8proc_ssize_t pos = 0 ;
855+ utf8proc_bool combining_initialized = false;
856+ utf8proc_propval_t combining_class_current = 0 ;
857+ utf8proc_propval_t combining_class_next = 0 ;
858+ utf8proc_string8_view_t a_combining_start = a -> str ;
859+ utf8proc_string8_view_t b_combining_start = b -> str ;
860+ utf8proc_string8_view_t a_combining_end = a -> str ;
861+ utf8proc_string8_view_t b_combining_end = b -> str ;
/* bitset with one bit per combining class, set once a pass over that class
   has completed for the current combining sequence.
   NOTE(review): the size expression rounds up by CHAR_BIT rather than
   CHAR_BIT - 1, so it may allocate one byte more than strictly needed. */
862+ utf8proc_uint8_t combining_classes_finished [(UTF8PROC_COMBINING_CLASS_MAX + 1 + CHAR_BIT )/CHAR_BIT ] = {0 };
863+ const utf8proc_ssize_t combining_classes_finished_len = sizeof (combining_classes_finished )/sizeof (combining_classes_finished [0 ]);
864+ /* initialize/clear error state */
865+ a -> error = 0 ;
866+ b -> error = 0 ;
867+ a -> str_at_error .ptr = NULL ;
868+ b -> str_at_error .ptr = NULL ;
869+ a -> str_at_error .len = 0 ;
870+ b -> str_at_error .len = 0 ;
871+ /* force compatible options:
872+ - must use UTF8PROC_DECOMPOSE, not UTF8PROC_COMPOSE.
873+ - we choose when to add UTF8PROC_NULLTERM on a case-by-case basis (not needed currently).
874+ - can't use UTF8PROC_CHARBOUND because it would break `unsafe_get_property`. */
875+ options = (utf8proc_option_t )((options & ~(unsigned int )(UTF8PROC_COMPOSE |UTF8PROC_NULLTERM |UTF8PROC_CHARBOUND ))|UTF8PROC_DECOMPOSE );
876+ /* primary loop: each iteration pulls data from one or both strings */
877+ while (1 ) {
878+ /* read a code point from each - utf8proc_iterate handles null termination with negative length on its own */
/* NOTE(review): end-of-string detection below relies on `consumed == 0`;
   confirm that utf8proc_iterate in this tree returns 0 at a NUL terminator
   when the length is negative. */
879+ if (a_consume ) a_decomposing_current .consumed = utf8proc_iterate (a -> str .ptr , a -> str .len , & a_decomposing_current .codepoint );
880+ if (b_consume ) b_decomposing_current .consumed = utf8proc_iterate (b -> str .ptr , b -> str .len , & b_decomposing_current .codepoint );
881+ /* check for errors, roll back string views if needed */
882+ if (a_decomposing_current .consumed < 0 ) {
883+ a -> error = a_decomposing_current .consumed ;
884+ a -> str_at_error = a -> str ;
885+ }
886+ if (b_decomposing_current .consumed < 0 ) {
887+ b -> error = b_decomposing_current .consumed ;
888+ b -> str_at_error = b -> str ;
889+ }
890+ if (a -> error || b -> error ) {
/* str_at_error keeps the failing position; str itself goes back to the
   start of the combining sequence so it never points past a difference */
891+ if (combining_initialized ) {
892+ a -> str = a_combining_start ;
893+ b -> str = b_combining_start ;
894+ }
895+ return ;
896+ }
897+ /* if we reach the end of one string, we may still need to process more
898+ of the other due to ignorable sequences, and the combining class code
899+ needs to make a judgement upon reaching the end of a combining sequence.
900+ so from this point forward code must be guarded against this possibility. */
901+ if (!combining_initialized && a_decomposing_current .consumed == 0 && b_decomposing_current .consumed == 0 ) {
902+ /* true end of both strings, must be equal */
903+ return ;
904+ }
905+ /* apply each code point filter */
/* only freshly read code points are filtered (consume flag set and a
   non-zero number of bytes consumed) */
906+ if (a_custom_func && a_consume && a_decomposing_current .consumed ) a_decomposing_current .codepoint = a_custom_func (a_decomposing_current .codepoint , a_custom_data );
907+ if (b_custom_func && b_consume && b_decomposing_current .consumed ) b_decomposing_current .codepoint = b_custom_func (b_decomposing_current .codepoint , b_custom_data );
908+ /* ASCII fast path is only suitable if we consumed both at once and not in combining mode */
909+ if (!combining_initialized && a_consume && b_consume && a_decomposing_current .consumed && b_decomposing_current .consumed
910+ && a_decomposing_current .codepoint < 0x80 && b_decomposing_current .codepoint < 0x80 ) {
911+ /* fast path for common ASCII case */
912+ if (options & UTF8PROC_CASEFOLD ) {
/* map 'A'..'Z' (0x41..0x5A) to 'a'..'z' */
913+ if (0x41 <= a_decomposing_current .codepoint && a_decomposing_current .codepoint <= 0x5A ) a_decomposing_current .codepoint += 0x20 ;
914+ if (0x41 <= b_decomposing_current .codepoint && b_decomposing_current .codepoint <= 0x5A ) b_decomposing_current .codepoint += 0x20 ;
915+ }
916+ if (a_decomposing_current .codepoint != b_decomposing_current .codepoint ) {
917+ /* mismatch detected */
/* views are left pointing at the differing code points */
918+ return ;
919+ }
920+ /* equal so far */
921+ a -> str .ptr += a_decomposing_current .consumed ;
922+ a -> str .len -= a_decomposing_current .consumed * a_len_terminated ;
923+ b -> str .ptr += b_decomposing_current .consumed ;
924+ b -> str .len -= b_decomposing_current .consumed * b_len_terminated ;
925+ a_consume = true;
926+ b_consume = true;
927+ continue ;
928+ }
929+ /* now time to decompose */
/* the macro below is expanded once for `a` and once for `b`: it decomposes a
   freshly read code point into the `decomposed` buffer, records errors, and
   re-arms the consume flag when the code point decomposes to nothing */
930+ #define UTF8PROC_LAMBDA (ab ) \
931+ if (ab##_consume && ab##_decomposing_current.consumed) { \
932+ /* we got a code point, decompose it */ \
933+ ab ##_decomposing_current.decomposed_len = utf8proc_decompose_char(ab##_decomposing_current.codepoint, \
934+ ab##_decomposing_current.decomposed, decomposed_max_len, options, &ab##_decomposing_current.last_boundclass); \
935+ ab##_decomposing_current.decomposed_pos = 0; \
936+ ab##_consume = false; \
937+ /* check for errors */ \
938+ if (ab ##_decomposing_current .decomposed_len < 0 ) { \
939+ ab -> error = ab ##_decomposing_current .decomposed_len ; \
940+ ab -> str_at_error = ab -> str ; \
941+ } else if (ab ##_decomposing_current .decomposed_len > decomposed_max_len ) { \
942+ /* should never happen in practice, just for static analysis. */ \
943+ ab -> error = UTF8PROC_ERROR_OVERFLOW ; \
944+ ab -> str_at_error = ab -> str ; \
945+ } else if (ab ##_decomposing_current .decomposed_len == 0 ) { \
946+ /* ignorable sequence, need to consume more */ \
947+ ab -> str .ptr += ab ##_decomposing_current .consumed ; \
948+ ab -> str .len -= ab ##_decomposing_current .consumed * ab ##_len_terminated ; \
949+ ab ##_consume = true; \
950+ } \
951+ } else { \
952+ ab ##_consume = false; \
953+ }
954+ /* run the above for both strings */
955+ UTF8PROC_LAMBDA (a );
956+ UTF8PROC_LAMBDA (b );
957+ #undef UTF8PROC_LAMBDA
958+ /* check for errors, roll back string views if needed */
959+ if (a -> error || b -> error ) {
960+ if (combining_initialized ) {
961+ a -> str = a_combining_start ;
962+ b -> str = b_combining_start ;
963+ }
964+ return ;
965+ }
966+ /* check for ignorable sequences */
967+ if (a_consume || b_consume ) {
968+ continue ;
969+ }
970+ /* now that ignorable sequences have been handled, check for end of either string */
971+ if (!combining_initialized && (a_decomposing_current .consumed == 0 || b_decomposing_current .consumed == 0 )) {
972+ /* one or both strings ended, either equal or unequal */
973+ return ;
974+ }
975+ /* at this point both decomposed buffers need to be compared. when the
976+ strings are fully normalized, the decomposed chars are sorted in
977+ order of combining class, which could mean having to sort the entire
978+ decomposed string in the worst case. since we only need to compare
979+ them as-if they are normalized, we can just go one combining class
980+ at a time. we have to be careful around ends of strings to make
981+ sure the string views are properly updated to NOT FURTHER THAN the
982+ first difference in the strings, which may be a large combining seq.
983+ */
984+ while (1 ) {
985+ /* do we need to decompose more? */
/* a buffer that has been fully walked means the source code point is done:
   advance the view past it and ask the outer loop for the next one */
986+ if (a_decomposing_current .consumed && a_decomposing_current .decomposed_pos >= a_decomposing_current .decomposed_len ) {
987+ a_consume = true;
988+ a -> str .ptr += a_decomposing_current .consumed ;
989+ a -> str .len -= a_decomposing_current .consumed * a_len_terminated ;
990+ }
991+ if (b_decomposing_current .consumed && b_decomposing_current .decomposed_pos >= b_decomposing_current .decomposed_len ) {
992+ b_consume = true;
993+ b -> str .ptr += b_decomposing_current .consumed ;
994+ b -> str .len -= b_decomposing_current .consumed * b_len_terminated ;
995+ }
996+ if (a_consume || b_consume ) {
997+ /* use outer loop to pull more data */
998+ break ;
999+ }
1000+ /* get the combining class of each current code point, or 0 for end of string */
1001+ if (a_decomposing_current .consumed ) {
1002+ a_decomposing_current .combining_class = unsafe_get_property (a_decomposing_current .decomposed [a_decomposing_current .decomposed_pos ])-> combining_class ;
1003+ } else {
1004+ a_decomposing_current .combining_class = 0 ;
1005+ }
1006+ if (b_decomposing_current .consumed ) {
1007+ b_decomposing_current .combining_class = unsafe_get_property (b_decomposing_current .decomposed [b_decomposing_current .decomposed_pos ])-> combining_class ;
1008+ } else {
1009+ b_decomposing_current .combining_class = 0 ;
1010+ }
1011+ /* static analysis guards, always false in practice */
/* these bound the index later used into combining_classes_finished */
1012+ if (a_decomposing_current .combining_class /CHAR_BIT >= combining_classes_finished_len ) {
1013+ a -> error = UTF8PROC_ERROR_OVERFLOW ;
1014+ a -> str_at_error = a -> str ;
1015+ }
1016+ if (b_decomposing_current .combining_class /CHAR_BIT >= combining_classes_finished_len ) {
1017+ b -> error = UTF8PROC_ERROR_OVERFLOW ;
1018+ b -> str_at_error = b -> str ;
1019+ }
1020+ if (a -> error || b -> error ) {
1021+ if (combining_initialized ) {
1022+ a -> str = a_combining_start ;
1023+ b -> str = b_combining_start ;
1024+ }
1025+ return ;
1026+ }
1027+ /* do either have a combining class of 0 (non-combining)? */
1028+ if (a_decomposing_current .combining_class == 0 || b_decomposing_current .combining_class == 0 ) {
1029+ if (combining_initialized ) {
1030+ /* we've reached the end of the combining sequence */
/* only the side that actually hit class 0 records its end snapshot here.
   NOTE(review): the *_combining_end snapshots are not cleared when rolling
   back for another pass, so an end recorded in an earlier pass can satisfy
   the exit check further down even though, in the final pass, the other
   side stopped first and left marks on this side unexamined. Worth a test
   with combining classes <3 X><1 P> vs <1 P><3 X><2 Q>, where Q appears
   never to be compared - TODO confirm. */
1031+ if (a_decomposing_current .combining_class == 0 ) {
1032+ a_combining_end = a -> str ;
1033+ a_decomposing_combining_end = a_decomposing_current ;
1034+ }
1035+ if (b_decomposing_current .combining_class == 0 ) {
1036+ b_combining_end = b -> str ;
1037+ b_decomposing_combining_end = b_decomposing_current ;
1038+ }
1039+ if (combining_class_next != 0 ) {
1040+ /* prepare for the next pass */
/* mark the class just processed as finished, then make the pending class
   the current one */
1041+ utf8proc_uint8_t * elem = & (combining_classes_finished [combining_class_current /CHAR_BIT ]);
1042+ const utf8proc_uint8_t mask = (utf8proc_uint8_t )(1 << (combining_class_current % CHAR_BIT ));
1043+ * elem |= mask ;
1044+ combining_class_current = combining_class_next ;
1045+ combining_class_next = 0 ;
1046+ /* roll back for next pass */
1047+ a -> str = a_combining_start ;
1048+ b -> str = b_combining_start ;
1049+ a_decomposing_current = a_decomposing_combining_start ;
1050+ b_decomposing_current = b_decomposing_combining_start ;
1051+ continue ;
1052+ }
1053+ /* else exit combining mode */
/* an end snapshot identical to the start snapshot means that side never
   recorded an end; note that && binds tighter than ||, so this reads as
   (a unchanged) || (b unchanged) */
1054+ if (a_combining_end .ptr == a_combining_start .ptr && a_decomposing_combining_end .decomposed_pos == a_decomposing_combining_start .decomposed_pos
1055+ || b_combining_end .ptr == b_combining_start .ptr && b_decomposing_combining_end .decomposed_pos == b_decomposing_combining_start .decomposed_pos ) {
1056+ /* didn't reach the end of one of the sequences yet - mismatch detected */
1057+ a -> str = a_combining_start ;
1058+ b -> str = b_combining_start ;
1059+ return ;
1060+ }
1061+ /* roll forward to the ends of the combining sequence */
1062+ a -> str = a_combining_end ;
1063+ b -> str = b_combining_end ;
1064+ a_decomposing_current = a_decomposing_combining_end ;
1065+ b_decomposing_current = b_decomposing_combining_end ;
1066+ /* resume normal processing in outer loop */
1067+ combining_initialized = false;
1068+ break ;
1069+ }
1070+ /* else not in combining mode and at least one is non-combining */
1071+ if (a_decomposing_current .combining_class != b_decomposing_current .combining_class ) {
1072+ /* mismatch detected */
1073+ return ;
1074+ }
1075+ /* both are non-combining, compare the decomposed buffers */
1076+ if (a_decomposing_current .decomposed [a_decomposing_current .decomposed_pos ] != b_decomposing_current .decomposed [b_decomposing_current .decomposed_pos ]) {
1077+ /* mismatch detected */
1078+ return ;
1079+ }
1080+ /* equal so far */
1081+ ++ a_decomposing_current .decomposed_pos ;
1082+ ++ b_decomposing_current .decomposed_pos ;
1083+ continue ;
1084+ }
1085+ /* both nonzero combining class, initialize combining mode:
1086+ we go one combining class at a time, comparing the decomposed chars
1087+ of that class in order while consuming more from the input strings
1088+ as needed and noting the next class until we reach a non-combining
1089+ char. then, if there's another combining class, we roll back and
1090+ start from the beginning of the sequence again. */
1091+ if (!combining_initialized ) {
/* first pass targets a's class; b's class becomes the pending one if it differs */
1092+ combining_class_current = a_decomposing_current .combining_class ;
1093+ combining_class_next = ((a_decomposing_current .combining_class == b_decomposing_current .combining_class )? 0 : b_decomposing_current .combining_class );
1094+ a_combining_start = a -> str ;
1095+ b_combining_start = b -> str ;
1096+ a_combining_end = a -> str ;
1097+ b_combining_end = b -> str ;
1098+ a_decomposing_combining_end = a_decomposing_combining_start = a_decomposing_current ;
1099+ b_decomposing_combining_end = b_decomposing_combining_start = b_decomposing_current ;
/* clear the finished-class bitset for the new sequence */
1100+ for (pos = 0 ; pos < combining_classes_finished_len ; ++ pos ) {
1101+ combining_classes_finished [pos ] = 0 ;
1102+ }
1103+ combining_initialized = true;
1104+ }
1105+ /* pull more data from one or both until we get both to be current class */
1106+ if (a_decomposing_current .combining_class != combining_class_current ) {
1107+ /* is this an unseen class we can target next? */
1108+ if (combining_class_next == 0 ) {
1109+ const utf8proc_uint8_t elem = combining_classes_finished [a_decomposing_current .combining_class /CHAR_BIT ];
1110+ const utf8proc_uint8_t mask = (utf8proc_uint8_t )(1 << (a_decomposing_current .combining_class % CHAR_BIT ));
1111+ if ((elem & mask ) == 0 ) {
1112+ combining_class_next = a_decomposing_current .combining_class ;
1113+ }
1114+ }
/* skip this char for the current pass */
1115+ ++ a_decomposing_current .decomposed_pos ;
1116+ }
1117+ if (b_decomposing_current .combining_class != combining_class_current ) {
1118+ /* is this an unseen class we can target next? */
1119+ if (combining_class_next == 0 ) {
1120+ const utf8proc_uint8_t elem = combining_classes_finished [b_decomposing_current .combining_class /CHAR_BIT ];
1121+ const utf8proc_uint8_t mask = (utf8proc_uint8_t )(1 << (b_decomposing_current .combining_class % CHAR_BIT ));
1122+ if ((elem & mask ) == 0 ) {
1123+ combining_class_next = b_decomposing_current .combining_class ;
1124+ }
1125+ }
1126+ ++ b_decomposing_current .decomposed_pos ;
1127+ }
/* the combining_class fields still hold the values computed before the
   skips above; if either side was skipped, re-evaluate on the next iteration */
1128+ if (a_decomposing_current .combining_class != combining_class_current || b_decomposing_current .combining_class != combining_class_current ) {
1129+ continue ;
1130+ }
1131+ /* both are the current combining class, compare the decomposed buffers */
1132+ if (a_decomposing_current .decomposed [a_decomposing_current .decomposed_pos ] != b_decomposing_current .decomposed [b_decomposing_current .decomposed_pos ]) {
1133+ /* mismatch detected, roll back string views and exit */
1134+ a -> str = a_combining_start ;
1135+ b -> str = b_combining_start ;
1136+ return ;
1137+ }
1138+ /* equal so far */
1139+ ++ a_decomposing_current .decomposed_pos ;
1140+ ++ b_decomposing_current .decomposed_pos ;
1141+ continue ;
1142+ }
1143+ }
1144+ }
1145+
/*
 * Convenience form of utf8proc_isequal_normalized_custom without per-code-point
 * filters: forwards a, b and options unchanged and passes NULL for both custom
 * functions and their data pointers. Results are reported through *a and *b
 * exactly as by the forwarded call.
 */
1146+ UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized (utf8proc_processing_state_t * a , utf8proc_processing_state_t * b , utf8proc_option_t options ) {
1147+ utf8proc_isequal_normalized_custom (a , b , options , NULL , NULL , NULL , NULL );
1148+ }
0 commit comments