@@ -918,4 +918,219 @@ PHP_FUNCTION(grapheme_str_split)
918918 ubrk_close (bi );
919919}
920920
921+ PHP_FUNCTION (grapheme_levenshtein )
922+ {
923+ zend_string * string1 , * string2 ;
924+ zend_long cost_ins = 1 ;
925+ zend_long cost_rep = 1 ;
926+ zend_long cost_del = 1 ;
927+
928+ ZEND_PARSE_PARAMETERS_START (2 , 5 )
929+ Z_PARAM_STR (string1 )
930+ Z_PARAM_STR (string2 )
931+ Z_PARAM_OPTIONAL
932+ Z_PARAM_LONG (cost_ins )
933+ Z_PARAM_LONG (cost_rep )
934+ Z_PARAM_LONG (cost_del )
935+ ZEND_PARSE_PARAMETERS_END ();
936+
937+ if (cost_ins <= 0 || cost_ins > UINT_MAX / 4 ) {
938+ zend_argument_value_error (3 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
939+ RETURN_THROWS ();
940+ }
941+
942+ if (cost_rep <= 0 || cost_rep > UINT_MAX / 4 ) {
943+ zend_argument_value_error (4 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
944+ RETURN_THROWS ();
945+ }
946+
947+ if (cost_del <= 0 || cost_del > UINT_MAX / 4 ) {
948+ zend_argument_value_error (5 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
949+ RETURN_THROWS ();
950+ }
951+
952+ zend_long c0 , c1 , c2 ;
953+ zend_long retval ;
954+ size_t i2 ;
955+ char * pstr1 , * pstr2 ;
956+
957+ UChar * ustring1 = NULL ;
958+ UChar * ustring2 = NULL ;
959+
960+ int32_t ustring1_len = 0 ;
961+ int32_t ustring2_len = 0 ;
962+
963+ UErrorCode ustatus = U_ZERO_ERROR ;
964+
965+ /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
966+ * that the distance is symmetric. If string1 is shorter than string2 we can save memory (and CPU time)
967+ * by having shorter rows (p1 & p2). */
968+ if (ZSTR_LEN (string1 ) < ZSTR_LEN (string2 ) && cost_ins == cost_rep && cost_rep == cost_del ) {
969+ zend_string * tmp = string1 ;
970+ string1 = string2 ;
971+ string2 = tmp ;
972+ }
973+
974+ pstr1 = ZSTR_VAL (string1 );
975+ pstr2 = ZSTR_VAL (string2 );
976+
977+ intl_convert_utf8_to_utf16 (& ustring1 , & ustring1_len , pstr1 , ZSTR_LEN (string1 ), & ustatus );
978+
979+ if (U_FAILURE (ustatus )) {
980+ intl_error_set_code (NULL , ustatus );
981+
982+ intl_error_set_custom_msg (NULL , "Error converting input string to UTF-16" , 0 );
983+ efree (ustring1 );
984+ RETURN_FALSE ;
985+ }
986+
987+ intl_convert_utf8_to_utf16 (& ustring2 , & ustring2_len , pstr2 , ZSTR_LEN (string2 ), & ustatus );
988+
989+ if (U_FAILURE (ustatus )) {
990+ intl_error_set_code (NULL , ustatus );
991+
992+ intl_error_set_custom_msg (NULL , "Error converting input string to UTF-16" , 0 );
993+ efree (ustring2 );
994+ efree (ustring1 );
995+ RETURN_FALSE ;
996+ }
997+
998+ UBreakIterator * bi1 , * bi2 ;
999+
1000+ int32_t strlen_1 , strlen_2 ;
1001+ strlen_1 = grapheme_split_string (ustring1 , ustring1_len , NULL , 0 );
1002+ strlen_2 = grapheme_split_string (ustring2 , ustring2_len , NULL , 0 );
1003+
1004+ if (strlen_1 == 0 ) {
1005+ efree (ustring1 );
1006+ efree (ustring2 );
1007+ RETURN_LONG (strlen_2 * cost_ins );
1008+ }
1009+ if (strlen_2 == 0 ) {
1010+ efree (ustring1 );
1011+ efree (ustring2 );
1012+ RETURN_LONG (strlen_1 * cost_del );
1013+ }
1014+
1015+ unsigned char u_break_iterator_buffer1 [U_BRK_SAFECLONE_BUFFERSIZE ];
1016+ unsigned char u_break_iterator_buffer2 [U_BRK_SAFECLONE_BUFFERSIZE ];
1017+ bi1 = grapheme_get_break_iterator (u_break_iterator_buffer1 , & ustatus );
1018+ if (U_FAILURE (ustatus )) {
1019+ intl_error_set_code (NULL , ustatus );
1020+ intl_error_set_custom_msg (NULL , "Error on grapheme_get_break_iterator for argument #1 ($string1)" , 0 );
1021+ efree (ustring2 );
1022+ efree (ustring1 );
1023+ ubrk_close (bi1 );
1024+ RETURN_FALSE ;
1025+ }
1026+
1027+ bi2 = grapheme_get_break_iterator (u_break_iterator_buffer2 , & ustatus );
1028+ if (U_FAILURE (ustatus )) {
1029+ intl_error_set_code (NULL , ustatus );
1030+ intl_error_set_custom_msg (NULL , "Error on grapheme_get_break_iterator for argument #2 ($string2)" , 0 );
1031+ efree (ustring2 );
1032+ efree (ustring1 );
1033+ ubrk_close (bi2 );
1034+ ubrk_close (bi1 );
1035+ RETURN_FALSE ;
1036+ }
1037+ ubrk_setText (bi1 , ustring1 , ustring1_len , & ustatus );
1038+
1039+ if (U_FAILURE (ustatus )) {
1040+ intl_error_set_code (NULL , ustatus );
1041+
1042+ intl_error_set_custom_msg (NULL , "Error on ubrk_setText for argument #1 ($string1)" , 0 );
1043+ efree (ustring2 );
1044+ efree (ustring1 );
1045+ ubrk_close (bi2 );
1046+ ubrk_close (bi1 );
1047+ RETURN_FALSE ;
1048+ }
1049+
1050+ ubrk_setText (bi2 , ustring2 , ustring2_len , & ustatus );
1051+ if (U_FAILURE (ustatus )) {
1052+ intl_error_set_code (NULL , ustatus );
1053+
1054+ intl_error_set_custom_msg (NULL , "Error on ubrk_setText for argument #2 ($string2)" , 0 );
1055+ efree (ustring2 );
1056+ efree (ustring1 );
1057+ ubrk_close (bi2 );
1058+ ubrk_close (bi1 );
1059+ RETURN_FALSE ;
1060+ }
1061+ UCollator * collator = ucol_open ("" , & ustatus );
1062+ if (U_FAILURE (ustatus )) {
1063+ intl_error_set_code (NULL , ustatus );
1064+
1065+ intl_error_set_custom_msg (NULL , "Error on ucol_open" , 0 );
1066+ efree (ustring2 );
1067+ efree (ustring1 );
1068+ ubrk_close (bi2 );
1069+ ubrk_close (bi1 );
1070+ ucol_close (collator );
1071+ RETURN_FALSE ;
1072+ }
1073+
1074+ zend_long * p1 , * p2 , * tmp ;
1075+ p1 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1076+ p2 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1077+
1078+ for (i2 = 0 ; i2 <= strlen_2 ; i2 ++ ) {
1079+ p1 [i2 ] = i2 * cost_ins ;
1080+ }
1081+
1082+ int32_t current1 = 0 ;
1083+ int32_t current2 = 0 ;
1084+ int32_t pos1 = 0 ;
1085+ int32_t pos2 = 0 ;
1086+
1087+ while (true) {
1088+ current1 = ubrk_current (bi1 );
1089+ pos1 = ubrk_next (bi1 );
1090+ if (pos1 == UBRK_DONE ) {
1091+ break ;
1092+ }
1093+ p2 [0 ] = p1 [0 ] + cost_del ;
1094+ for (i2 = 0 , pos2 = 0 ; pos2 != UBRK_DONE ; i2 ++ ) {
1095+ current2 = ubrk_current (bi2 );
1096+ pos2 = ubrk_next (bi2 );
1097+ if (pos2 == UBRK_DONE ) {
1098+ break ;
1099+ }
1100+ if (ucol_strcoll (collator , ustring1 + current1 , pos1 - current1 , ustring2 + current2 , pos2 - current2 ) == UCOL_EQUAL ) {
1101+ c0 = p1 [i2 ];
1102+ } else {
1103+ c0 = p1 [i2 ] + cost_rep ;
1104+ }
1105+ c1 = p1 [i2 + 1 ] + cost_del ;
1106+ if (c1 < c0 ) {
1107+ c0 = c1 ;
1108+ }
1109+ c2 = p2 [i2 ] + cost_ins ;
1110+ if (c2 < c0 ) {
1111+ c0 = c2 ;
1112+ }
1113+ p2 [i2 + 1 ] = c0 ;
1114+ }
1115+ ubrk_first (bi2 );
1116+ tmp = p1 ;
1117+ p1 = p2 ;
1118+ p2 = tmp ;
1119+ }
1120+
1121+ ucol_close (collator );
1122+
1123+ ubrk_close (bi1 );
1124+ ubrk_close (bi2 );
1125+
1126+ efree (ustring1 );
1127+ efree (ustring2 );
1128+
1129+ retval = p1 [strlen_2 ];
1130+
1131+ efree (p1 );
1132+ efree (p2 );
1133+ RETURN_LONG (retval );
1134+ }
1135+
9211136/* }}} */
0 commit comments