@@ -917,4 +917,185 @@ PHP_FUNCTION(grapheme_str_split)
917917 ubrk_close (bi );
918918}
919919
920+ PHP_FUNCTION (grapheme_levenshtein )
921+ {
922+ zend_string * string1 , * string2 ;
923+ zend_long cost_ins = 1 ;
924+ zend_long cost_rep = 1 ;
925+ zend_long cost_del = 1 ;
926+
927+ ZEND_PARSE_PARAMETERS_START (2 , 5 )
928+ Z_PARAM_STR (string1 )
929+ Z_PARAM_STR (string2 )
930+ Z_PARAM_OPTIONAL
931+ Z_PARAM_LONG (cost_ins )
932+ Z_PARAM_LONG (cost_rep )
933+ Z_PARAM_LONG (cost_del )
934+ ZEND_PARSE_PARAMETERS_END ();
935+
936+ if (cost_ins <= 0 || cost_ins > UINT_MAX / 4 ) {
937+ zend_argument_value_error (3 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
938+ RETURN_THROWS ();
939+ }
940+
941+ if (cost_rep <= 0 || cost_rep > UINT_MAX / 4 ) {
942+ zend_argument_value_error (4 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
943+ RETURN_THROWS ();
944+ }
945+
946+ if (cost_del <= 0 || cost_del > UINT_MAX / 4 ) {
947+ zend_argument_value_error (5 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
948+ RETURN_THROWS ();
949+ }
950+
951+ zend_long * p1 , * p2 , * tmp ;
952+ zend_long c0 , c1 , c2 ;
953+ zend_long retval ;
954+ size_t i2 ;
955+ char * pstr1 , * pstr2 ;
956+
957+ UChar * ustring1 = NULL ;
958+ UChar * ustring2 = NULL ;
959+
960+ int32_t ustring1_len = 0 ;
961+ int32_t ustring2_len = 0 ;
962+
963+ UErrorCode ustatus1 = U_ZERO_ERROR ;
964+ UErrorCode ustatus2 = U_ZERO_ERROR ;
965+
966+ /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
967+ * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
968+ * by having shorter rows (p1 & p2). */
969+ if (ZSTR_LEN (string1 ) < ZSTR_LEN (string2 ) && cost_ins == cost_rep && cost_rep == cost_del ) {
970+ zend_string * tmp = string1 ;
971+ string1 = string2 ;
972+ string2 = tmp ;
973+ }
974+
975+ pstr1 = ZSTR_VAL (string1 );
976+ pstr2 = ZSTR_VAL (string2 );
977+
978+ intl_convert_utf8_to_utf16 (& ustring1 , & ustring1_len , pstr1 , ZSTR_LEN (string1 ), & ustatus1 );
979+
980+ if ( U_FAILURE ( ustatus1 ) ) {
981+ /* Set global error code. */
982+ intl_error_set_code ( NULL , ustatus1 );
983+
984+ /* Set error messages. */
985+ intl_error_set_custom_msg ( NULL , "Error converting input string to UTF-16" , 0 );
986+ if (ustring1 ) {
987+ efree ( ustring1 );
988+ }
989+ RETURN_FALSE ;
990+ }
991+
992+ intl_convert_utf8_to_utf16 (& ustring2 , & ustring2_len , pstr2 , ZSTR_LEN (string2 ), & ustatus2 );
993+
994+ if ( U_FAILURE ( ustatus2 ) ) {
995+ /* Set global error code. */
996+ intl_error_set_code ( NULL , ustatus2 );
997+
998+ /* Set error messages. */
999+ intl_error_set_custom_msg ( NULL , "Error converting input string to UTF-16" , 0 );
1000+ if (ustring2 ) {
1001+ efree ( ustring2 );
1002+ }
1003+ if (ustring1 ) {
1004+ efree ( ustring1 );
1005+ }
1006+ RETURN_FALSE ;
1007+ }
1008+
1009+ UText * ut1 = NULL ;
1010+ UText * ut2 = NULL ;
1011+ UBreakIterator * bi1 , * bi2 ;
1012+
1013+ int32_t strlen_1 , strlen_2 ;
1014+ strlen_1 = grapheme_split_string (ustring1 , ustring1_len , NULL , 0 );
1015+ strlen_2 = grapheme_split_string (ustring2 , ustring2_len , NULL , 0 );
1016+
1017+ if (strlen_1 == 0 ) {
1018+ efree (ustring1 );
1019+ efree (ustring2 );
1020+ RETURN_LONG (strlen_2 * cost_ins );
1021+ }
1022+ if (strlen_2 == 0 ) {
1023+ efree (ustring1 );
1024+ efree (ustring2 );
1025+ RETURN_LONG (strlen_1 * cost_del );
1026+ }
1027+
1028+ unsigned char u_break_iterator_buffer1 [U_BRK_SAFECLONE_BUFFERSIZE ];
1029+ unsigned char u_break_iterator_buffer2 [U_BRK_SAFECLONE_BUFFERSIZE ];
1030+ bi1 = grapheme_get_break_iterator ((void * )u_break_iterator_buffer1 , & ustatus1 );
1031+ bi2 = grapheme_get_break_iterator ((void * )u_break_iterator_buffer2 , & ustatus2 );
1032+
1033+ ut1 = utext_openUTF8 (ut1 , pstr1 , ZSTR_LEN (string1 ), & ustatus1 );
1034+ ubrk_setUText (bi1 , ut1 , & ustatus1 );
1035+ ut2 = utext_openUTF8 (ut2 , pstr2 , ZSTR_LEN (string2 ), & ustatus2 );
1036+ ubrk_setUText (bi2 , ut2 , & ustatus2 );
1037+
1038+ p1 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1039+ p2 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1040+
1041+ for (i2 = 0 ; i2 <= strlen_2 ; i2 ++ ) {
1042+ p1 [i2 ] = i2 * cost_ins ;
1043+ }
1044+
1045+ int32_t current1 = 0 ;
1046+ int32_t current2 = 0 ;
1047+ int32_t pos1 = 0 ;
1048+ int32_t pos2 = 0 ;
1049+ int32_t usrch_pos = 0 ;
1050+ for ( ; pos1 != UBRK_DONE ; ) {
1051+ current1 = ubrk_current (bi1 );
1052+ pos1 = ubrk_next (bi1 );
1053+ if (pos1 == UBRK_DONE ) {
1054+ break ;
1055+ }
1056+ p2 [0 ] = p1 [0 ] + cost_del ;
1057+ for (i2 = 0 , pos2 = 0 ; pos2 != UBRK_DONE ; i2 ++ ) {
1058+ current2 = ubrk_current (bi2 );
1059+ pos2 = ubrk_next (bi2 );
1060+ if (pos2 == UBRK_DONE ) {
1061+ break ;
1062+ }
1063+ usrch_pos = grapheme_strpos_utf16 (pstr1 + current1 , pos1 - current1 , pstr2 + current2 , pos2 - current2 , 0 , NULL , 0 , 0 );
1064+ if (usrch_pos == 0 ) {
1065+ c0 = p1 [i2 ];
1066+ } else {
1067+ c0 = p1 [i2 ] + cost_rep ;
1068+ }
1069+ c1 = p1 [i2 + 1 ] + cost_del ;
1070+ if (c1 < c0 ) {
1071+ c0 = c1 ;
1072+ }
1073+ c2 = p2 [i2 ] + cost_ins ;
1074+ if (c2 < c0 ) {
1075+ c0 = c2 ;
1076+ }
1077+ p2 [i2 + 1 ] = c0 ;
1078+ }
1079+ ubrk_first (bi2 );
1080+ tmp = p1 ;
1081+ p1 = p2 ;
1082+ p2 = tmp ;
1083+ }
1084+
1085+ utext_close (ut1 );
1086+ utext_close (ut2 );
1087+
1088+ ubrk_close (bi1 );
1089+ ubrk_close (bi2 );
1090+
1091+ efree (ustring1 );
1092+ efree (ustring2 );
1093+
1094+ retval = p1 [strlen_2 ];
1095+
1096+ efree (p1 );
1097+ efree (p2 );
1098+ RETURN_LONG (retval );
1099+ }
1100+
9201101/* }}} */
0 commit comments