@@ -918,4 +918,185 @@ PHP_FUNCTION(grapheme_str_split)
918918 ubrk_close (bi );
919919}
920920
921+ PHP_FUNCTION (grapheme_levenshtein )
922+ {
923+ zend_string * string1 , * string2 ;
924+ zend_long cost_ins = 1 ;
925+ zend_long cost_rep = 1 ;
926+ zend_long cost_del = 1 ;
927+
928+ ZEND_PARSE_PARAMETERS_START (2 , 5 )
929+ Z_PARAM_STR (string1 )
930+ Z_PARAM_STR (string2 )
931+ Z_PARAM_OPTIONAL
932+ Z_PARAM_LONG (cost_ins )
933+ Z_PARAM_LONG (cost_rep )
934+ Z_PARAM_LONG (cost_del )
935+ ZEND_PARSE_PARAMETERS_END ();
936+
937+ if (cost_ins <= 0 || cost_ins > UINT_MAX / 4 ) {
938+ zend_argument_value_error (3 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
939+ RETURN_THROWS ();
940+ }
941+
942+ if (cost_rep <= 0 || cost_rep > UINT_MAX / 4 ) {
943+ zend_argument_value_error (4 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
944+ RETURN_THROWS ();
945+ }
946+
947+ if (cost_del <= 0 || cost_del > UINT_MAX / 4 ) {
948+ zend_argument_value_error (5 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
949+ RETURN_THROWS ();
950+ }
951+
952+ zend_long * p1 , * p2 , * tmp ;
953+ zend_long c0 , c1 , c2 ;
954+ zend_long retval ;
955+ size_t i2 ;
956+ char * pstr1 , * pstr2 ;
957+
958+ UChar * ustring1 = NULL ;
959+ UChar * ustring2 = NULL ;
960+
961+ int32_t ustring1_len = 0 ;
962+ int32_t ustring2_len = 0 ;
963+
964+ UErrorCode ustatus1 = U_ZERO_ERROR ;
965+ UErrorCode ustatus2 = U_ZERO_ERROR ;
966+
967+ /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
968+ * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
969+ * by having shorter rows (p1 & p2). */
970+ if (ZSTR_LEN (string1 ) < ZSTR_LEN (string2 ) && cost_ins == cost_rep && cost_rep == cost_del ) {
971+ zend_string * tmp = string1 ;
972+ string1 = string2 ;
973+ string2 = tmp ;
974+ }
975+
976+ pstr1 = ZSTR_VAL (string1 );
977+ pstr2 = ZSTR_VAL (string2 );
978+
979+ intl_convert_utf8_to_utf16 (& ustring1 , & ustring1_len , pstr1 , ZSTR_LEN (string1 ), & ustatus1 );
980+
981+ if ( U_FAILURE ( ustatus1 ) ) {
982+ /* Set global error code. */
983+ intl_error_set_code ( NULL , ustatus1 );
984+
985+ /* Set error messages. */
986+ intl_error_set_custom_msg ( NULL , "Error converting input string to UTF-16" , 0 );
987+ if (ustring1 ) {
988+ efree ( ustring1 );
989+ }
990+ RETURN_FALSE ;
991+ }
992+
993+ intl_convert_utf8_to_utf16 (& ustring2 , & ustring2_len , pstr2 , ZSTR_LEN (string2 ), & ustatus2 );
994+
995+ if ( U_FAILURE ( ustatus2 ) ) {
996+ /* Set global error code. */
997+ intl_error_set_code ( NULL , ustatus2 );
998+
999+ /* Set error messages. */
1000+ intl_error_set_custom_msg ( NULL , "Error converting input string to UTF-16" , 0 );
1001+ if (ustring2 ) {
1002+ efree ( ustring2 );
1003+ }
1004+ if (ustring1 ) {
1005+ efree ( ustring1 );
1006+ }
1007+ RETURN_FALSE ;
1008+ }
1009+
1010+ UText * ut1 = NULL ;
1011+ UText * ut2 = NULL ;
1012+ UBreakIterator * bi1 , * bi2 ;
1013+
1014+ int32_t strlen_1 , strlen_2 ;
1015+ strlen_1 = grapheme_split_string (ustring1 , ustring1_len , NULL , 0 );
1016+ strlen_2 = grapheme_split_string (ustring2 , ustring2_len , NULL , 0 );
1017+
1018+ if (strlen_1 == 0 ) {
1019+ efree (ustring1 );
1020+ efree (ustring2 );
1021+ RETURN_LONG (strlen_2 * cost_ins );
1022+ }
1023+ if (strlen_2 == 0 ) {
1024+ efree (ustring1 );
1025+ efree (ustring2 );
1026+ RETURN_LONG (strlen_1 * cost_del );
1027+ }
1028+
1029+ unsigned char u_break_iterator_buffer1 [U_BRK_SAFECLONE_BUFFERSIZE ];
1030+ unsigned char u_break_iterator_buffer2 [U_BRK_SAFECLONE_BUFFERSIZE ];
1031+ bi1 = grapheme_get_break_iterator ((void * )u_break_iterator_buffer1 , & ustatus1 );
1032+ bi2 = grapheme_get_break_iterator ((void * )u_break_iterator_buffer2 , & ustatus2 );
1033+
1034+ ut1 = utext_openUTF8 (ut1 , pstr1 , ZSTR_LEN (string1 ), & ustatus1 );
1035+ ubrk_setUText (bi1 , ut1 , & ustatus1 );
1036+ ut2 = utext_openUTF8 (ut2 , pstr2 , ZSTR_LEN (string2 ), & ustatus2 );
1037+ ubrk_setUText (bi2 , ut2 , & ustatus2 );
1038+
1039+ p1 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1040+ p2 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1041+
1042+ for (i2 = 0 ; i2 <= strlen_2 ; i2 ++ ) {
1043+ p1 [i2 ] = i2 * cost_ins ;
1044+ }
1045+
1046+ int32_t current1 = 0 ;
1047+ int32_t current2 = 0 ;
1048+ int32_t pos1 = 0 ;
1049+ int32_t pos2 = 0 ;
1050+ int32_t usrch_pos = 0 ;
1051+ for ( ; pos1 != UBRK_DONE ; ) {
1052+ current1 = ubrk_current (bi1 );
1053+ pos1 = ubrk_next (bi1 );
1054+ if (pos1 == UBRK_DONE ) {
1055+ break ;
1056+ }
1057+ p2 [0 ] = p1 [0 ] + cost_del ;
1058+ for (i2 = 0 , pos2 = 0 ; pos2 != UBRK_DONE ; i2 ++ ) {
1059+ current2 = ubrk_current (bi2 );
1060+ pos2 = ubrk_next (bi2 );
1061+ if (pos2 == UBRK_DONE ) {
1062+ break ;
1063+ }
1064+ usrch_pos = grapheme_strpos_utf16 (pstr1 + current1 , pos1 - current1 , pstr2 + current2 , pos2 - current2 , 0 , NULL , 0 , 0 );
1065+ if (usrch_pos == 0 ) {
1066+ c0 = p1 [i2 ];
1067+ } else {
1068+ c0 = p1 [i2 ] + cost_rep ;
1069+ }
1070+ c1 = p1 [i2 + 1 ] + cost_del ;
1071+ if (c1 < c0 ) {
1072+ c0 = c1 ;
1073+ }
1074+ c2 = p2 [i2 ] + cost_ins ;
1075+ if (c2 < c0 ) {
1076+ c0 = c2 ;
1077+ }
1078+ p2 [i2 + 1 ] = c0 ;
1079+ }
1080+ ubrk_first (bi2 );
1081+ tmp = p1 ;
1082+ p1 = p2 ;
1083+ p2 = tmp ;
1084+ }
1085+
1086+ utext_close (ut1 );
1087+ utext_close (ut2 );
1088+
1089+ ubrk_close (bi1 );
1090+ ubrk_close (bi2 );
1091+
1092+ efree (ustring1 );
1093+ efree (ustring2 );
1094+
1095+ retval = p1 [strlen_2 ];
1096+
1097+ efree (p1 );
1098+ efree (p2 );
1099+ RETURN_LONG (retval );
1100+ }
1101+
9211102/* }}} */
0 commit comments