-
-
Notifications
You must be signed in to change notification settings - Fork 152
added the grapheme_levenshtein polyfill (closes #555) #558
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 1.x
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,7 +11,12 @@ | |
|
|
||
| namespace Symfony\Polyfill\Intl\Grapheme; | ||
|
|
||
| \define('SYMFONY_GRAPHEME_CLUSTER_RX', ((float) \PCRE_VERSION < 10 ? (float) \PCRE_VERSION >= 8.32 : (float) \PCRE_VERSION >= 10.39) ? '\X' : Grapheme::GRAPHEME_CLUSTER_RX); | ||
| \define('SYMFONY_GRAPHEME_CLUSTER_RX', | ||
| (false !== @preg_match('/\X/u', "a")) | ||
| ? '\X' | ||
| : Grapheme::GRAPHEME_CLUSTER_RX | ||
| ); | ||
|
|
||
|
|
||
| /** | ||
| * Partial intl implementation in pure PHP. | ||
|
|
@@ -222,6 +227,82 @@ public static function grapheme_str_split($s, $len = 1) | |
|
|
||
| return $chunks; | ||
| } | ||
| /** | ||
| * @param string $string1 | ||
| * @param string $string2 | ||
| * @param int $insertion_cost | ||
| * @param int $replacement_cost | ||
| * @param int $deletion_cost | ||
| * | ||
| * @return int|false | ||
| * | ||
| * @see https://wiki.php.net/rfc/grapheme_levenshtein | ||
| */ | ||
| public static function grapheme_levenshtein($string1, $string2, $insertion_cost = 1, $replacement_cost = 1, $deletion_cost = 1) | ||
| { | ||
| // Cast (PHP does this) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use native parameter types, so that PHP actually does it. |
||
| $string1 = (string) $string1; | ||
| $string2 = (string) $string2; | ||
| $insertion_cost = (int) $insertion_cost; | ||
| $replacement_cost = (int) $replacement_cost; | ||
| $deletion_cost = (int) $deletion_cost; | ||
|
|
||
| // Validate UTF-8 (RFC: return false if invalid) | ||
| if (1 !== preg_match('//u', $string1) || 1 !== preg_match('//u', $string2)) { | ||
| return false; | ||
| } | ||
|
|
||
| // Costs must be >= 0 | ||
| if ($insertion_cost < 0 || $replacement_cost < 0 || $deletion_cost < 0) { | ||
| if (\PHP_VERSION_ID < 80000) { | ||
| return false; | ||
| } | ||
| throw new \ValueError('grapheme_levenshtein(): costs must be greater than or equal to 0'); | ||
| } | ||
|
|
||
| // Grapheme cluster segmentation (same logic as grapheme_str_split) | ||
| preg_match_all('/(' . SYMFONY_GRAPHEME_CLUSTER_RX . ')/u', $string1, $m1); | ||
| preg_match_all('/(' . SYMFONY_GRAPHEME_CLUSTER_RX . ')/u', $string2, $m2); | ||
|
|
||
| $a = $m1[0] ?? []; | ||
| $b = $m2[0] ?? []; | ||
|
|
||
| $lenA = count($a); | ||
| $lenB = count($b); | ||
|
|
||
| // Edge cases | ||
| if (0 === $lenA) { | ||
| return $lenB * $insertion_cost; | ||
| } | ||
| if (0 === $lenB) { | ||
| return $lenA * $deletion_cost; | ||
| } | ||
|
|
||
| // Levenshtein DP matrix | ||
| $dp = array_fill(0, $lenA + 1, array_fill(0, $lenB + 1, 0)); | ||
|
|
||
| for ($i = 1; $i <= $lenA; $i++) { | ||
| $dp[$i][0] = $dp[$i - 1][0] + $deletion_cost; | ||
| } | ||
| for ($j = 1; $j <= $lenB; $j++) { | ||
| $dp[0][$j] = $dp[0][$j - 1] + $insertion_cost; | ||
| } | ||
|
|
||
| for ($i = 1; $i <= $lenA; $i++) { | ||
| for ($j = 1; $j <= $lenB; $j++) { | ||
| $cost = ($a[$i - 1] === $b[$j - 1]) ? 0 : $replacement_cost; | ||
|
|
||
| $dp[$i][$j] = min( | ||
| $dp[$i - 1][$j] + $deletion_cost, // delete | ||
| $dp[$i][$j - 1] + $insertion_cost, // insert | ||
| $dp[$i - 1][$j - 1] + $cost // replace | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| return $dp[$lenA][$lenB]; | ||
| } | ||
|
|
||
|
|
||
| private static function grapheme_position($s, $needle, $offset, $mode) | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ | |
| */ | ||
|
|
||
| use Symfony\Polyfill\Php85 as p; | ||
| use Symfony\Polyfill\Intl\Grapheme\Grapheme; | ||
|
|
||
| if (\PHP_VERSION_ID >= 80500) { | ||
| return; | ||
|
|
@@ -30,3 +31,10 @@ function array_first(array $array) { return p\Php85::array_first($array); } | |
| if (!function_exists('array_last')) { | ||
| function array_last(array $array) { return p\Php85::array_last($array); } | ||
| } | ||
|
|
||
| if (\extension_loaded('intl') && !function_exists('grapheme_levenshtein')) { | ||
| function grapheme_levenshtein($string1, $string2, $insertion_cost = 1, $replacement_cost = 1, $deletion_cost = 1) | ||
| { | ||
| return Grapheme::grapheme_levenshtein($string1, $string2, $insertion_cost, $replacement_cost, $deletion_cost); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The implementation must be duplicated in the |
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| <?php | ||
|
|
||
| use Symfony\Polyfill\Intl\Grapheme\Grapheme; | ||
|
|
||
| return [ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This file does not make sense. It does not provide a phpt test, and we use PHPUnit tests anyway.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I just wanted to make a test file. I'll look more deeply into this implementation.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The file you added is not a test file at all. It is not testing anything.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| // Basic | ||
| [0, 'abc', 'abc'], | ||
| [1, 'abc', 'abd'], | ||
| [2, 'kitten', 'sitting'], | ||
|
|
||
| // Multibyte | ||
| [1, 'àbc', 'abc'], | ||
| [2, 'àbç', 'aBc'], | ||
|
|
||
| // Emoji | ||
| [0, '😊', '😊'], | ||
| [1, '😊', '😂'], | ||
| [1, "👨👩👧👦", "👨👩👦"], | ||
|
|
||
| // Empty | ||
| [0, '', ''], | ||
| [3, '', 'abc'], | ||
| [4, '', "👨👩👧👦"], | ||
|
|
||
| // Different lengths | ||
| [3, 'foo', 'foobar'], | ||
| [1, "🙂🙂", "🙂"], | ||
| ]; | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change looks suspicious to me. We were explicitly checking the PCRE version because older versions of PCRE also have a broken implementation of `\X` (there is even an open discussion mentioning that we might need to increase the bound from which we use the native feature).