From a6621e8a93f6620d94ab9e53b963d994ebd8b67c Mon Sep 17 00:00:00 2001
From: Sudam Ranasinghe
Date: Tue, 25 Nov 2025 22:19:47 +0530
Subject: [PATCH 1/2] added the grapheme_levenshtein polyfill (closes #555)

---
 src/Intl/Grapheme/Grapheme.php                | 78 +++++++++++++++++++
 src/Intl/Grapheme/bootstrap.php               |  3 +
 src/Php85/bootstrap.php                       |  6 ++
 tests/Intl/Grapheme/grapheme_levenshtein.phpt | 28 +++++++
 4 files changed, 115 insertions(+)
 create mode 100644 tests/Intl/Grapheme/grapheme_levenshtein.phpt

diff --git a/src/Intl/Grapheme/Grapheme.php b/src/Intl/Grapheme/Grapheme.php
index f9e9e5741..1fb191761 100644
--- a/src/Intl/Grapheme/Grapheme.php
+++ b/src/Intl/Grapheme/Grapheme.php
@@ -222,6 +222,84 @@ public static function grapheme_str_split($s, $len = 1)
 
         return $chunks;
     }
 
+    /**
+     * Computes the Levenshtein distance between two strings, counting grapheme clusters instead of bytes.
+     *
+     * Clusters are compared byte-for-byte, so canonically equivalent clusters that are encoded
+     * differently are treated as different.
+     *
+     * @param string $string1
+     * @param string $string2
+     * @param int    $insertion_cost   cost of inserting one cluster
+     * @param int    $replacement_cost cost of replacing one cluster
+     * @param int    $deletion_cost    cost of deleting one cluster
+     *
+     * @return int|false the weighted edit distance; false on invalid UTF-8, on a segmentation
+     *                   failure and, before PHP 8.0, on a negative cost
+     *
+     * @throws \ValueError on PHP 8.0+ when a cost is negative
+     *
+     * @see https://wiki.php.net/rfc/grapheme_levenshtein
+     */
+    public static function grapheme_levenshtein($string1, $string2, $insertion_cost = 1, $replacement_cost = 1, $deletion_cost = 1)
+    {
+        // Normalize argument types up front.
+        $string1 = (string) $string1;
+        $string2 = (string) $string2;
+        $insertion_cost = (int) $insertion_cost;
+        $replacement_cost = (int) $replacement_cost;
+        $deletion_cost = (int) $deletion_cost;
+
+        // With the "u" modifier preg_match() fails on malformed UTF-8, so an empty pattern is a cheap validity check.
+        if (1 !== preg_match('//u', $string1) || 1 !== preg_match('//u', $string2)) {
+            return false;
+        }
+
+        if ($insertion_cost < 0 || $replacement_cost < 0 || $deletion_cost < 0) {
+            if (\PHP_VERSION_ID < 80000) {
+                return false;
+            }
+
+            throw new \ValueError('grapheme_levenshtein(): costs must be greater than or equal to 0');
+        }
+
+        // Split both inputs into grapheme clusters; index 0 of the result holds the full matches.
+        if (false === preg_match_all('/'.SYMFONY_GRAPHEME_CLUSTER_RX.'/u', $string1, $m1)
+            || false === preg_match_all('/'.SYMFONY_GRAPHEME_CLUSTER_RX.'/u', $string2, $m2)
+        ) {
+            return false;
+        }
+
+        $a = $m1[0];
+        $b = $m2[0];
+        $lenA = \count($a);
+        $lenB = \count($b);
+
+        // Each DP row only depends on the row above it, so two rows are enough.
+        // Row 0 is the cost of building the first $j clusters of $b from nothing.
+        $previous = [];
+        for ($j = 0; $j <= $lenB; ++$j) {
+            $previous[$j] = $j * $insertion_cost;
+        }
+
+        for ($i = 1; $i <= $lenA; ++$i) {
+            // Column 0 is the cost of deleting the first $i clusters of $a.
+            $current = [$i * $deletion_cost];
+
+            for ($j = 1; $j <= $lenB; ++$j) {
+                $current[$j] = min(
+                    $previous[$j] + $deletion_cost,
+                    $current[$j - 1] + $insertion_cost,
+                    $previous[$j - 1] + ($a[$i - 1] === $b[$j - 1] ? 0 : $replacement_cost)
+                );
+            }
+
+            $previous = $current;
+        }
+
+        return $previous[$lenB];
+    }
+
     private static function grapheme_position($s, $needle, $offset, $mode)
     {
diff --git a/src/Intl/Grapheme/bootstrap.php b/src/Intl/Grapheme/bootstrap.php
index 374dbd3a7..22ebc80c2 100644
--- a/src/Intl/Grapheme/bootstrap.php
+++ b/src/Intl/Grapheme/bootstrap.php
@@ -55,3 +55,6 @@ function grapheme_substr($string, $offset, $length = null) { return p\Grapheme::
 if (!function_exists('grapheme_str_split')) {
     function grapheme_str_split($string, $length = 1) { return p\Grapheme::grapheme_str_split($string, $length); }
 }
+if (!function_exists('grapheme_levenshtein')) {
+    function grapheme_levenshtein($string1, $string2, $insertion_cost = 1, $replacement_cost = 1, $deletion_cost = 1) { return p\Grapheme::grapheme_levenshtein($string1, $string2, $insertion_cost, $replacement_cost, $deletion_cost); }
+}
diff --git a/src/Php85/bootstrap.php b/src/Php85/bootstrap.php
index 44e872b1f..e33ed4212 100644
--- a/src/Php85/bootstrap.php
+++ b/src/Php85/bootstrap.php
@@ -10,6 +10,7 @@
  */
 
+use Symfony\Polyfill\Intl\Grapheme\Grapheme;
 use Symfony\Polyfill\Php85 as p;
 
 if (\PHP_VERSION_ID >= 80500) {
     return;
@@ -30,3 +31,8 @@ function array_first(array $array) { return p\Php85::array_first($array); }
 if (!function_exists('array_last')) {
     function array_last(array $array) { return p\Php85::array_last($array); }
 }
+
+// The implementation lives in the Intl\Grapheme polyfill, which may not be installed alongside this one.
+if (\extension_loaded('intl') && !function_exists('grapheme_levenshtein') && class_exists(Grapheme::class)) {
+    function grapheme_levenshtein($string1, $string2, $insertion_cost = 1, $replacement_cost = 1, $deletion_cost = 1) { return Grapheme::grapheme_levenshtein($string1, $string2, $insertion_cost, $replacement_cost, $deletion_cost); }
+}
diff --git a/tests/Intl/Grapheme/grapheme_levenshtein.phpt b/tests/Intl/Grapheme/grapheme_levenshtein.phpt
new file mode 100644
index 000000000..7d4ccd811
--- /dev/null
+++ b/tests/Intl/Grapheme/grapheme_levenshtein.phpt
@@ -0,0 +1,28 @@
+
Date: Tue, 25 Nov 2025 22:41:49 +0530
Subject: [PATCH 2/2] added the grapheme_levenshtein polyfill (closes #555)

---
 src/Intl/Grapheme/Grapheme.php | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Intl/Grapheme/Grapheme.php b/src/Intl/Grapheme/Grapheme.php
index 1fb191761..e214c55e3 100644
--- a/src/Intl/Grapheme/Grapheme.php
+++ b/src/Intl/Grapheme/Grapheme.php
@@ -11,7 +11,9 @@
 
 namespace Symfony\Polyfill\Intl\Grapheme;
 
+// Keep selecting the pattern by PCRE version: a runtime probe such as preg_match('/\X/u', 'a')
+// succeeds whenever \X compiles, so it would never fall back to Grapheme::GRAPHEME_CLUSTER_RX.
 \define('SYMFONY_GRAPHEME_CLUSTER_RX', ((float) \PCRE_VERSION < 10 ? (float) \PCRE_VERSION >= 8.32 : (float) \PCRE_VERSION >= 10.39) ? '\X' : Grapheme::GRAPHEME_CLUSTER_RX);
 
 /**
  * Partial intl implementation in pure PHP.