diff --git a/Sources/Utils.php b/Sources/Utils.php index 32544997ef..3caace4b25 100644 --- a/Sources/Utils.php +++ b/Sources/Utils.php @@ -1053,10 +1053,11 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r // Can we trim common characters from the end? $trailing = ''; + $i = -1; unset($normalized_strings); - while (mb_strlen($strings[0], $encoding) > 1) { - $last_char = mb_substr($strings[0], -1, 1, $encoding); + while (\strlen($strings[0]) > 1) { + $last_char = mb_substr($strings[0], $i, null, $encoding); foreach ($strings as $string) { if (!str_ends_with($string, $last_char)) { @@ -1064,37 +1065,101 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r } } - $strings = array_map(fn($string) => mb_substr($string, 0, -1, $encoding), $strings); - $trailing = $last_char . $trailing; + $i--; + $trailing = $last_char; } - // Create the trie from the strings. + if ($trailing !== '') { + $strings = array_map(fn($string) => substr($string, 0, -\strlen($trailing)), $strings); + } + + // Build a radix tree from the input strings $trie = []; foreach ($strings as $string) { - $chars = mb_str_split($string, 1, $encoding); + $current_node = &$trie; + + $remaining = $string; + + while ($remaining !== '') { + $matched = false; + + // Check each existing branch at this level + foreach ($current_node as $key => &$subtree) { + // Find longest common prefix + $len_key = \strlen($key); + $len_remaining = \strlen($remaining); + $byte_prefix_len = 0; + + // Determine the length (in bytes) of the common UTF-8 prefix between + // $key and $remaining. + while ($byte_prefix_len < $len_key && $byte_prefix_len < $len_remaining) { + $byte_key = \ord($key[$byte_prefix_len]); + $byte_remaining = \ord($remaining[$byte_prefix_len]); + + // Determine how many bytes the current UTF-8 character occupies + // (1-byte for ASCII, 2/3/4 bytes for multibyte UTF-8 chars) + $byte_len_key = ($byte_key < 0x80) ? 1 : (($byte_key >> 5) === 0x6 ? 2 : (($byte_key >> 4) === 0xE ? 3 : 4)); + $byte_len_remaining = ($byte_remaining < 0x80) ? 1 : (($byte_remaining >> 5) === 0x6 ? 2 : (($byte_remaining >> 4) === 0xE ? 3 : 4)); + + // Safely compares multibyte characters to avoid splitting UTF-8 sequences. + if ($byte_len_key !== $byte_len_remaining || substr($key, $byte_prefix_len, $byte_len_key) !== substr($remaining, $byte_prefix_len, $byte_len_key)) { + break; + } + + // Advance by the full length of the matching UTF-8 character + $byte_prefix_len += $byte_len_key; + } + + if ($byte_prefix_len === 0) { + continue; + } - $node = &$trie; + // Splitting these strings based on manually calculated byte offsets is + // faster than calling multibyte. mb_substr() has to recalculate every time. + $prefix = substr($key, 0, $byte_prefix_len); + $key_remainder = substr($key, $byte_prefix_len); + $remaining_remainder = substr($remaining, $byte_prefix_len); + + // Split existing node if needed + if ($key_remainder !== '') { + unset($current_node[$key]); + $current_node[$prefix] = [$key_remainder => $subtree]; + $subtree = &$current_node[$prefix]; + } - foreach ($chars as $char) { - if (!isset($node[$char])) { - $node[$char] = []; + // Add remainder of current string + if ($remaining_remainder === '') { + $subtree[''] = ''; + } else { + $current_node = &$subtree; + $remaining = $remaining_remainder; + $matched = true; + break; // continue inner loop with updated current_node + } + + $matched = true; + break; } - $node = &$node[$char]; + if (!$matched) { + // No matching branch: insert new + $current_node[$remaining] = ['' => '']; + break; // done with this string + } } - $node[''] = ''; + unset($current_node); // break reference } +//~ var_dump($trie ); + // This recursive closure turns the trie into a regular expression. $trie_to_regex = function (array &$trie, ?string $delim = null) use (&$trie_to_regex, $encoding) { static $depth = 0; $depth++; - // Absolute max length for a regex is 32768, but we might need wiggle room $max_length = 30000; - $regex = []; $length = 0; @@ -1107,9 +1172,8 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r } else { $sub_regex = $trie_to_regex($value, $delim); - if (\count(array_keys($value)) == 1) { - $new_key_array = explode('(?' . '>', $sub_regex); - $new_key .= $new_key_array[0]; + if (count($value) == 1) { + $new_key .= strtok($sub_regex, '(?>'); } else { $sub_regex = '(?' . '>' . $sub_regex . ')'; }