Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 81 additions & 17 deletions Sources/Utils.php
Original file line number Diff line number Diff line change
Expand Up @@ -1053,48 +1053,113 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r

// Can we trim common characters from the end?
$trailing = '';
$i = -1;
unset($normalized_strings);

while (mb_strlen($strings[0], $encoding) > 1) {
$last_char = mb_substr($strings[0], -1, 1, $encoding);
while (\strlen($strings[0]) > 1) {
$last_char = mb_substr($strings[0], $i, null, $encoding);

foreach ($strings as $string) {
if (!str_ends_with($string, $last_char)) {
break 2;
}
}

$strings = array_map(fn($string) => mb_substr($string, 0, -1, $encoding), $strings);
$trailing = $last_char . $trailing;
$i--;
$trailing = $last_char;
}

// Create the trie from the strings.
if ($trailing !== '') {
$strings = array_map(fn($string) => substr($string, 0, -\strlen($trailing)), $strings);
}

// Build a radix tree from the input strings
$trie = [];

foreach ($strings as $string) {
$chars = mb_str_split($string, 1, $encoding);
$current_node = &$trie;

$remaining = $string;

while ($remaining !== '') {
$matched = false;

// Check each existing branch at this level
foreach ($current_node as $key => &$subtree) {
// Find longest common prefix
$len_key = \strlen($key);
$len_remaining = \strlen($remaining);
$byte_prefix_len = 0;

// Determine the length (in bytes) of the common UTF-8 prefix between
// $key and $remaining.
while ($byte_prefix_len < $len_key && $byte_prefix_len < $len_remaining) {
$byte_key = \ord($key[$byte_prefix_len]);
$byte_remaining = \ord($remaining[$byte_prefix_len]);

// Determine how many bytes the current UTF-8 character occupies
// (1-byte for ASCII, 2/3/4 bytes for multibyte UTF-8 chars)
$byte_len_key = ($byte_key < 0x80) ? 1 : (($byte_key >> 5) === 0x6 ? 2 : (($byte_key >> 4) === 0xE ? 3 : 4));
$byte_len_remaining = ($byte_remaining < 0x80) ? 1 : (($byte_remaining >> 5) === 0x6 ? 2 : (($byte_remaining >> 4) === 0xE ? 3 : 4));

// Compare whole multibyte characters so that a UTF-8 sequence is never split.
if ($byte_len_key !== $byte_len_remaining || substr($key, $byte_prefix_len, $byte_len_key) !== substr($remaining, $byte_prefix_len, $byte_len_key)) {
break;
}

// Advance by the full length of the matching UTF-8 character
$byte_prefix_len += $byte_len_key;
}

if ($byte_prefix_len === 0) {
continue;
}

$node = &$trie;
// Splitting these strings at manually calculated byte offsets is faster than
// calling mb_substr(), which has to recalculate character offsets every time.
$prefix = substr($key, 0, $byte_prefix_len);
$key_remainder = substr($key, $byte_prefix_len);
$remaining_remainder = substr($remaining, $byte_prefix_len);

// Split existing node if needed
if ($key_remainder !== '') {
unset($current_node[$key]);
$current_node[$prefix] = [$key_remainder => $subtree];
$subtree = &$current_node[$prefix];
}

foreach ($chars as $char) {
if (!isset($node[$char])) {
$node[$char] = [];
// Add remainder of current string
if ($remaining_remainder === '') {
$subtree[''] = '';
} else {
$current_node = &$subtree;
$remaining = $remaining_remainder;
$matched = true;
break; // continue inner loop with updated current_node
}

$matched = true;
break;
}

$node = &$node[$char];
if (!$matched) {
// No matching branch: insert new
$current_node[$remaining] = ['' => ''];
break; // done with this string
}
}

$node[''] = '';
unset($current_node); // break reference
}

//~ var_dump($trie );

// This recursive closure turns the trie into a regular expression.
$trie_to_regex = function (array &$trie, ?string $delim = null) use (&$trie_to_regex, $encoding) {
static $depth = 0;
$depth++;

// Absolute max length for a regex is 32768, but we might need wiggle room
$max_length = 30000;

$regex = [];
$length = 0;

Expand All @@ -1107,9 +1172,8 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r
} else {
$sub_regex = $trie_to_regex($value, $delim);

if (\count(array_keys($value)) == 1) {
$new_key_array = explode('(?' . '>', $sub_regex);
$new_key .= $new_key_array[0];
if (count($value) == 1) {
$new_key .= strtok($sub_regex, '(?>');
} else {
$sub_regex = '(?' . '>' . $sub_regex . ')';
}
Expand Down