Skip to content

Commit b4585a9

Browse files
committed
Improve space efficiency for regex building by using a radix tree
1 parent 2ed6754 commit b4585a9

File tree

1 file changed

+65
-18
lines changed

1 file changed

+65
-18
lines changed

Sources/Utils.php

Lines changed: 65 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1031,7 +1031,7 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r
10311031
$normalized_strings = [];
10321032

10331033
foreach ($strings as $str) {
1034-
if (\is_scalar($str)) {
1034+
if (is_scalar($str)) {
10351035
$s = (string) $str;
10361036
$normalized_strings[$s] = mb_strlen($s, $encoding);
10371037
}
@@ -1053,48 +1053,96 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r
10531053

10541054
// Can we trim common characters from the end?
10551055
$trailing = '';
1056+
$i = -1;
10561057
unset($normalized_strings);
10571058

1058-
while (mb_strlen($strings[0], $encoding) > 1) {
1059-
$last_char = mb_substr($strings[0], -1, 1, $encoding);
1059+
while (\strlen($strings[0]) > 1) {
1060+
$last_char = mb_substr($strings[0], $i, null, $encoding);
10601061

10611062
foreach ($strings as $string) {
10621063
if (!str_ends_with($string, $last_char)) {
10631064
break 2;
10641065
}
10651066
}
10661067

1067-
$strings = array_map(fn($string) => mb_substr($string, 0, -1, $encoding), $strings);
1068-
$trailing = $last_char . $trailing;
1068+
$i--;
1069+
$trailing = $last_char;
10691070
}
10701071

1071-
// Create the trie from the strings.
1072+
if ($trailing !== '') {
1073+
$strings = array_map(fn($string) => substr($string, 0, -\strlen($trailing)), $strings);
1074+
}
1075+
1076+
// Build a radix tree from the input strings
10721077
$trie = [];
10731078

10741079
foreach ($strings as $string) {
1075-
$chars = mb_str_split($string, 1, $encoding);
1080+
$current_node = &$trie;
1081+
1082+
$remaining = $string;
1083+
1084+
while ($remaining !== '') {
1085+
$matched = false;
1086+
1087+
// Check each existing branch at this level
1088+
foreach ($current_node as $key => &$subtree) {
1089+
// Find longest common prefix
1090+
$len = 0;
1091+
$max = min(mb_strlen($key, $encoding), mb_strlen($remaining, $encoding));
1092+
while (
1093+
$len < $max &&
1094+
mb_substr($key, $len, 1, $encoding) === mb_substr($remaining, $len, 1, $encoding)
1095+
) {
1096+
$len++;
1097+
}
1098+
1099+
if ($len === 0) {
1100+
continue; // no prefix match, try next branch
1101+
}
1102+
1103+
$prefix = mb_substr($key, 0, $len, $encoding);
1104+
$key_remainder = mb_substr($key, $len, null, $encoding);
1105+
$remaining_remainder = mb_substr($remaining, $len, null, $encoding);
1106+
1107+
// Split existing node if needed
1108+
if ($key_remainder !== '') {
1109+
unset($current_node[$key]);
1110+
$current_node[$prefix] = [$key_remainder => $subtree];
1111+
$subtree = &$current_node[$prefix];
1112+
}
10761113

1077-
$node = &$trie;
1114+
// Add remainder of current string
1115+
if ($remaining_remainder === '') {
1116+
$subtree[''] = '';
1117+
} else {
1118+
$current_node = &$subtree;
1119+
$remaining = $remaining_remainder;
1120+
$matched = true;
1121+
break; // continue inner loop with updated current_node
1122+
}
10781123

1079-
foreach ($chars as $char) {
1080-
if (!isset($node[$char])) {
1081-
$node[$char] = [];
1124+
$matched = true;
1125+
break;
10821126
}
10831127

1084-
$node = &$node[$char];
1128+
if (!$matched) {
1129+
// No matching branch: insert new
1130+
$current_node[$remaining] = ['' => ''];
1131+
break; // done with this string
1132+
}
10851133
}
10861134

1087-
$node[''] = '';
1135+
unset($current_node); // break reference
10881136
}
10891137

1138+
//~ var_dump($trie );
1139+
10901140
// This recursive closure turns the trie into a regular expression.
10911141
$trie_to_regex = function (array &$trie, ?string $delim = null) use (&$trie_to_regex, $encoding) {
10921142
static $depth = 0;
10931143
$depth++;
10941144

1095-
// Absolute max length for a regex is 32768, but we might need wiggle room
10961145
$max_length = 30000;
1097-
10981146
$regex = [];
10991147
$length = 0;
11001148

@@ -1107,9 +1155,8 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r
11071155
} else {
11081156
$sub_regex = $trie_to_regex($value, $delim);
11091157

1110-
if (\count(array_keys($value)) == 1) {
1111-
$new_key_array = explode('(?' . '>', $sub_regex);
1112-
$new_key .= $new_key_array[0];
1158+
if (count($value) == 1) {
1159+
$new_key .= strtok($sub_regex, '(?>');
11131160
} else {
11141161
$sub_regex = '(?' . '>' . $sub_regex . ')';
11151162
}

0 commit comments

Comments
 (0)