Skip to content

Commit dada62c

Browse files
Merge pull request #9066 from Sesquipedalian/3.0/buildregex_optimizations
[3.0] Performance optimizations in Utils::buildRegex()
2 parents e965933 + 0606b2d commit dada62c

File tree

1 file changed

+19
-22
lines changed

1 file changed

+19
-22
lines changed

Sources/Utils.php

Lines changed: 19 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1026,46 +1026,44 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r
10261026
{
10271027
static $regexes = [];
10281028

1029-
$encoding = mb_detect_encoding(implode(' ', $strings)) ?: mb_internal_encoding();
1030-
1031-
$normalized_strings = [];
1029+
// Only strings are allowed.
1030+
$strings = array_unique(array_map('strval', array_filter($strings, 'is_scalar')));
10321031

1033-
foreach ($strings as $str) {
1034-
if (\is_scalar($str)) {
1035-
$s = (string) $str;
1036-
$normalized_strings[$s] = mb_strlen($s, $encoding);
1037-
}
1038-
}
1039-
1040-
if (empty($normalized_strings)) {
1041-
return '';
1032+
// A regex to match nothing?
1033+
if ($strings === [] || $strings === ['']) {
1034+
return $return_array ? [''] : '';
10421035
}
10431036

1037+
// Don't repeat unnecessarily.
10441038
$regex_key = md5(json_encode([$strings, $delim, $return_array]));
10451039

10461040
if (isset($regexes[$regex_key])) {
10471041
return $regexes[$regex_key];
10481042
}
10491043

1044+
// Which character encoding is being used?
1045+
$encoding = mb_detect_encoding(implode(' ', $strings)) ?: mb_internal_encoding();
1046+
10501047
// Optimizing is faster when we sort by length.
1051-
asort($normalized_strings);
1052-
$strings = array_map('strval', array_keys($normalized_strings));
1048+
$strings = array_combine($strings, array_map(fn($s) => mb_strlen($s, $encoding), $strings));
1049+
asort($strings);
1050+
$strings = array_map('strval', array_keys($strings));
10531051

10541052
// Can we trim common characters from the end?
10551053
$trailing = '';
1056-
unset($normalized_strings);
1054+
$i = -1;
10571055

1058-
while (mb_strlen($strings[0], $encoding) > 1) {
1059-
$last_char = mb_substr($strings[0], -1, 1, $encoding);
1056+
while ($strings[0] !== '') {
1057+
$last_char = mb_substr($strings[0], $i, null, $encoding);
10601058

10611059
foreach ($strings as $string) {
10621060
if (!str_ends_with($string, $last_char)) {
10631061
break 2;
10641062
}
10651063
}
10661064

1067-
$strings = array_map(fn($string) => mb_substr($string, 0, -1, $encoding), $strings);
1068-
$trailing = $last_char . $trailing;
1065+
$i--;
1066+
$trailing = $last_char;
10691067
}
10701068

10711069
// Create the trie from the strings.
@@ -1107,9 +1105,8 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r
11071105
} else {
11081106
$sub_regex = $trie_to_regex($value, $delim);
11091107

1110-
if (\count(array_keys($value)) == 1) {
1111-
$new_key_array = explode('(?' . '>', $sub_regex);
1112-
$new_key .= $new_key_array[0];
1108+
if (\count($value) == 1) {
1109+
$new_key .= strtok($sub_regex, '(?' . '>');
11131110
} else {
11141111
$sub_regex = '(?' . '>' . $sub_regex . ')';
11151112
}

0 commit comments

Comments
 (0)