@@ -1031,7 +1031,7 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r
10311031 $ normalized_strings = [];
10321032
10331033 foreach ($ strings as $ str ) {
1034- if (\ is_scalar ($ str )) {
1034+ if (is_scalar ($ str )) {
10351035 $ s = (string ) $ str ;
10361036 $ normalized_strings [$ s ] = mb_strlen ($ s , $ encoding );
10371037 }
@@ -1053,48 +1053,96 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r
10531053
10541054 // Can we trim common characters from the end?
10551055 $ trailing = '' ;
1056+ $ i = -1 ;
10561057 unset($ normalized_strings );
10571058
1058- while (mb_strlen ($ strings [0 ], $ encoding ) > 1 ) {
1059- $ last_char = mb_substr ($ strings [0 ], - 1 , 1 , $ encoding );
1059+ while (\strlen ($ strings [0 ]) > 1 ) {
1060+ $ last_char = mb_substr ($ strings [0 ], $ i , null , $ encoding );
10601061
10611062 foreach ($ strings as $ string ) {
10621063 if (!str_ends_with ($ string , $ last_char )) {
10631064 break 2 ;
10641065 }
10651066 }
10661067
1067- $ strings = array_map ( fn ( $ string ) => mb_substr ( $ string , 0 , - 1 , $ encoding ), $ strings ) ;
1068- $ trailing = $ last_char . $ trailing ;
1068+ $ i -- ;
1069+ $ trailing = $ last_char ;
10691070 }
10701071
1071- // Create the trie from the strings.
1072+ if ($ trailing !== '' ) {
1073+ $ strings = array_map (fn ($ string ) => substr ($ string , 0 , -\strlen ($ trailing )), $ strings );
1074+ }
1075+
1076+ // Build a radix tree from the input strings
10721077 $ trie = [];
10731078
10741079 foreach ($ strings as $ string ) {
1075- $ chars = mb_str_split ($ string , 1 , $ encoding );
1080+ $ current_node = &$ trie ;
1081+
1082+ $ remaining = $ string ;
1083+
1084+ while ($ remaining !== '' ) {
1085+ $ matched = false ;
1086+
1087+ // Check each existing branch at this level
1088+ foreach ($ current_node as $ key => &$ subtree ) {
1089+ // Find longest common prefix
1090+ $ len = 0 ;
1091+ $ max = min (mb_strlen ($ key , $ encoding ), mb_strlen ($ remaining , $ encoding ));
1092+ while (
1093+ $ len < $ max &&
1094+ mb_substr ($ key , $ len , 1 , $ encoding ) === mb_substr ($ remaining , $ len , 1 , $ encoding )
1095+ ) {
1096+ $ len ++;
1097+ }
1098+
1099+ if ($ len === 0 ) {
1100+ continue ; // no prefix match, try next branch
1101+ }
1102+
1103+ $ prefix = mb_substr ($ key , 0 , $ len , $ encoding );
1104+ $ key_remainder = mb_substr ($ key , $ len , null , $ encoding );
1105+ $ remaining_remainder = mb_substr ($ remaining , $ len , null , $ encoding );
1106+
1107+ // Split existing node if needed
1108+ if ($ key_remainder !== '' ) {
1109+ unset($ current_node [$ key ]);
1110+ $ current_node [$ prefix ] = [$ key_remainder => $ subtree ];
1111+ $ subtree = &$ current_node [$ prefix ];
1112+ }
10761113
1077- $ node = &$ trie ;
1114+ // Add remainder of current string
1115+ if ($ remaining_remainder === '' ) {
1116+ $ subtree ['' ] = '' ;
1117+ } else {
1118+ $ current_node = &$ subtree ;
1119+ $ remaining = $ remaining_remainder ;
1120+ $ matched = true ;
1121+ break ; // continue inner loop with updated current_node
1122+ }
10781123
1079- foreach ($ chars as $ char ) {
1080- if (!isset ($ node [$ char ])) {
1081- $ node [$ char ] = [];
1124+ $ matched = true ;
1125+ break ;
10821126 }
10831127
1084- $ node = &$ node [$ char ];
1128+ if (!$ matched ) {
1129+ // No matching branch: insert new
1130+ $ current_node [$ remaining ] = ['' => '' ];
1131+ break ; // done with this string
1132+ }
10851133 }
10861134
1087- $ node [ '' ] = '' ;
1135+ unset( $ current_node ); // break reference
10881136 }
10891137
1138+ //~ var_dump($trie );
1139+
10901140 // This recursive closure turns the trie into a regular expression.
10911141 $ trie_to_regex = function (array &$ trie , ?string $ delim = null ) use (&$ trie_to_regex , $ encoding ) {
10921142 static $ depth = 0 ;
10931143 $ depth ++;
10941144
1095- // Absolute max length for a regex is 32768, but we might need wiggle room
10961145 $ max_length = 30000 ;
1097-
10981146 $ regex = [];
10991147 $ length = 0 ;
11001148
@@ -1107,9 +1155,8 @@ public static function buildRegex(array $strings, ?string $delim = null, bool $r
11071155 } else {
11081156 $ sub_regex = $ trie_to_regex ($ value , $ delim );
11091157
1110- if (\count (array_keys ($ value )) == 1 ) {
1111- $ new_key_array = explode ('(? ' . '> ' , $ sub_regex );
1112- $ new_key .= $ new_key_array [0 ];
1158+ if (count ($ value ) == 1 ) {
1159+ $ new_key .= strtok ($ sub_regex , '(?> ' );
11131160 } else {
11141161 $ sub_regex = '(? ' . '> ' . $ sub_regex . ') ' ;
11151162 }
0 commit comments