33
44echo "Generating comprehensive test strings from regex patterns... \n" ;
55
6+ /** @var array<string, array<string, string|string[]>>|false */
67$ Rulesets = parse_ini_file ( __DIR__ . '/../rules.ini ' , true , INI_SCANNER_RAW );
78
9+ if ( !is_array ( $ Rulesets ) )
10+ {
11+ echo "Failed to parse rules.ini file \n" ;
12+ exit ( 1 );
13+ }
14+
815foreach ( $ Rulesets as $ Type => $ Rules )
916{
1017 foreach ( $ Rules as $ Name => $ RuleRegexes )
2027 if ( file_exists ( $ File ) )
2128 {
2229 $ Tests = file ( $ File , FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES );
30+
31+ if ( $ Tests === false )
32+ {
33+ $ Tests = [];
34+ }
2335 }
2436
2537 $ Output = [];
2638 $ Added = false ;
2739
28- // Skip generating certain regexes
2940 foreach ( $ RuleRegexes as $ Regex )
3041 {
3142 $ Generated = generateVariations ( $ Regex );
/**
 * Native PHP regex pattern generator
 * Generates ALL possible variations from regex patterns with smart bounds for infinite cases
 * Handles anchors, alternation, quantifiers, character classes, groups, and escapes
 *
 * @param string $regex Pattern source; it is tokenised by parseRegex() and the
 *                      tokens are expanded by generateFromParsedPattern().
 * @return string[]
 */
function generateVariations( string $regex ) : array
{
	// Parse the regex pattern directly.
	// parseRegex() is declared to return an array, so there is no null
	// result to guard against before expanding the tokens.
	$parsedPattern = parseRegex( $regex );

	return generateFromParsedPattern( $parsedPattern );
}
8390
84- function parseRegex ( string $ pattern ) : ?array
91+ /**
92+ * @return array<array<string,mixed>>
93+ */
94+ function parseRegex ( string $ pattern ) : array
8595{
8696 $ tokens = [];
8797 $ i = 0 ;
@@ -143,7 +153,7 @@ function parseRegex( string $pattern ) : ?array
143153 {
144154 // Non-capturing group
145155 $ groupEnd = findMatchingParen ( $ pattern , $ i );
146- if ( $ groupEnd !== false )
156+ if ( $ groupEnd !== null )
147157 {
148158 $ groupContent = substr ( $ pattern , $ i + 3 , $ groupEnd - $ i - 3 );
149159 $ tokens [] = [ 'type ' => 'group ' , 'capturing ' => false , 'content ' => parseRegex ( $ groupContent ) ];
@@ -159,7 +169,7 @@ function parseRegex( string $pattern ) : ?array
159169 {
160170 // Capturing group
161171 $ groupEnd = findMatchingParen ( $ pattern , $ i );
162- if ( $ groupEnd !== false )
172+ if ( $ groupEnd !== null )
163173 {
164174 $ groupContent = substr ( $ pattern , $ i + 1 , $ groupEnd - $ i - 1 );
165175 $ tokens [] = [ 'type ' => 'group ' , 'capturing ' => true , 'content ' => parseRegex ( $ groupContent ) ];
@@ -199,10 +209,10 @@ function parseRegex( string $pattern ) : ?array
199209 if ( $ endPos !== false )
200210 {
201211 $ quantifier = substr ( $ pattern , $ i + 1 , $ endPos - $ i - 1 );
202- if ( preg_match ( '/^(\d+)(?:,(\d+)?)?$/ ' , $ quantifier , $ matches ) )
212+ if ( preg_match ( '/^(\d+)(?:,(\d+)?)?$/ ' , $ quantifier , $ matches ) === 1 )
203213 {
204214 $ min = (int )$ matches [1 ];
205- $ max = isset ( $ matches [2 ] ) && $ matches [ 2 ] !== '' ? (int )$ matches [2 ] : ( isset ( $ matches [2 ] ) ? null : $ min );
215+ $ max = ! empty ( $ matches [2 ] ) ? (int )$ matches [2 ] : ( isset ( $ matches [2 ] ) ? null : $ min );
206216 $ tokens [] = [ 'type ' => 'quantifier ' , 'min ' => $ min , 'max ' => $ max ];
207217 $ i = $ endPos + 1 ;
208218 }
@@ -256,6 +266,10 @@ function findMatchingParen( string $pattern, int $start ) : ?int
256266 return $ depth === 0 ? $ i - 1 : null ;
257267}
258268
269+ /**
270+ * @param array<array<string,mixed>> $tokens
271+ * @return string[]
272+ */
259273function generateFromParsedPattern ( array $ tokens , bool $ isSubPattern = false ) : array
260274{
261275 $ hasStartAnchor = detectStartAnchor ( $ tokens );
@@ -288,6 +302,8 @@ function generateFromParsedPattern( array $tokens, bool $isSubPattern = false )
288302 $ min = $ quantifier ['min ' ];
289303 $ max = $ quantifier ['max ' ];
290304
305+ assert ( is_int ( $ min ) );
306+
291307 // Bound infinite quantifiers
292308 if ( $ max === null )
293309 {
@@ -376,30 +392,43 @@ function generateFromParsedPattern( array $tokens, bool $isSubPattern = false )
376392 return array_unique ( $ results );
377393}
378394
/**
 * Expands one parsed token into the list of strings it can produce.
 *
 * Literal tokens yield their own value, 'any' yields a fixed set of sample
 * characters, and escape / charclass / group tokens are delegated to their
 * dedicated generators. Any other (or missing) type yields a single empty string.
 *
 * @param array<string,mixed> $token
 * @return string[]
 */
function generateFromToken( array $token ) : array
{
	// Read the type defensively: a token without a 'type' key ends up in the
	// default branch instead of raising an undefined-array-key warning.
	$type = $token['type'] ?? null;

	switch( $type )
	{
		case 'literal':
			assert( is_string( $token['value'] ) );
			return [ $token['value'] ];

		case 'escape':
			assert( is_string( $token['value'] ) );
			return generateFromEscape( $token['value'] );

		case 'any':
			return [ 'a', 'Z', '1', '_', '-' ]; // Sample representative chars

		case 'charclass':
			assert( is_string( $token['value'] ) );
			return processCharacterClass( $token['value'] );

		case 'group':
			assert( is_array( $token['content'] ) );
			/** @var array<array<string,mixed>> $content */
			$content = $token['content'];
			return generateFromGroupContent( $content );

		default:
			return [ '' ];
	}
}
402428
429+ /**
430+ * @return string[]
431+ */
403432function generateFromEscape ( string $ char ) : array
404433{
405434 switch ( $ char )
@@ -432,23 +461,38 @@ function generateFromEscape( string $char ) : array
432461 }
433462}
434463
/**
 * Returns a short list of sample characters for a token.
 *
 * Tokens without a type, and types not listed below, yield [ 'a' ].
 * Character-class samples come from processCharacterClass(), truncated to
 * the first three entries.
 *
 * @param array<string,mixed> $token
 * @return string[]
 */
function getSampleCharsForToken( array $token ) : array
{
	if( !isset( $token['type'] ) )
	{
		return [ 'a' ];
	}

	switch( $token['type'] )
	{
		case 'any':
			return [ 'a', 'Z', '1' ];

		case 'escape':
			assert( is_string( $token['value'] ) );

			// \d and \w get representative members; every other escape is
			// returned as the escaped character itself.
			if( $token['value'] === 'd' )
			{
				return [ '0', '1', '9' ];
			}

			if( $token['value'] === 'w' )
			{
				return [ 'a', 'B', '3' ];
			}

			return [ $token['value'] ];

		case 'charclass':
			assert( is_string( $token['value'] ) );
			return array_slice( processCharacterClass( $token['value'] ), 0, 3 );

		default:
			return [ 'a' ];
	}
}
451491
492+ /**
493+ * @param array<array<string,mixed>> $tokens
494+ * @return string[]
495+ */
452496function generateFromGroupContent ( array $ tokens ) : array
453497{
454498 if ( empty ( $ tokens ) )
@@ -497,6 +541,9 @@ function generateFromGroupContent( array $tokens ) : array
497541 return array_unique ( $ allResults );
498542}
499543
544+ /**
545+ * @return string[]
546+ */
500547function processCharacterClass ( string $ charClass ) : array
501548{
502549 // Handle negated classes
@@ -522,7 +569,7 @@ function processCharacterClass( string $charClass ) : array
522569 }
523570
524571 // Handle ranges like a-z, 0-9
525- if ( preg_match_all ( '/(\w)-(\w)/ ' , $ charClass , $ matches , PREG_SET_ORDER ) )
572+ if ( preg_match_all ( '/(\w)-(\w)/ ' , $ charClass , $ matches , PREG_SET_ORDER ) > 0 )
526573 {
527574 foreach ( $ matches as $ match )
528575 {
@@ -567,16 +614,25 @@ function processCharacterClass( string $charClass ) : array
567614 return array_slice ( $ chars , 0 , 6 ); // Limit to reasonable number
568615}
569616
/**
 * Reports whether the given tokens contain a start anchor inside an alternation.
 *
 * Thin wrapper: delegates to hasAnchorInAlternation() with anchor type
 * 'start' and $checkFirst = true, i.e. each alternative is checked for
 * beginning with the anchor.
 *
 * @param array<array<string,mixed>> $tokens
 */
function hasStartAnchorInAlternation( array $tokens ) : bool
{
	return hasAnchorInAlternation( $tokens, 'start', true );
}
574624
/**
 * Reports whether the given tokens contain an end anchor inside an alternation.
 *
 * Thin wrapper: delegates to hasAnchorInAlternation() with anchor type
 * 'end' and $checkFirst = false (presumably so the tail of each
 * alternative is inspected rather than its head; confirm against
 * hasAnchorInAlternation()).
 *
 * @param array<array<string,mixed>> $tokens
 */
function hasEndAnchorInAlternation( array $tokens ) : bool
{
	return hasAnchorInAlternation( $tokens, 'end', false );
}
579632
633+ /**
634+ * @param array<array<string,mixed>> $tokens
635+ */
580636function hasAnchorInAlternation ( array $ tokens , string $ anchorType , bool $ checkFirst ) : bool
581637{
582638 // Split tokens into alternatives
@@ -607,8 +663,6 @@ function hasAnchorInAlternation( array $tokens, string $anchorType, bool $checkF
607663 // Check each alternative for the anchor
608664 foreach ( $ alternatives as $ alternative )
609665 {
610- if ( empty ( $ alternative ) ) continue ;
611-
612666 if ( $ checkFirst )
613667 {
614668 // Check if alternative starts with the anchor
@@ -640,6 +694,9 @@ function hasAnchorInAlternation( array $tokens, string $anchorType, bool $checkF
640694 return false ;
641695}
642696
697+ /**
698+ * @param array<array<string,mixed>> $tokens
699+ */
643700function detectStartAnchor ( array $ tokens ) : bool
644701{
645702 if ( empty ( $ tokens ) ) return false ;
@@ -653,12 +710,18 @@ function detectStartAnchor( array $tokens ) : bool
653710 // Start anchor in first group (like (?:^|/))
654711 if ( $ tokens [0 ]['type ' ] === 'group ' )
655712 {
656- return hasStartAnchorInAlternation ( $ tokens [0 ]['content ' ] );
713+ assert ( is_array ( $ tokens [0 ]['content ' ] ) );
714+ /** @var array<array<string,mixed>> $content */
715+ $ content = $ tokens [0 ]['content ' ];
716+ return hasStartAnchorInAlternation ( $ content );
657717 }
658718
659719 return false ;
660720}
661721
722+ /**
723+ * @param array<array<string,mixed>> $tokens
724+ */
662725function detectEndAnchor ( array $ tokens ) : bool
663726{
664727 if ( empty ( $ tokens ) ) return false ;
@@ -674,7 +737,10 @@ function detectEndAnchor( array $tokens ) : bool
674737 // End anchor in last group (like (?:$|/))
675738 if ( $ tokens [$ lastIndex ]['type ' ] === 'group ' )
676739 {
677- return hasEndAnchorInAlternation ( $ tokens [$ lastIndex ]['content ' ] );
740+ assert ( is_array ( $ tokens [$ lastIndex ]['content ' ] ) );
741+ /** @var array<array<string,mixed>> $content */
742+ $ content = $ tokens [$ lastIndex ]['content ' ];
743+ return hasEndAnchorInAlternation ( $ content );
678744 }
679745
680746 return false ;
0 commit comments