77use Hoa \Compiler \Llk \TreeNode ;
88use Hoa \Exception \Exception ;
99use Hoa \File \Read ;
10+ use Nette \Utils \RegexpException ;
1011use Nette \Utils \Strings ;
1112use PhpParser \Node \Expr ;
1213use PhpParser \Node \Name ;
3132use function in_array ;
3233use function is_int ;
3334use function is_string ;
35+ use function rtrim ;
3436use function sscanf ;
37+ use function str_replace ;
38+ use function strlen ;
39+ use function substr ;
3540use const PREG_OFFSET_CAPTURE ;
3641use const PREG_UNMATCHED_AS_NULL ;
3742
@@ -375,6 +380,13 @@ private function parseGroups(string $regex): ?array
375380 self ::$ parser = Llk::load (new Read (__DIR__ . '/../../../resources/RegexGrammar.pp ' ));
376381 }
377382
383+ try {
384+ Strings::match ('' , $ regex );
385+ } catch (RegexpException ) {
386+ // pattern is invalid, so let the RegularExpressionPatternRule report it
387+ return null ;
388+ }
389+
378390 try {
379391 $ ast = self ::$ parser ->parse ($ regex );
380392 } catch (Exception ) {
@@ -516,25 +528,37 @@ private function getQuantificationRange(TreeNode $node): array
516528 $ lastChild = $ node ->getChild ($ node ->getChildrenNumber () - 1 );
517529 $ value = $ lastChild ->getValue ();
518530
519- if ($ value ['token ' ] === 'n_to_m ' ) {
520- if (sscanf ($ value ['value ' ], '{%d,%d} ' , $ n , $ m ) !== 2 || !is_int ($ n ) || !is_int ($ m )) {
531+ // normalize away possessive and lazy quantifier-modifiers
532+ $ token = str_replace (['_possessive ' , '_lazy ' ], '' , $ value ['token ' ]);
533+ $ value = rtrim ($ value ['value ' ], '+? ' );
534+
535+ if ($ token === 'n_to_m ' ) {
536+ if (sscanf ($ value , '{%d,%d} ' , $ n , $ m ) !== 2 || !is_int ($ n ) || !is_int ($ m )) {
521537 throw new ShouldNotHappenException ();
522538 }
523539
524540 $ min = $ n ;
525541 $ max = $ m ;
526- } elseif ($ value ['token ' ] === 'exactly_n ' ) {
527- if (sscanf ($ value ['value ' ], '{%d} ' , $ n ) !== 1 || !is_int ($ n )) {
542+ } elseif ($ token === 'n_or_more ' ) {
543+ if (sscanf ($ value , '{%d,} ' , $ n ) !== 1 || !is_int ($ n )) {
544+ throw new ShouldNotHappenException ();
545+ }
546+
547+ $ min = $ n ;
548+ } elseif ($ token === 'exactly_n ' ) {
549+ if (sscanf ($ value , '{%d} ' , $ n ) !== 1 || !is_int ($ n )) {
528550 throw new ShouldNotHappenException ();
529551 }
530552
531553 $ min = $ n ;
532554 $ max = $ n ;
533- } elseif ($ value [ ' token ' ] === 'zero_or_one ' ) {
555+ } elseif ($ token === 'zero_or_one ' ) {
534556 $ min = 0 ;
535557 $ max = 1 ;
536- } elseif ($ value [ ' token ' ] === 'zero_or_more ' ) {
558+ } elseif ($ token === 'zero_or_more ' ) {
537559 $ min = 0 ;
560+ } elseif ($ token === 'one_or_more ' ) {
561+ $ min = 1 ;
538562 }
539563
540564 return [$ min , $ max ];
@@ -591,20 +615,8 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
591615 if ($ literalValue !== null ) {
592616 if (Strings::match ($ literalValue , '/^\d+$/ ' ) === null ) {
593617 $ isNumeric = TrinaryLogic::createNo ();
594- }
595-
596- if (!$ inOptionalQuantification ) {
597- $ isNonEmpty = TrinaryLogic::createYes ();
598- }
599- }
600-
601- if ($ ast ->getValueToken () === 'character_type ' ) {
602- if ($ ast ->getValueValue () === '\d ' ) {
603- if ($ isNumeric ->maybe ()) {
604- $ isNumeric = TrinaryLogic::createYes ();
605- }
606- } else {
607- $ isNumeric = TrinaryLogic::createNo ();
618+ } elseif ($ isNumeric ->maybe ()) {
619+ $ isNumeric = TrinaryLogic::createYes ();
608620 }
609621
610622 if (!$ inOptionalQuantification ) {
@@ -613,32 +625,11 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
613625 }
614626 }
615627
616- if ($ ast ->getId () === '#range ' || $ ast ->getId () === '#class ' ) {
617- if ($ isNumeric ->maybe ()) {
618- $ allNumeric = null ;
619- foreach ($ children as $ child ) {
620- $ literalValue = $ this ->getLiteralValue ($ child );
621-
622- if ($ literalValue === null ) {
623- break ;
624- }
625-
626- if (Strings::match ($ literalValue , '/^\d+$/ ' ) === null ) {
627- $ allNumeric = false ;
628- break ;
629- }
630-
631- $ allNumeric = true ;
632- }
633-
634- if ($ allNumeric === true ) {
635- $ isNumeric = TrinaryLogic::createYes ();
636- }
637- }
638-
639- if (!$ inOptionalQuantification ) {
640- $ isNonEmpty = TrinaryLogic::createYes ();
641- }
628+ // A negated class such as [^0-9] must not be inferred as numeric-string. A negated class that
629+ // excludes everything except digits is technically possible, but it is silly compared to just
630+ // writing \d, so we can safely assume the string is not numeric for negative classes.
631+ if ($ ast ->getId () === '#negativeclass ' ) {
632+ $ isNumeric = TrinaryLogic::createNo ();
642633 }
643634
644635 foreach ($ children as $ child ) {
@@ -653,13 +644,65 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
653644
654645 private function getLiteralValue (TreeNode $ node ): ?string
655646 {
656- if ($ node ->getId () === 'token ' && $ node ->getValueToken () === 'literal ' ) {
657- return $ node ->getValueValue ();
647+ if ($ node ->getId () !== 'token ' ) {
648+ return null ;
649+ }
650+
651+ // The token name comes from the grammar without its namespace, so both "literal" and "class:literal" are reported as "literal" here.
652+ $ token = $ node ->getValueToken ();
653+ $ value = $ node ->getValueValue ();
654+
655+ if (in_array ($ token , ['literal ' , 'escaped_end_class ' ], true )) {
656+ if (strlen ($ node ->getValueValue ()) > 1 && $ value [0 ] === '\\' ) {
657+ return substr ($ value , 1 );
658+ }
659+
660+ return $ value ;
661+ }
662+
663+ // literal "-" in front/back of a character class like '[-a-z]' or '[abc-]', not forming a range
664+ if ($ token === 'range ' ) {
665+ return $ value ;
666+ }
667+
668+ // literal "[" or "]" inside character classes '[[]' or '[]]'
669+ if (in_array ($ token , ['class_ ' , '_class_literal ' ], true )) {
670+ return $ value ;
671+ }
672+
673+ // character escape sequences, just return a fixed string
674+ if (in_array ($ token , ['character ' , 'dynamic_character ' , 'character_type ' ], true )) {
675+ if ($ token === 'character_type ' && $ value === '\d ' ) {
676+ return '0 ' ;
677+ }
678+
679+ return $ value ;
680+ }
681+
682+ // [:digit:] and the like, more support coming later
683+ if ($ token === 'posix_class ' ) {
684+ if ($ value === '[:digit:] ' ) {
685+ return '0 ' ;
686+ }
687+ if (in_array ($ value , ['[:alpha:] ' , '[:alnum:] ' , '[:upper:] ' , '[:lower:] ' , '[:word:] ' , '[:ascii:] ' , '[:print:] ' , '[:xdigit:] ' , '[:graph:] ' ], true )) {
688+ return 'a ' ;
689+ }
690+ if ($ value === '[:blank:] ' ) {
691+ return " \t" ;
692+ }
693+ if ($ value === '[:cntrl:] ' ) {
694+ return "\x00\x1F" ;
695+ }
696+ if ($ value === '[:space:] ' ) {
697+ return " \t\r\n\v\f" ;
698+ }
699+ if ($ value === '[:punct:] ' ) {
700+ return '!"#$%& \'()*+,\-./:;<=>?@[\]^_`{|}~ ' ;
701+ }
658702 }
659703
660- // literal "-" outside of a character class like '~^((\\d{1,6})-)$~'
661- if ($ node ->getId () === 'token ' && $ node ->getValueToken () === 'range ' ) {
662- return $ node ->getValueValue ();
704+ if ($ token === 'anchor ' || $ token === 'match_point_reset ' ) {
705+ return '' ;
663706 }
664707
665708 return null ;
0 commit comments