diff --git a/conf/bleedingEdge.neon b/conf/bleedingEdge.neon index 8fb6e4da6a..6a749d4a63 100644 --- a/conf/bleedingEdge.neon +++ b/conf/bleedingEdge.neon @@ -63,5 +63,6 @@ parameters: explicitThrow: true absentTypeChecks: true requireFileExists: true + reportUnparsedRegexpWhenCompilable: true stubFiles: - ../stubs/bleedingEdge/Rule.stub diff --git a/conf/config.level0.neon b/conf/config.level0.neon index 1382d99ee1..d9bee73b78 100644 --- a/conf/config.level0.neon +++ b/conf/config.level0.neon @@ -294,6 +294,8 @@ services: - class: PHPStan\Rules\Regexp\RegularExpressionPatternRule + arguments: + reportUnparsedRegexpWhenCompilable: %featureToggles.reportUnparsedRegexpWhenCompilable% tags: - phpstan.rules.rule diff --git a/conf/config.neon b/conf/config.neon index 67351dc4da..c9c7aef752 100644 --- a/conf/config.neon +++ b/conf/config.neon @@ -95,6 +95,7 @@ parameters: validatePregQuote: false noImplicitWildcard: false requireFileExists: false + reportUnparsedRegexpWhenCompilable: false narrowPregMatches: true tooWidePropertyType: false explicitThrow: false diff --git a/conf/parametersSchema.neon b/conf/parametersSchema.neon index d3359c6867..6a3b644cfb 100644 --- a/conf/parametersSchema.neon +++ b/conf/parametersSchema.neon @@ -94,6 +94,7 @@ parametersSchema: explicitThrow: bool() absentTypeChecks: bool() requireFileExists: bool() + reportUnparsedRegexpWhenCompilable: bool() ]) fileExtensions: listOf(string()) checkAdvancedIsset: bool() diff --git a/resources/RegexGrammar.pp b/resources/RegexGrammar.pp index b8bea027d3..ba174feb29 100644 --- a/resources/RegexGrammar.pp +++ b/resources/RegexGrammar.pp @@ -42,14 +42,16 @@ // // Character classes. +// tokens suffixed with "fc_" are the same as without such suffix but followed by "class:_class" +%token negative_class_fc_ \[\^(?=\]) -> class_fc +%token class_fc_ \[(?=\]) -> class_fc +%token class_fc:_class \] -> class %token negative_class_ \[\^ -> class %token class_ \[ -> class %token class:posix_class \[:\^?[a-z]+:\] %token class:class_ \[ -%token class:_class_literal (?<=[^\\]\[|[^\\]\[\^)\] %token class:_class \] -> default %token class:range \- -%token class:escaped_end_class \\\] // taken over from literals but class:character has \b support on top (backspace in character classes) %token class:character \\([aefnrtb]|c[\x00-\x7f]) %token class:dynamic_character \\([0-7]{3}|x[0-9a-zA-Z]{2}|x{[0-9a-zA-Z]+}) @@ -58,7 +60,8 @@ // Internal options. // See https://www.regular-expressions.info/refmodifiers.html -%token internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx)\) +// and https://www.php.net/manual/en/regexp.reference.internal-options.php +%token internal_option \(\?[imsxnJUX^]*-?[imsxnJUX^]+\) // Lookahead and lookbehind assertions. %token lookahead_ \(\?= @@ -88,7 +91,7 @@ %token nc:_named_capturing > -> default %token nc:capturing_name .+?(?=(?) %token non_capturing_ \(\?: -%token non_capturing_internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx): +%token non_capturing_internal_option \(\?[imsxnJUX^]*-?[imsxnJUX^]+: %token non_capturing_reset_ \(\?\| %token atomic_group_ \(\?> %token capturing_ \( @@ -177,10 +180,14 @@ #class: ( - ::negative_class_:: #negativeclass + ::negative_class_fc_:: #negativeclass + <_class> + | ::class_fc_:: + <_class> + | ::negative_class_:: #negativeclass | ::class_:: ) - ( | <_class_literal> )? ( | | range() | literal() | )* ? + ? ( | | range() ? | literal() )* ? ::_class:: #range: diff --git a/src/Rules/Regexp/RegularExpressionPatternRule.php b/src/Rules/Regexp/RegularExpressionPatternRule.php index e15bdeb13b..2c6bf5ab00 100644 --- a/src/Rules/Regexp/RegularExpressionPatternRule.php +++ b/src/Rules/Regexp/RegularExpressionPatternRule.php @@ -2,6 +2,10 @@ namespace PHPStan\Rules\Regexp; +use Hoa\Compiler\Llk\Llk; +use Hoa\Compiler\Llk\Parser; +use Hoa\Exception\Exception as HoaException; +use Hoa\File\Read; use Nette\Utils\RegexpException; use Nette\Utils\Strings; use PhpParser\Node; @@ -21,8 +25,11 @@ final class RegularExpressionPatternRule implements Rule { + private static ?Parser $parser = null; + public function __construct( private RegexExpressionHelper $regexExpressionHelper, + private bool $reportUnparsedRegexpWhenCompilable, ) { } @@ -38,12 +45,24 @@ public function processNode(Node $node, Scope $scope): array $errors = []; foreach ($patterns as $pattern) { - $errorMessage = $this->validatePattern($pattern); - if ($errorMessage === null) { + $errorMessage = $this->validatePatternByPcre($pattern); + + if ($errorMessage !== null) { + $errors[] = RuleErrorBuilder::message(sprintf('Regex pattern is invalid: %s', $errorMessage))->identifier('regexp.pattern')->build(); + continue; } - $errors[] = RuleErrorBuilder::message(sprintf('Regex pattern is invalid: %s', $errorMessage))->identifier('regexp.pattern')->build(); + $errorMessage = $this->validatePatternByParserGrammar($pattern); + + if ($errorMessage === null || !$this->reportUnparsedRegexpWhenCompilable) { + continue; + } + + $errors[] = RuleErrorBuilder::message(sprintf('Regex pattern cannot be parsed by PHPStan: %s', $errorMessage)) + ->identifier('regexp.pattern') + ->tip(sprintf('Please identify minimal reproducible pattern and report it at %s', 'https://github.com/phpstan/phpstan/issues')) + ->build(); } return $errors; @@ -118,7 +137,7 @@ private function extractPatterns(FuncCall $functionCall, Scope $scope): array return $patternStrings; } - private function validatePattern(string $pattern): ?string + private function validatePatternByPcre(string $pattern): ?string { try { Strings::match('', $pattern); @@ -129,4 +148,21 @@ private function validatePattern(string $pattern): ?string return null; } + private function validatePatternByParserGrammar(string $pattern): ?string + { + if (self::$parser === null) { + /** @throws void */ + self::$parser = Llk::load(new Read(__DIR__ . '/../../../resources/RegexGrammar.pp')); + } + + $rawRegex = $this->regexExpressionHelper->removeDelimitersAndModifiers($pattern); + try { + self::$parser->parse($rawRegex); + } catch (HoaException $e) { + return $e->getMessage(); + } + + return null; + } + } diff --git a/src/Type/Regex/RegexGroupParser.php b/src/Type/Regex/RegexGroupParser.php index 9780b2c69a..ec944bb630 100644 --- a/src/Type/Regex/RegexGroupParser.php +++ b/src/Type/Regex/RegexGroupParser.php @@ -525,11 +525,11 @@ private function getLiteralValue(TreeNode $node, ?array &$onlyLiterals, bool $ap if ( in_array($token, [ - 'literal', 'escaped_end_class', + 'literal', // literal "-" in front/back of a character class like '[-a-z]' or '[abc-]', not forming a range 'range', // literal "[" or "]" inside character classes '[[]' or '[]]' - 'class_', '_class_literal', + 'class_', '_class', ], true) ) { if (str_contains($patternModifiers, 'x') && trim($value) === '') { @@ -544,7 +544,6 @@ private function getLiteralValue(TreeNode $node, ?array &$onlyLiterals, bool $ap if ( $appendLiterals - && in_array($token, ['literal', 'range', 'class_', '_class_literal'], true) && $onlyLiterals !== null && (!in_array($value, ['.'], true) || $isEscaped || $inCharacterClass) ) { diff --git a/tests/PHPStan/Analyser/AnalyserIntegrationTest.php b/tests/PHPStan/Analyser/AnalyserIntegrationTest.php index b237cdd33a..1ffb0e98df 100644 --- a/tests/PHPStan/Analyser/AnalyserIntegrationTest.php +++ b/tests/PHPStan/Analyser/AnalyserIntegrationTest.php @@ -14,6 +14,7 @@ use function extension_loaded; use function restore_error_handler; use function sprintf; +use function str_replace; use const PHP_VERSION_ID; class AnalyserIntegrationTest extends PHPStanTestCase @@ -1408,7 +1409,10 @@ public function testBug11283(): void public function testBug11292(): void { $errors = $this->runAnalyse(__DIR__ . '/data/bug-11292.php'); - $this->assertNoErrors($errors); + $this->assertCount(1, $errors); + $this->assertSame(str_replace('\x7f-\xff|$', "\x7f-\xff]|$", 'Regex pattern cannot be parsed by PHPStan: Unrecognized token "\\" at line 1 and column 1: +\\b(https?://)?(?:([^]\\\\\\x00-\\x20\\"(),:-<>[\\x7f-\\xff]{1,64})(:[^]\\\\\\x00-\\x20\\"(),:-<>[\\x7f-\\xff]{1,64})?@)?((?:[-a-zA-Z0-9\\x7f-\\xff]{1,63}\\.)+[a-zA-Z\\x7f-\\xff][-a-zA-Z0-9\\x7f-\\xff]{1,62})((:[0-9]{1,5})?(/[!$-/0-9:;=@_~\':;!a-zA-Z\\x7f-\\xff]*?)?(\\?[!$-/0-9:;=@_\':;!a-zA-Z\\x7f-\\xff]+?)?(#[!$-/0-9?:;=@_\':;!a-zA-Z\\x7f-\\xff]+?)?)(?=[)\'?.!,;:]*([^-_#$+.!*%\'(),;/?:@~=&a-zA-Z0-9\\x7f-\\xff|$)) +↑'), $errors[0]->getMessage()); } public function testBug11297(): void diff --git a/tests/PHPStan/Analyser/nsrt/preg_match_shapes.php b/tests/PHPStan/Analyser/nsrt/preg_match_shapes.php index f92af453fb..0a6b883e4c 100644 --- a/tests/PHPStan/Analyser/nsrt/preg_match_shapes.php +++ b/tests/PHPStan/Analyser/nsrt/preg_match_shapes.php @@ -467,6 +467,9 @@ function bug11323(string $s): void { if (preg_match('{([-\p{L}[\]*|\x03\a\b+?{}(?:)-]+[^[:digit:]?{}a-z0-9#-k]+)(a-z)}', $s, $matches)) { assertType("array{string, non-falsy-string, 'a-z'}", $matches); } + if (preg_match('{(\d+)(?i)insensitive((?xs-i)case SENSITIVE here.+and dot matches new lines)}', $s, $matches)) { + assertType('array{string, numeric-string, non-falsy-string}', $matches); + } if (preg_match('{(\d+)(?i)insensitive((?x-i)case SENSITIVE here(?i:insensitive non-capturing group))}', $s, $matches)) { assertType('array{string, numeric-string, non-falsy-string}', $matches); } @@ -778,3 +781,121 @@ function testLtrimDelimiter (string $string): void { assertType("array{string, 'x'}", $matches); } } + +function testUnescapeBackslash (string $string): void { + if (preg_match(<<<'EOD' + ~(\[)~ + EOD, $string, $matches)) { + assertType("array{string, '['}", $matches); + } + + if (preg_match(<<<'EOD' + ~(\d)~ + EOD, $string, $matches)) { + assertType("array{string, numeric-string}", $matches); + } + + if (preg_match(<<<'EOD' + ~(\\d)~ + EOD, $string, $matches)) { + assertType("array{string, '\\\d'}", $matches); + } + + if (preg_match(<<<'EOD' + ~(\\\d)~ + EOD, $string, $matches)) { + assertType("array{string, non-falsy-string}", $matches); + } + + if (preg_match(<<<'EOD' + ~(\\\\d)~ + EOD, $string, $matches)) { + assertType("array{string, '\\\\\\\d'}", $matches); + } +} + +function testEscapedDelimiter (string $string): void { + if (preg_match(<<<'EOD' + /(\/)/ + EOD, $string, $matches)) { + assertType("array{string, '/'}", $matches); + } + + if (preg_match(<<<'EOD' + ~(\~)~ + EOD, $string, $matches)) { + assertType("array{string, '~'}", $matches); + } + + if (preg_match(<<<'EOD' + ~(\[2])~ + EOD, $string, $matches)) { + assertType("array{string, '[2]'}", $matches); + } + + if (preg_match(<<<'EOD' + [(\[2\])] + EOD, $string, $matches)) { + assertType("array{string, '[2]'}", $matches); + } + + if (preg_match(<<<'EOD' + ~(\{2})~ + EOD, $string, $matches)) { + assertType("array{string, '{2}'}", $matches); + } + + if (preg_match(<<<'EOD' + {(\{2\})} + EOD, $string, $matches)) { + assertType("array{string, '{2}'}", $matches); + } + + if (preg_match(<<<'EOD' + ~([a\]])~ + EOD, $string, $matches)) { + assertType("array{string, ']'|'a'}", $matches); + } + + if (preg_match(<<<'EOD' + ~([a[])~ + EOD, $string, $matches)) { + assertType("array{string, '['|'a'}", $matches); + } + + if (preg_match(<<<'EOD' + ~([a\]b])~ + EOD, $string, $matches)) { + assertType("array{string, ']'|'a'|'b'}", $matches); + } + + if (preg_match(<<<'EOD' + ~([a[b])~ + EOD, $string, $matches)) { + assertType("array{string, '['|'a'|'b'}", $matches); + } + + if (preg_match(<<<'EOD' + ~([a\[b])~ + EOD, $string, $matches)) { + assertType("array{string, '['|'a'|'b'}", $matches); + } + + if (preg_match(<<<'EOD' + [([a\[b])] + EOD, $string, $matches)) { + assertType("array{string, '['|'a'|'b'}", $matches); + } + + if (preg_match(<<<'EOD' + {(x\\\{)|(y\\\\\})} + EOD, $string, $matches)) { + assertType("array{string, '', 'y\\\\\\\}'}|array{string, 'x\\\{'}", $matches); + } +} + +function bugUnescapedDashAfterRange (string $string): void { + if (preg_match('/([0-1-y])/', $string, $matches)) { + assertType("array{string, non-empty-string}", $matches); + } +} diff --git a/tests/PHPStan/Rules/Regexp/RegularExpressionPatternRuleTest.php b/tests/PHPStan/Rules/Regexp/RegularExpressionPatternRuleTest.php index b376382f18..60903dcaf4 100644 --- a/tests/PHPStan/Rules/Regexp/RegularExpressionPatternRuleTest.php +++ b/tests/PHPStan/Rules/Regexp/RegularExpressionPatternRuleTest.php @@ -18,6 +18,7 @@ protected function getRule(): Rule { return new RegularExpressionPatternRule( self::getContainer()->getByType(RegexExpressionHelper::class), + true, ); }