Skip to content

Commit 5263666

Browse files
authored
Merge branch refs/heads/1.11.x into 1.12.x
2 parents c8561d5 + fc75deb commit 5263666

File tree

5 files changed

+185
-73
lines changed

5 files changed

+185
-73
lines changed

resources/RegexGrammar.pp

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,23 @@
4646
%skip nl \n
4747

4848
// Character classes.
49-
%token negative_class_ \[\^
50-
%token class_ \[
51-
%token _class \]
52-
%token range \-
49+
%token negative_class_ \[\^ -> class
50+
%token class_ \[ -> class
51+
%token class:posix_class \[:\^?[a-z]+:\]
52+
%token class:class_ \[
53+
%token class:_class_literal (?<=[^\\]\[|[^\\]\[\^)\]
54+
%token class:_class \] -> default
55+
%token class:range \-
56+
%token class:escaped_end_class \\\]
57+
// Taken over from the literal tokens, but class:character additionally supports \b (backspace inside character classes).
58+
%token class:character \\([aefnrtb]|c[\x00-\x7f])
59+
%token class:dynamic_character \\([0-7]{3}|x[0-9a-zA-Z]{2}|x{[0-9a-zA-Z]+})
60+
%token class:character_type \\([CdDhHNRsSvVwWX]|[pP]{[^}]+})
61+
%token class:literal \\.|.
5362

5463
// Internal options.
55-
%token internal_option \(\?[\-+]?[imsx]\)
64+
// See https://www.regular-expressions.info/refmodifiers.html
65+
%token internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx)\)
5666

5767
// Lookahead and lookbehind assertions.
5868
%token lookahead_ \(\?=
@@ -77,6 +87,7 @@
7787
%token nc:_named_capturing > -> default
7888
%token nc:capturing_name .+?(?=(?<!\\)>)
7989
%token non_capturing_ \(\?:
90+
%token non_capturing_internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx):
8091
%token non_capturing_reset_ \(\?\|
8192
%token atomic_group_ \(\?>
8293
%token capturing_ \(
@@ -168,7 +179,7 @@
168179
::negative_class_:: #negativeclass
169180
| ::class_::
170181
)
171-
( <class_> | range() | literal() )+
182+
( <range> | <_class_literal> )? ( <posix_class> | <class_> | range() | literal() | <escaped_end_class> )* <range>?
172183
::_class::
173184

174185
#range:
@@ -183,15 +194,18 @@
183194
| (
184195
::named_capturing_:: <capturing_name> ::_named_capturing:: #namedcapturing
185196
| ::non_capturing_:: #noncapturing
197+
| non_capturing_internal_options() #noncapturing
186198
| ::non_capturing_reset_:: #noncapturingreset
187199
| ::atomic_group_:: #atomicgroup
188200
| ::capturing_::
189201
)
190202
alternation() ::_capturing::
191203

204+
non_capturing_internal_options:
205+
<non_capturing_internal_option>
206+
192207
literal:
193208
<character>
194-
| <range>
195209
| <dynamic_character>
196210
| <character_type>
197211
| <anchor>

src/Type/Php/RegexArrayShapeMatcher.php

Lines changed: 94 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
use Hoa\Compiler\Llk\TreeNode;
88
use Hoa\Exception\Exception;
99
use Hoa\File\Read;
10+
use Nette\Utils\RegexpException;
1011
use Nette\Utils\Strings;
1112
use PhpParser\Node\Expr;
1213
use PhpParser\Node\Name;
@@ -31,7 +32,11 @@
3132
use function in_array;
3233
use function is_int;
3334
use function is_string;
35+
use function rtrim;
3436
use function sscanf;
37+
use function str_replace;
38+
use function strlen;
39+
use function substr;
3540
use const PREG_OFFSET_CAPTURE;
3641
use const PREG_UNMATCHED_AS_NULL;
3742

@@ -375,6 +380,13 @@ private function parseGroups(string $regex): ?array
375380
self::$parser = Llk::load(new Read(__DIR__ . '/../../../resources/RegexGrammar.pp'));
376381
}
377382

383+
try {
384+
Strings::match('', $regex);
385+
} catch (RegexpException) {
386+
// pattern is invalid, so let the RegularExpressionPatternRule report it
387+
return null;
388+
}
389+
378390
try {
379391
$ast = self::$parser->parse($regex);
380392
} catch (Exception) {
@@ -516,25 +528,37 @@ private function getQuantificationRange(TreeNode $node): array
516528
$lastChild = $node->getChild($node->getChildrenNumber() - 1);
517529
$value = $lastChild->getValue();
518530

519-
if ($value['token'] === 'n_to_m') {
520-
if (sscanf($value['value'], '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) {
531+
// normalize away possessive and lazy quantifier-modifiers
532+
$token = str_replace(['_possessive', '_lazy'], '', $value['token']);
533+
$value = rtrim($value['value'], '+?');
534+
535+
if ($token === 'n_to_m') {
536+
if (sscanf($value, '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) {
521537
throw new ShouldNotHappenException();
522538
}
523539

524540
$min = $n;
525541
$max = $m;
526-
} elseif ($value['token'] === 'exactly_n') {
527-
if (sscanf($value['value'], '{%d}', $n) !== 1 || !is_int($n)) {
542+
} elseif ($token === 'n_or_more') {
543+
if (sscanf($value, '{%d,}', $n) !== 1 || !is_int($n)) {
544+
throw new ShouldNotHappenException();
545+
}
546+
547+
$min = $n;
548+
} elseif ($token === 'exactly_n') {
549+
if (sscanf($value, '{%d}', $n) !== 1 || !is_int($n)) {
528550
throw new ShouldNotHappenException();
529551
}
530552

531553
$min = $n;
532554
$max = $n;
533-
} elseif ($value['token'] === 'zero_or_one') {
555+
} elseif ($token === 'zero_or_one') {
534556
$min = 0;
535557
$max = 1;
536-
} elseif ($value['token'] === 'zero_or_more') {
558+
} elseif ($token === 'zero_or_more') {
537559
$min = 0;
560+
} elseif ($token === 'one_or_more') {
561+
$min = 1;
538562
}
539563

540564
return [$min, $max];
@@ -591,20 +615,8 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
591615
if ($literalValue !== null) {
592616
if (Strings::match($literalValue, '/^\d+$/') === null) {
593617
$isNumeric = TrinaryLogic::createNo();
594-
}
595-
596-
if (!$inOptionalQuantification) {
597-
$isNonEmpty = TrinaryLogic::createYes();
598-
}
599-
}
600-
601-
if ($ast->getValueToken() === 'character_type') {
602-
if ($ast->getValueValue() === '\d') {
603-
if ($isNumeric->maybe()) {
604-
$isNumeric = TrinaryLogic::createYes();
605-
}
606-
} else {
607-
$isNumeric = TrinaryLogic::createNo();
618+
} elseif ($isNumeric->maybe()) {
619+
$isNumeric = TrinaryLogic::createYes();
608620
}
609621

610622
if (!$inOptionalQuantification) {
@@ -613,32 +625,11 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
613625
}
614626
}
615627

616-
if ($ast->getId() === '#range' || $ast->getId() === '#class') {
617-
if ($isNumeric->maybe()) {
618-
$allNumeric = null;
619-
foreach ($children as $child) {
620-
$literalValue = $this->getLiteralValue($child);
621-
622-
if ($literalValue === null) {
623-
break;
624-
}
625-
626-
if (Strings::match($literalValue, '/^\d+$/') === null) {
627-
$allNumeric = false;
628-
break;
629-
}
630-
631-
$allNumeric = true;
632-
}
633-
634-
if ($allNumeric === true) {
635-
$isNumeric = TrinaryLogic::createYes();
636-
}
637-
}
638-
639-
if (!$inOptionalQuantification) {
640-
$isNonEmpty = TrinaryLogic::createYes();
641-
}
628+
// [^0-9] should not parse as numeric-string, and [^list-everything-but-numbers] is technically
629+
// doable but really silly compared to just \d so we can safely assume the string is not numeric
630+
// for negative classes
631+
if ($ast->getId() === '#negativeclass') {
632+
$isNumeric = TrinaryLogic::createNo();
642633
}
643634

644635
foreach ($children as $child) {
@@ -653,13 +644,65 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
653644

654645
private function getLiteralValue(TreeNode $node): ?string
655646
{
656-
if ($node->getId() === 'token' && $node->getValueToken() === 'literal') {
657-
return $node->getValueValue();
647+
if ($node->getId() !== 'token') {
648+
return null;
649+
}
650+
651+
// The token name comes from the grammar without its namespace, so both literal and class:literal are called "literal" here.
652+
$token = $node->getValueToken();
653+
$value = $node->getValueValue();
654+
655+
if (in_array($token, ['literal', 'escaped_end_class'], true)) {
656+
if (strlen($node->getValueValue()) > 1 && $value[0] === '\\') {
657+
return substr($value, 1);
658+
}
659+
660+
return $value;
661+
}
662+
663+
// literal "-" in front/back of a character class like '[-a-z]' or '[abc-]', not forming a range
664+
if ($token === 'range') {
665+
return $value;
666+
}
667+
668+
// literal "[" or "]" inside character classes '[[]' or '[]]'
669+
if (in_array($token, ['class_', '_class_literal'], true)) {
670+
return $value;
671+
}
672+
673+
// character escape sequences, just return a fixed string
674+
if (in_array($token, ['character', 'dynamic_character', 'character_type'], true)) {
675+
if ($token === 'character_type' && $value === '\d') {
676+
return '0';
677+
}
678+
679+
return $value;
680+
}
681+
682+
// [:digit:] and the like, more support coming later
683+
if ($token === 'posix_class') {
684+
if ($value === '[:digit:]') {
685+
return '0';
686+
}
687+
if (in_array($value, ['[:alpha:]', '[:alnum:]', '[:upper:]', '[:lower:]', '[:word:]', '[:ascii:]', '[:print:]', '[:xdigit:]', '[:graph:]'], true)) {
688+
return 'a';
689+
}
690+
if ($value === '[:blank:]') {
691+
return " \t";
692+
}
693+
if ($value === '[:cntrl:]') {
694+
return "\x00\x1F";
695+
}
696+
if ($value === '[:space:]') {
697+
return " \t\r\n\v\f";
698+
}
699+
if ($value === '[:punct:]') {
700+
return '!"#$%&\'()*+,\-./:;<=>?@[\]^_`{|}~';
701+
}
658702
}
659703

660-
// literal "-" outside of a character class like '~^((\\d{1,6})-)$~'
661-
if ($node->getId() === 'token' && $node->getValueToken() === 'range') {
662-
return $node->getValueValue();
704+
if ($token === 'anchor' || $token === 'match_point_reset') {
705+
return '';
663706
}
664707

665708
return null;

tests/PHPStan/Analyser/LegacyNodeScopeResolverTest.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8003,7 +8003,7 @@ public function dataPassedByReference(): array
80038003
'$arr',
80048004
],
80058005
[
8006-
'array{0?: string}',
8006+
'array<string>',
80078007
'$matches',
80088008
],
80098009
[

tests/PHPStan/Analyser/nsrt/preg_match_shapes.php

Lines changed: 61 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -127,28 +127,18 @@ function doUnknownFlags(string $s, int $flags): void {
127127
assertType('array<array{string|null, int<-1, max>}|string|null>', $matches);
128128
}
129129

130-
function doNonAutoCapturingModifier(string $s): void {
131-
if (preg_match('/(?n)(\d+)/', $s, $matches)) {
132-
// could be assertType('array{string}', $matches);
133-
assertType('array<string>', $matches);
134-
}
135-
assertType('array<string>', $matches);
136-
}
137-
138130
function doMultipleAlternativeCaptureGroupsWithSameNameWithModifier(string $s): void {
139131
if (preg_match('/(?J)(?<Foo>[a-z]+)|(?<Foo>[0-9]+)/', $s, $matches)) {
140-
// could be assertType('array{0: string, Foo: string, 1: string}', $matches);
141-
assertType('array<string>', $matches);
132+
assertType('array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
142133
}
143-
assertType('array<string>', $matches);
134+
assertType('array{}|array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
144135
}
145136

146137
function doMultipleConsecutiveCaptureGroupsWithSameNameWithModifier(string $s): void {
147138
if (preg_match('/(?J)(?<Foo>[a-z]+)|(?<Foo>[0-9]+)/', $s, $matches)) {
148-
// could be assertType('array{0: string, Foo: string, 1: string}', $matches);
149-
assertType('array<string>', $matches);
139+
assertType('array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
150140
}
151-
assertType('array<string>', $matches);
141+
assertType('array{}|array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
152142
}
153143

154144
// https://github.com/hoaproject/Regex/issues/31
@@ -472,3 +462,60 @@ function (string $s): void {
472462
assertType("array{string, non-empty-string}", $matches);
473463
}
474464
};
465+
466+
function bug11323(string $s): void {
467+
if (preg_match('/([*|+?{}()]+)([^*|+[:digit:]?{}()]+)/', $s, $matches)) {
468+
assertType('array{string, non-empty-string, non-empty-string}', $matches);
469+
}
470+
if (preg_match('/\p{L}[[\]]+([-*|+?{}(?:)]+)([^*|+[:digit:]?{a-z}(\p{L})\a-]+)/', $s, $matches)) {
471+
assertType('array{string, non-empty-string, non-empty-string}', $matches);
472+
}
473+
if (preg_match('{([-\p{L}[\]*|\x03\a\b+?{}(?:)-]+[^[:digit:]?{}a-z0-9#-k]+)(a-z)}', $s, $matches)) {
474+
assertType('array{string, non-empty-string, non-empty-string}', $matches);
475+
}
476+
if (preg_match('{(\d+)(?i)insensitive((?x-i)case SENSITIVE here(?i:insensitive non-capturing group))}', $s, $matches)) {
477+
assertType('array{string, numeric-string, non-empty-string}', $matches);
478+
}
479+
if (preg_match('{([]] [^]])}', $s, $matches)) {
480+
assertType('array{string, non-empty-string}', $matches);
481+
}
482+
if (preg_match('{([[:digit:]])}', $s, $matches)) {
483+
assertType('array{string, numeric-string}', $matches);
484+
}
485+
if (preg_match('{([\d])(\d)}', $s, $matches)) {
486+
assertType('array{string, numeric-string, numeric-string}', $matches);
487+
}
488+
if (preg_match('{([0-9])}', $s, $matches)) {
489+
assertType('array{string, numeric-string}', $matches);
490+
}
491+
if (preg_match('{(\p{L})(\p{P})(\p{Po})}', $s, $matches)) {
492+
assertType('array{string, non-empty-string, non-empty-string, non-empty-string}', $matches);
493+
}
494+
if (preg_match('{(a)??(b)*+(c++)(d)+?}', $s, $matches)) {
495+
assertType('array{string, string, string, non-empty-string, non-empty-string}', $matches);
496+
}
497+
if (preg_match('{(.\d)}', $s, $matches)) {
498+
assertType('array{string, non-empty-string}', $matches);
499+
}
500+
if (preg_match('{(\d.)}', $s, $matches)) {
501+
assertType('array{string, non-empty-string}', $matches);
502+
}
503+
if (preg_match('{(\d\d)}', $s, $matches)) {
504+
assertType('array{string, numeric-string}', $matches);
505+
}
506+
if (preg_match('{(.(\d))}', $s, $matches)) {
507+
assertType('array{string, non-empty-string, numeric-string}', $matches);
508+
}
509+
if (preg_match('{((\d).)}', $s, $matches)) {
510+
assertType('array{string, non-empty-string, numeric-string}', $matches);
511+
}
512+
if (preg_match('{(\d([1-4])\d)}', $s, $matches)) {
513+
assertType('array{string, numeric-string, numeric-string}', $matches);
514+
}
515+
if (preg_match('{(x?([1-4])\d)}', $s, $matches)) {
516+
assertType('array{string, non-empty-string, numeric-string}', $matches);
517+
}
518+
if (preg_match('{([^1-4])}', $s, $matches)) {
519+
assertType('array{string, non-empty-string}', $matches);
520+
}
521+
}

tests/PHPStan/Analyser/nsrt/preg_match_shapes_php80.php

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,11 @@ function doOffsetCaptureWithUnmatchedNull(string $s): void {
1111
}
1212
assertType('array{}|array{array{string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}}', $matches);
1313
}
14+
15+
function doNonAutoCapturingModifier(string $s): void {
16+
if (preg_match('/(?n)(\d+)/', $s, $matches)) {
17+
// should be assertType('array{string}', $matches);
18+
assertType('array{string, numeric-string}', $matches);
19+
}
20+
assertType('array{}|array{string, numeric-string}', $matches);
21+
}

0 commit comments

Comments
 (0)