Skip to content

Commit 525b00c

Browse files
SONARPY-882 Rule S5850: Alternatives in regular expressions should be grouped when used with anchors (#966)
* SONARPY-882 Rule S5850: Alternatives in regular expressions should be grouped when used with anchors * Update ruling
1 parent 2444d65 commit 525b00c

File tree

8 files changed

+158
-0
lines changed

8 files changed

+158
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
'project:buildbot-0.8.6p1/buildbot/monkeypatches/sqlalchemy2189.py':[
3+
38,
4+
84,
5+
],
6+
'project:django-2.2.3/django/db/models/sql/constants.py':[
7+
19,
8+
],
9+
'project:tensorflow/tools/dockerfiles/assembler.py':[
10+
605,
11+
],
12+
}

python-checks/src/main/java/org/sonar/python/checks/CheckList.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import org.sonar.python.checks.hotspots.StrongCryptographicKeysCheck;
5050
import org.sonar.python.checks.hotspots.UnsafeHttpMethodsCheck;
5151
import org.sonar.python.checks.hotspots.UnverifiedHostnameCheck;
52+
import org.sonar.python.checks.regex.AnchorPrecedenceCheck;
5253
import org.sonar.python.checks.regex.EmptyStringRepetitionCheck;
5354
import org.sonar.python.checks.regex.SingleCharacterAlternationCheck;
5455
import org.sonar.python.checks.regex.RedundantRegexAlternativesCheck;
@@ -67,6 +68,7 @@ public static Iterable<Class> getChecks() {
6768
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
6869
AfterJumpStatementCheck.class,
6970
AllBranchesAreIdenticalCheck.class,
71+
AnchorPrecedenceCheck.class,
7072
ArgumentNumberCheck.class,
7173
ArgumentTypeCheck.class,
7274
BackslashInStringCheck.class,
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* SonarQube Python Plugin
3+
* Copyright (C) 2011-2021 SonarSource SA
4+
* mailto:info AT sonarsource DOT com
5+
*
6+
* This program is free software; you can redistribute it and/or
7+
* modify it under the terms of the GNU Lesser General Public
8+
* License as published by the Free Software Foundation; either
9+
* version 3 of the License, or (at your option) any later version.
10+
*
11+
* This program is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14+
* Lesser General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Lesser General Public License
17+
* along with this program; if not, write to the Free Software Foundation,
18+
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19+
*/
20+
package org.sonar.python.checks.regex;
21+
22+
import org.sonar.check.Rule;
23+
import org.sonar.plugins.python.api.tree.CallExpression;
24+
import org.sonarsource.analyzer.commons.regex.RegexParseResult;
25+
import org.sonarsource.analyzer.commons.regex.finders.AnchorPrecedenceFinder;
26+
27+
@Rule(key = "S5850")
28+
public class AnchorPrecedenceCheck extends AbstractRegexCheck {
29+
30+
@Override
31+
public void checkRegex(RegexParseResult regexParseResult, CallExpression regexFunctionCall) {
32+
new AnchorPrecedenceFinder(this::addIssue).visit(regexParseResult);
33+
}
34+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<p>In regular expressions, anchors (<code>^</code>, <code>$</code>, <code>\A</code>, <code>\Z</code> and <code>\z</code>) have higher precedence than
2+
the <code>|</code> operator. So in a regular expression like <code>^alt1|alt2|alt3$</code>, <code>alt1</code> would be anchored to the beginning,
3+
<code>alt3</code> to the end and <code>alt2</code> wouldn’t be anchored at all. Usually the intended behavior is that all alternatives are anchored at
4+
both ends. To achieve this, a non-capturing group should be used around the alternatives.</p>
5+
<p>In cases where it is intended that the anchors only apply to one alternative each, adding (non-capturing) groups around the anchors and the parts
6+
that they apply to will make it explicit which parts are anchored and avoid readers misunderstanding the precedence or changing it because they
7+
mistakenly assume the precedence was not intended.</p>
8+
<h2>Noncompliant Code Example</h2>
9+
<pre>
10+
r"^a|b|c$"
11+
</pre>
12+
<h2>Compliant Solution</h2>
13+
<pre>
14+
r"^(?:a|b|c)$"
15+
</pre>
16+
<p>or</p>
17+
<pre>
18+
r"^a$|^b$|^c$"
19+
</pre>
20+
<p>or, if you do want the anchors to only apply to <code>a</code> and <code>c</code> respectively:</p>
21+
<pre>
22+
r"(?:^a)|b|(?:c$)"
23+
</pre>
24+
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"title": "Alternatives in regular expressions should be grouped when used with anchors",
3+
"type": "BUG",
4+
"status": "ready",
5+
"remediation": {
6+
"func": "Constant\/Issue",
7+
"constantCost": "10min"
8+
},
9+
"tags": [
10+
"regex"
11+
],
12+
"defaultSeverity": "Major",
13+
"ruleSpecification": "RSPEC-5850",
14+
"sqKey": "S5850",
15+
"scope": "Main",
16+
"quickfix": "unknown"
17+
}

python-checks/src/main/resources/org/sonar/l10n/py/rules/python/Sonar_way_profile.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@
134134
"S5807",
135135
"S5828",
136136
"S5842",
137+
"S5850",
137138
"S5855",
138139
"S5864",
139140
"S5886",
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/*
2+
* SonarQube Python Plugin
3+
* Copyright (C) 2011-2021 SonarSource SA
4+
* mailto:info AT sonarsource DOT com
5+
*
6+
* This program is free software; you can redistribute it and/or
7+
* modify it under the terms of the GNU Lesser General Public
8+
* License as published by the Free Software Foundation; either
9+
* version 3 of the License, or (at your option) any later version.
10+
*
11+
* This program is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14+
* Lesser General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Lesser General Public License
17+
* along with this program; if not, write to the Free Software Foundation,
18+
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19+
*/
20+
package org.sonar.python.checks.regex;
21+
22+
import org.junit.Test;
23+
import org.sonar.python.checks.utils.PythonCheckVerifier;
24+
25+
public class AnchorPrecedenceCheckTest {
26+
27+
@Test
28+
public void test() {
29+
PythonCheckVerifier.verify("src/test/resources/checks/regex/anchorPrecedenceCheck.py", new AnchorPrecedenceCheck());
30+
}
31+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import re
2+
3+
4+
def noncompliant(input):
5+
re.match(r"^a|b|c$", input) # Noncompliant {{Group parts of the regex together to make the intended operator precedence explicit.}}
6+
# ^^^^^^^
7+
re.match(r"^a|b|cd", input) # Noncompliant
8+
re.match(r"(?i)^a|b|cd", input) # Noncompliant
9+
re.match(r"(?i:^a|b|cd)", input) # Noncompliant
10+
re.match(r"a|b|c$", input) # Noncompliant
11+
re.match(r"\Aa|b|c\Z", input) # Noncompliant
12+
re.match(r"\Aa|b|c\z", input) # Noncompliant
13+
14+
15+
def compliant(input):
16+
re.match(r"^(?:a|b|c)$", input)
17+
re.match(r"(?:^a)|b|(?:c$)", input)
18+
re.match(r"^abc$", input)
19+
re.match(r"a|b|c", input)
20+
re.match(r"^a$|^b$|^c$", input)
21+
re.match(r"^a$|b|c", input)
22+
re.match(r"a|b|^c$", input)
23+
re.match(r"^a|^b$|c$", input)
24+
re.match(r"^a|^b|c$", input)
25+
re.match(r"^a|b$|c$", input)
26+
# Only beginning and end of line/input boundaries are considered - not word boundaries
27+
re.match(r"\ba|b|c\b", input)
28+
re.match(r"\ba\b|\bb\b|\bc\b", input)
29+
# If multiple alternatives are anchored, but not all, that's more likely to be intentional than if only the first
30+
# one were anchored, so we won't report an issue for the following line:
31+
re.match(r"^a|^b|c", input)
32+
re.match(r"aa|bb|cc", input)
33+
re.match(r"^", input)
34+
re.match(r"^[abc]$", input)
35+
re.match(r"|", input)
36+
re.match(r"[", input)
37+
re.match(r"(?i:^)a|b|c", input) # False negative; we don't find the anchor if it's hidden inside a sub-expression

0 commit comments

Comments
 (0)