Skip to content

Commit 351d3da

Browse files
SONARPY-911 Provide global regex flags to the parser (#968)
* SONARPY-911 Provide global regex flags to the parser * Add Pattern.UNICODE_CASE by default and fix code smell * Add some tests for flags * Some fixes after rebase on master
1 parent b9dc756 commit 351d3da

File tree

6 files changed

+205
-20
lines changed

6 files changed

+205
-20
lines changed

python-checks/src/main/java/org/sonar/python/checks/regex/AbstractRegexCheck.java

Lines changed: 122 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,22 @@
1919
*/
2020
package org.sonar.python.checks.regex;
2121

22-
import java.util.Arrays;
22+
import java.util.Collections;
23+
import java.util.HashMap;
2324
import java.util.HashSet;
2425
import java.util.List;
26+
import java.util.Map;
2527
import java.util.Optional;
2628
import java.util.Set;
29+
import java.util.regex.Pattern;
2730
import javax.annotation.Nullable;
2831
import org.sonar.plugins.python.api.PythonSubscriptionCheck;
2932
import org.sonar.plugins.python.api.SubscriptionContext;
3033
import org.sonar.plugins.python.api.symbols.Symbol;
34+
import org.sonar.plugins.python.api.tree.BinaryExpression;
3135
import org.sonar.plugins.python.api.tree.CallExpression;
3236
import org.sonar.plugins.python.api.tree.Expression;
37+
import org.sonar.plugins.python.api.tree.QualifiedExpression;
3338
import org.sonar.plugins.python.api.tree.RegularArgument;
3439
import org.sonar.plugins.python.api.tree.StringLiteral;
3540
import org.sonar.plugins.python.api.tree.Tree;
@@ -38,19 +43,36 @@
3843
import org.sonar.python.tree.TreeUtils;
3944
import org.sonarsource.analyzer.commons.regex.RegexIssueLocation;
4045
import org.sonarsource.analyzer.commons.regex.RegexParseResult;
46+
import org.sonarsource.analyzer.commons.regex.ast.FlagSet;
4147
import org.sonarsource.analyzer.commons.regex.ast.RegexSyntaxElement;
4248

4349
public abstract class AbstractRegexCheck extends PythonSubscriptionCheck {
4450

45-
private static final Set<String> REGEX_FUNCTIONS = new HashSet<>(Arrays.asList("re.sub", "re.subn", "re.compile", "re.search", "re.match",
46-
"re.fullmatch", "re.split", "re.findall", "re.finditer"));
51+
private static final Map<String, Integer> REGEX_FUNCTIONS_TO_FLAG_PARAM = new HashMap<>();
52+
53+
static {
54+
REGEX_FUNCTIONS_TO_FLAG_PARAM.put("re.sub", 4);
55+
REGEX_FUNCTIONS_TO_FLAG_PARAM.put("re.subn", 4);
56+
REGEX_FUNCTIONS_TO_FLAG_PARAM.put("re.compile", null);
57+
REGEX_FUNCTIONS_TO_FLAG_PARAM.put("re.search", 2);
58+
REGEX_FUNCTIONS_TO_FLAG_PARAM.put("re.match", 2);
59+
REGEX_FUNCTIONS_TO_FLAG_PARAM.put("re.fullmatch", 2);
60+
REGEX_FUNCTIONS_TO_FLAG_PARAM.put("re.split", 3);
61+
REGEX_FUNCTIONS_TO_FLAG_PARAM.put("re.findall", 2);
62+
REGEX_FUNCTIONS_TO_FLAG_PARAM.put("re.finditer", 2);
63+
}
64+
4765
protected RegexContext regexContext;
4866

4967
// We want to report only one issue per element for one rule.
5068
protected final Set<RegexSyntaxElement> reportedRegexTrees = new HashSet<>();
5169

52-
protected Set<String> lookedUpFunctionNames() {
53-
return REGEX_FUNCTIONS;
70+
/**
71+
* Should return a map whose keys are the functions the check is interested in, and the values are the position of the flags parameter.
72+
* Set the position of the flags parameter to {@code null} if there is none.
73+
*/
74+
protected Map<String, Integer> lookedUpFunctions() {
75+
return REGEX_FUNCTIONS_TO_FLAG_PARAM;
5476
}
5577

5678
@Override
@@ -67,17 +89,20 @@ private void checkCall(SubscriptionContext ctx) {
6789
if (calleeSymbol == null || calleeSymbol.fullyQualifiedName() == null) {
6890
return;
6991
}
70-
if (lookedUpFunctionNames().contains(calleeSymbol.fullyQualifiedName())) {
92+
String functionFqn = calleeSymbol.fullyQualifiedName();
93+
if (functionFqn != null && lookedUpFunctions().containsKey(functionFqn)) {
94+
FlagSet flagSet = getFlagSet(callExpression, functionFqn);
95+
7196
patternArgStringLiteral(callExpression)
72-
.flatMap(this::regexForStringLiteral)
97+
.flatMap(l -> regexForStringLiteral(l, flagSet))
7398
.ifPresent(parseResult -> checkRegex(parseResult, callExpression));
7499
}
75100
}
76101

77-
private Optional<RegexParseResult> regexForStringLiteral(StringLiteral literal) {
102+
private Optional<RegexParseResult> regexForStringLiteral(StringLiteral literal, FlagSet flagSet) {
78103
// TODO: for now we only handle strings with an "r" prefix. This will be extended.
79104
if (literal.stringElements().size() == 1 && "r".equalsIgnoreCase(literal.stringElements().get(0).prefix())) {
80-
return Optional.of(regexContext.regexForStringElement(literal.stringElements().get(0)));
105+
return Optional.of(regexContext.regexForStringElement(literal.stringElements().get(0), flagSet));
81106
}
82107
return Optional.empty();
83108
}
@@ -94,6 +119,94 @@ private static Optional<StringLiteral> patternArgStringLiteral(CallExpression re
94119
return Optional.empty();
95120
}
96121

122+
private FlagSet getFlagSet(CallExpression callExpression, String functionFqn) {
123+
HashSet<QualifiedExpression> flags = new HashSet<>();
124+
getFlagsArgValue(callExpression, lookedUpFunctions().get(functionFqn)).ifPresent(f -> flags.addAll(extractFlagExpressions(f)));
125+
FlagSet flagSet = new FlagSet();
126+
flags.stream()
127+
.map(AbstractRegexCheck::mapPythonFlag)
128+
.filter(Optional::isPresent)
129+
.map(Optional::get)
130+
.forEach(flagSet::add);
131+
132+
// TODO: Don't do this when PYTHON_VERSION is 2
133+
// We used Pattern.LITERAL to represent re.ASCII. So we are checking if re.ASCII is set here.
134+
// For python3 matches are Unicode by default, and re.ASCII can be used to deactivate that.
135+
if (!flagSet.contains(Pattern.LITERAL)) {
136+
flagSet.add(Pattern.UNICODE_CHARACTER_CLASS);
137+
flagSet.add(Pattern.UNICODE_CASE);
138+
}
139+
flagSet.removeAll(new FlagSet(Pattern.LITERAL));
140+
141+
return flagSet;
142+
}
143+
144+
private static Optional<Expression> getFlagsArgValue(CallExpression regexFunctionCall, @Nullable Integer argPosition) {
145+
if (argPosition == null) {
146+
return Optional.empty();
147+
}
148+
RegularArgument patternArgument = TreeUtils.nthArgumentOrKeyword(argPosition, "flags", regexFunctionCall.arguments());
149+
return patternArgument != null ? Optional.of(patternArgument.expression()) : Optional.empty();
150+
}
151+
152+
private static HashSet<QualifiedExpression> extractFlagExpressions(Tree flagsSubexpr) {
153+
if (flagsSubexpr.is(Tree.Kind.QUALIFIED_EXPR)) {
154+
return new HashSet<>(Collections.singletonList((QualifiedExpression) flagsSubexpr));
155+
} else if (flagsSubexpr.is(Tree.Kind.BITWISE_OR)) {
156+
// recurse into left and right branch
157+
BinaryExpression orExpr = (BinaryExpression) flagsSubexpr;
158+
HashSet<QualifiedExpression> flags = extractFlagExpressions(orExpr.leftOperand());
159+
flags.addAll(extractFlagExpressions(orExpr.rightOperand()));
160+
return flags;
161+
} else {
162+
// failed to interpret. Ignore leaf.
163+
return new HashSet<>();
164+
}
165+
}
166+
167+
public static Optional<Integer> mapPythonFlag(QualifiedExpression ch) {
168+
Symbol symbol = ch.symbol();
169+
if (symbol == null) {
170+
return Optional.empty();
171+
}
172+
String symbolFqn = symbol.fullyQualifiedName();
173+
if (symbolFqn == null) {
174+
return Optional.empty();
175+
}
176+
177+
Integer result;
178+
switch (symbolFqn) {
179+
case "re.IGNORECASE":
180+
case "re.I":
181+
result = Pattern.CASE_INSENSITIVE;
182+
break;
183+
case "re.MULTILINE":
184+
case "re.M":
185+
result = Pattern.MULTILINE;
186+
break;
187+
case "re.DOTALL":
188+
case "re.S":
189+
result = Pattern.DOTALL;
190+
break;
191+
case "re.VERBOSE":
192+
case "re.X":
193+
result = Pattern.COMMENTS;
194+
break;
195+
case "re.UNICODE":
196+
case "re.U":
197+
result = Pattern.UNICODE_CHARACTER_CLASS;
198+
break;
199+
case "re.ASCII":
200+
case "re.A":
201+
// We misuse Pattern.LITERAL to represent re.ASCII. It will be removed before being provided to the parser.
202+
result = Pattern.LITERAL;
203+
break;
204+
default:
205+
result = null;
206+
}
207+
return Optional.ofNullable(result);
208+
}
209+
97210
public void addIssue(RegexSyntaxElement regexTree, String message, @Nullable Integer cost, List<RegexIssueLocation> secondaries) {
98211
if (reportedRegexTrees.add(regexTree)) {
99212
PreciseIssue issue = regexContext.addIssue(regexTree, message);

python-checks/src/main/java/org/sonar/python/checks/regex/StringReplaceCheck.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,8 @@
1919
*/
2020
package org.sonar.python.checks.regex;
2121

22-
import java.util.Collections;
23-
import java.util.HashSet;
24-
import java.util.Set;
22+
import java.util.HashMap;
23+
import java.util.Map;
2524
import org.sonar.check.Rule;
2625
import org.sonar.plugins.python.api.tree.CallExpression;
2726
import org.sonar.python.regex.PythonRegexIssueLocation;
@@ -36,8 +35,10 @@ public class StringReplaceCheck extends AbstractRegexCheck {
3635
private static final String SECONDARY_MESSAGE = "Expression without regular expression features.";
3736

3837
@Override
39-
protected Set<String> lookedUpFunctionNames() {
40-
return new HashSet<>(Collections.singletonList("re.sub"));
38+
protected Map<String, Integer> lookedUpFunctions() {
39+
Map<String, Integer> result = new HashMap<>();
40+
result.put("re.sub", 4);
41+
return result;
4142
}
4243

4344
@Override

python-checks/src/test/java/org/sonar/python/checks/regex/AbstractRegexCheckTest.java

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,15 @@
2424
import java.util.Arrays;
2525
import java.util.Collections;
2626
import java.util.List;
27+
import java.util.regex.Pattern;
2728
import org.junit.Test;
2829
import org.sonar.plugins.python.api.PythonVisitorContext;
2930
import org.sonar.plugins.python.api.tree.CallExpression;
3031
import org.sonar.python.SubscriptionVisitor;
3132
import org.sonar.python.TestPythonVisitorRunner;
3233
import org.sonar.python.checks.utils.PythonCheckVerifier;
3334
import org.sonarsource.analyzer.commons.regex.RegexParseResult;
35+
import org.sonarsource.analyzer.commons.regex.ast.FlagSet;
3436

3537
import static org.assertj.core.api.Assertions.assertThat;
3638

@@ -42,7 +44,6 @@ public class AbstractRegexCheckTest {
4244
public void test_regex_is_visited() {
4345
Check check = new Check();
4446
PythonCheckVerifier.verify(FILE.getAbsolutePath(), check);
45-
assertThat(check.receivedRegexParseResults).hasSize(10);
4647
}
4748

4849
@Test
@@ -73,13 +74,58 @@ public void test_regex_parse_result_is_retrieved_from_cache_in_context() {
7374
}
7475
}
7576

77+
@Test
78+
public void test_flags() {
79+
Check check = new Check(true);
80+
PythonCheckVerifier.verify("src/test/resources/checks/regex/abstractRegexCheckFlags.py", check);
81+
}
82+
7683
private static class Check extends AbstractRegexCheck {
7784
private final List<RegexParseResult> receivedRegexParseResults = new ArrayList<>();
85+
private boolean printFlags;
86+
87+
private Check() {
88+
new Check(false);
89+
}
90+
91+
private Check(boolean printFlags) {
92+
this.printFlags = printFlags;
93+
}
7894

7995
@Override
8096
public void checkRegex(RegexParseResult regexParseResult, CallExpression regexFunctionCall) {
8197
receivedRegexParseResults.add(regexParseResult);
82-
addIssue(regexParseResult.getResult(), "MESSAGE", null, Collections.emptyList());
98+
addIssue(regexParseResult.getResult(), printFlags ? flagsToString(regexParseResult) : "MESSAGE", null, Collections.emptyList());
99+
}
100+
101+
private String flagsToString(RegexParseResult regexParseResult) {
102+
List<String> flags = new ArrayList<>();
103+
FlagSet flagSet = regexParseResult.getInitialFlags();
104+
if (flagSet.contains(Pattern.CASE_INSENSITIVE)) {
105+
flags.add("CASE_INSENSITIVE");
106+
}
107+
if (flagSet.contains(Pattern.MULTILINE)) {
108+
flags.add("MULTILINE");
109+
}
110+
if (flagSet.contains(Pattern.UNICODE_CASE)) {
111+
flags.add("UNICODE_CASE");
112+
}
113+
if (flagSet.contains(Pattern.UNICODE_CHARACTER_CLASS)) {
114+
flags.add("UNICODE_CHARACTER_CLASS");
115+
}
116+
if (flagSet.contains(Pattern.DOTALL)) {
117+
flags.add("DOTALL");
118+
}
119+
if (flagSet.contains(Pattern.COMMENTS)) {
120+
flags.add("VERBOSE");
121+
}
122+
if (!flagSet.contains(Pattern.UNICODE_CHARACTER_CLASS)) {
123+
flags.add("ASCII");
124+
}
125+
if (flags.isEmpty()) {
126+
return "NO FLAGS";
127+
}
128+
return String.join("|", flags);
83129
}
84130
}
85131

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import re
2+
3+
re.match(r'.*', "foo", re.I); # Noncompliant {{CASE_INSENSITIVE|UNICODE_CASE|UNICODE_CHARACTER_CLASS}}
4+
re.match(r'.*', "foo", re.IGNORECASE); # Noncompliant {{CASE_INSENSITIVE|UNICODE_CASE|UNICODE_CHARACTER_CLASS}}
5+
6+
re.match(r'.*', "foo", re.M); # Noncompliant {{MULTILINE|UNICODE_CASE|UNICODE_CHARACTER_CLASS}}
7+
re.match(r'.*', "foo", re.MULTILINE); # Noncompliant {{MULTILINE|UNICODE_CASE|UNICODE_CHARACTER_CLASS}}
8+
9+
re.match(r'.*', "foo", re.S); # Noncompliant {{UNICODE_CASE|UNICODE_CHARACTER_CLASS|DOTALL}}
10+
re.match(r'.*', "foo", re.DOTALL); # Noncompliant {{UNICODE_CASE|UNICODE_CHARACTER_CLASS|DOTALL}}
11+
12+
re.match(r'.*', "foo", re.X); # Noncompliant {{UNICODE_CASE|UNICODE_CHARACTER_CLASS|VERBOSE}}
13+
re.match(r'.*', "foo", re.VERBOSE); # Noncompliant {{UNICODE_CASE|UNICODE_CHARACTER_CLASS|VERBOSE}}
14+
15+
re.match(r'.*', "foo", re.U); # Noncompliant {{UNICODE_CASE|UNICODE_CHARACTER_CLASS}}
16+
re.match(r'.*', "foo", re.UNICODE); # Noncompliant {{UNICODE_CASE|UNICODE_CHARACTER_CLASS}}
17+
18+
re.match(r'.*', "foo", re.A); # Noncompliant {{ASCII}}
19+
re.match(r'.*', "foo", re.ASCII); # Noncompliant {{ASCII}}
20+
21+
re.match(r'.*', "foo", re.UNKNOWN); # Noncompliant {{UNICODE_CASE|UNICODE_CHARACTER_CLASS}}
22+
re.match(r'.*', "foo", not_re.UNKNOWN); # Noncompliant {{UNICODE_CASE|UNICODE_CHARACTER_CLASS}}
23+
re.match(r'.*', "foo", re.I|UNKNOWN); # Noncompliant {{CASE_INSENSITIVE|UNICODE_CASE|UNICODE_CHARACTER_CLASS}}
24+
25+
re.match(r'.*', "foo", re.I|re.M); # Noncompliant {{CASE_INSENSITIVE|MULTILINE|UNICODE_CASE|UNICODE_CHARACTER_CLASS}}

python-frontend/src/main/java/org/sonar/python/SubscriptionVisitor.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,10 +168,9 @@ public File workingDirectory() {
168168
return pythonVisitorContext.workingDirectory();
169169
}
170170

171-
public RegexParseResult regexForStringElement(StringElement stringElement) {
172-
// TODO: Real flags have to be provided
171+
public RegexParseResult regexForStringElement(StringElement stringElement, FlagSet flagSet) {
173172
return regexCache.computeIfAbsent(stringElement,
174-
s -> new RegexParser(new PythonAnalyzerRegexSource(s), new FlagSet()).parse());
173+
s -> new RegexParser(new PythonAnalyzerRegexSource(s), flagSet).parse());
175174
}
176175
}
177176
}

python-frontend/src/main/java/org/sonar/python/regex/RegexContext.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,12 @@
2424
import org.sonar.plugins.python.api.tree.StringElement;
2525
import org.sonar.plugins.python.api.tree.Tree;
2626
import org.sonarsource.analyzer.commons.regex.RegexParseResult;
27+
import org.sonarsource.analyzer.commons.regex.ast.FlagSet;
2728
import org.sonarsource.analyzer.commons.regex.ast.RegexSyntaxElement;
2829

2930
public interface RegexContext {
3031

31-
RegexParseResult regexForStringElement(StringElement stringElement);
32+
RegexParseResult regexForStringElement(StringElement stringElement, FlagSet flagSet);
3233

3334
PythonCheck.PreciseIssue addIssue(Tree element, @Nullable String message);
3435

0 commit comments

Comments
 (0)