Skip to content

Commit 2b0d850

Browse files
SONARPY-920 Handle regex in not-raw strings (#976)
* Handle escaping in not raw strings * Do not handle strings that contain a unicode name escape sequence * Handle ignored newline escape sequences * Update ITS expected results * Adapt testcases * Add more tests * Small refactoring * Protect from NoSuchElementException
1 parent 8c84618 commit 2b0d850

File tree

11 files changed

+200
-11
lines changed

11 files changed

+200
-11
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,30 @@
11
{
2+
'project:biopython/Bio/PDB/ic_rebuild.py':[
3+
244,
4+
248,
5+
],
6+
'project:biopython/Bio/PDB/internal_coords.py':[
7+
3255,
8+
],
9+
'project:biopython/Bio/SearchIO/InterproscanIO/interproscan_xml.py':[
10+
55,
11+
],
12+
'project:numpy-1.16.4/numpy/distutils/line_endings.py':[
13+
19,
14+
52,
15+
53,
16+
],
217
'project:tensorflow/python/debug/cli/command_parser.py':[
318
264,
419
],
20+
'project:tensorflow/python/debug/lib/grpc_debug_test_server.py':[
21+
64,
22+
67,
23+
],
524
'project:tensorflow/python/keras/layers/einsum_dense.py':[
625
212,
726
],
27+
'project:tensorflow/tools/pip_package/setup.py':[
28+
177,
29+
],
830
}

its/ruling/src/test/resources/expected/python-S5843.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
{
2+
'project:buildbot-0.8.6p1/buildbot/changes/mail.py':[
3+
212,
4+
214,
5+
],
26
'project:buildbot-0.8.6p1/buildbot/steps/shell.py':[
37
382,
48
],
@@ -31,4 +35,7 @@
3135
'project:tornado-2.3/demos/blog/markdown.py':[
3236
718,
3337
],
38+
'project:tornado-2.3/tornado/escape.py':[
39+
235,
40+
],
3441
}

its/ruling/src/test/resources/expected/python-S5850.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
{
2+
'project:biopython/Bio/GenBank/__init__.py':[
3+
1283,
4+
],
25
'project:buildbot-0.8.6p1/buildbot/monkeypatches/sqlalchemy2189.py':[
36
38,
47
84,
58
],
9+
'project:buildbot-0.8.6p1/buildbot/steps/shell.py':[
10+
686,
11+
],
612
'project:django-2.2.3/django/db/models/sql/constants.py':[
713
19,
814
],

its/ruling/src/test/resources/expected/python-S5857.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
32,
1616
32,
1717
],
18+
'project:mypy-0.782/mypy/test/teststubtest.py':[
19+
625,
20+
],
1821
'project:numpy-1.16.4/numpy/linalg/lapack_lite/clapack_scrub.py':[
1922
104,
2023
],
@@ -31,4 +34,7 @@
3134
722,
3235
807,
3336
],
37+
'project:twisted-12.1.0/twisted/words/protocols/oscar.py':[
38+
81,
39+
],
3440
}

its/ruling/src/test/resources/expected/python-S6035.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
'project:buildbot-0.8.6p1/buildbot/__init__.py':[
33
33,
44
],
5+
'project:buildbot-0.8.6p1/buildbot/changes/mail.py':[
6+
122,
7+
122,
8+
],
59
'project:buildbot-slave-0.8.6p1/buildslave/__init__.py':[
610
33,
711
],

python-checks/src/main/java/org/sonar/python/checks/regex/AbstractRegexCheck.java

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.util.HashMap;
2424
import java.util.HashSet;
2525
import java.util.List;
26+
import java.util.Locale;
2627
import java.util.Map;
2728
import java.util.Optional;
2829
import java.util.Set;
@@ -36,6 +37,7 @@
3637
import org.sonar.plugins.python.api.tree.Expression;
3738
import org.sonar.plugins.python.api.tree.QualifiedExpression;
3839
import org.sonar.plugins.python.api.tree.RegularArgument;
40+
import org.sonar.plugins.python.api.tree.StringElement;
3941
import org.sonar.plugins.python.api.tree.StringLiteral;
4042
import org.sonar.plugins.python.api.tree.Tree;
4143
import org.sonar.python.regex.PythonRegexIssueLocation;
@@ -100,13 +102,28 @@ private void checkCall(SubscriptionContext ctx) {
100102
}
101103

102104
private Optional<RegexParseResult> regexForStringLiteral(StringLiteral literal, FlagSet flagSet) {
103-
// TODO: for now we only handle strings with an "r" prefix. This will be extended.
104-
if (literal.stringElements().size() == 1 && "r".equalsIgnoreCase(literal.stringElements().get(0).prefix())) {
105+
if (shouldHandleStringLiteral(literal)) {
105106
return Optional.of(regexContext.regexForStringElement(literal.stringElements().get(0), flagSet));
106107
}
107108
return Optional.empty();
108109
}
109110

111+
/**
112+
* We do ignore strings in the following cases:
113+
* - It is a concatenation of multiple elements.
114+
* - It is an f-string containing expressions. We don't have a good mechanism to evaluate these expressions currently.
115+
* - The string is not raw and contains a \N{UNICODE NAME} escape sequence. In Java 8 we cannot make use of Character.codePointOf in the character parser (SONARPY-922).
116+
*/
117+
private static boolean shouldHandleStringLiteral(StringLiteral literal) {
118+
if (literal.stringElements().size() != 1) {
119+
// We do not handle concatenations for now
120+
return false;
121+
}
122+
StringElement stringElement = literal.stringElements().get(0);
123+
return stringElement.formattedExpressions().isEmpty() &&
124+
(stringElement.prefix().toLowerCase(Locale.ROOT).contains("r") || !stringElement.value().contains("\\N{"));
125+
}
126+
110127
private static Optional<StringLiteral> patternArgStringLiteral(CallExpression regexFunctionCall) {
111128
RegularArgument patternArgument = TreeUtils.nthArgumentOrKeyword(0, "pattern", regexFunctionCall.arguments());
112129
if (patternArgument == null) {

python-checks/src/test/java/org/sonar/python/checks/regex/AbstractRegexCheckTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ public void checkRegex(RegexParseResult regexParseResult, CallExpression regexFu
5858

5959
PythonVisitorContext fileContext = TestPythonVisitorRunner.createContext(FILE);
6060
SubscriptionVisitor.analyze(Collections.singletonList(check), fileContext);
61-
assertThat(check.reportedRegexTrees).hasSize(10);
62-
assertThat(fileContext.getIssues()).hasSize(10);
61+
assertThat(check.reportedRegexTrees).hasSize(11);
62+
assertThat(fileContext.getIssues()).hasSize(11);
6363
}
6464

6565
@Test

python-checks/src/test/resources/checks/regex/abstractRegexCheck.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,14 @@
99
re.fullmatch(r'.*', "foo") # Noncompliant
1010
re.split(r'.*', "foo") # Noncompliant
1111
re.findall(r'.*', "foo") # Noncompliant
12-
re.finditer(r'.*', "foo") # Noncompliant
12+
re.finditer('.*', "foo") # Noncompliant
13+
14+
re.match('.*\N{GREEK SMALL LETTER FINAL SIGMA}', 'foo') # We do ignore not raw strings containing \N escape sequences
15+
re.match(r'.*\N{GREEK SMALL LETTER FINAL SIGMA}', 'foo') # Noncompliant
16+
17+
some_var = 'foo'
18+
re.match(f'.*{some_var}', 'foo') # We do ignore f-strings that do contain an expression
1319

14-
re.sub('.*', "x", "a") # We only look at raw strings for now
1520
re.sub(r'.*' r'.*', "x", "a") # We do not look at concats for now
1621
re.sub() # Required arguments not provided
1722
re.not_relevant_method(r'.*', "x", "a")

python-frontend/src/main/java/org/sonar/python/regex/PythonAnalyzerRegexSource.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.util.ArrayList;
2323
import java.util.Arrays;
2424
import java.util.List;
25+
import java.util.Locale;
2526
import org.sonar.plugins.python.api.LocationInFile;
2627
import org.sonar.plugins.python.api.tree.StringElement;
2728
import org.sonar.plugins.python.api.tree.Token;
@@ -35,14 +36,17 @@ public class PythonAnalyzerRegexSource extends PythonRegexSource {
3536
private final int sourceStartOffset;
3637
private final int[] lineStartOffsets;
3738

39+
private final boolean isRawString;
40+
3841
public PythonAnalyzerRegexSource(StringElement s) {
3942
// TODO: Do we need the quote? If yes, don't hardcode
4043
super(s.trimmedQuotesValue(), '"');
44+
String prefix = s.prefix();
4145
Token firstToken = s.firstToken();
4246
sourceLine = firstToken.line();
43-
// TODO: The +1 represents the prefix size. Right now we only scan patterns with the raw prefix.
44-
sourceStartOffset = firstToken.column() + (s.isTripleQuoted() ? 3 : 1) + 1;
47+
sourceStartOffset = firstToken.column() + (s.isTripleQuoted() ? 3 : 1) + prefix.length();
4548
lineStartOffsets = lineStartOffsets(getSourceText());
49+
isRawString = prefix.toLowerCase(Locale.ROOT).contains("r");
4650
}
4751

4852
@Override
@@ -56,6 +60,10 @@ public LocationInFile locationInFileFor(IndexRange range) {
5660
return new LocationInFile(null, startLineAndOffset[0], startLineAndOffset[1], endLineAndOffset[0], endLineAndOffset[1]);
5761
}
5862

63+
public boolean isRawString() {
64+
return isRawString;
65+
}
66+
5967
private int[] lineAndOffset(int index) {
6068
int line;
6169
int offset;

python-frontend/src/main/java/org/sonar/python/regex/PythonStringCharacterParser.java

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,23 @@
2020
package org.sonar.python.regex;
2121

2222
import java.util.NoSuchElementException;
23+
import java.util.regex.Matcher;
24+
import java.util.regex.Pattern;
2325
import javax.annotation.Nullable;
2426
import org.sonarsource.analyzer.commons.regex.CharacterParser;
25-
import org.sonarsource.analyzer.commons.regex.RegexSource;
2627
import org.sonarsource.analyzer.commons.regex.ast.IndexRange;
2728
import org.sonarsource.analyzer.commons.regex.ast.SourceCharacter;
2829

2930
public class PythonStringCharacterParser implements CharacterParser {
3031

32+
private static final Pattern UNICODE_16_BIT_PATTERN = Pattern.compile("\\Au([0-9A-Fa-f]{4})");
33+
private static final Pattern UNICODE_32_BIT_PATTERN = Pattern.compile("\\AU([0-9A-Fa-f]{8})");
34+
private static final Pattern HEX_PATTERN = Pattern.compile("\\Ax([0-9A-Fa-f]{2})");
35+
private static final Pattern OCTAL_PATTERN = Pattern.compile("\\A([0-7]{1,3})");
36+
3137
final String sourceText;
3238
final int textLength;
33-
protected final RegexSource source;
39+
protected final PythonAnalyzerRegexSource source;
3440
protected int index;
3541
@Nullable
3642
private SourceCharacter current;
@@ -48,10 +54,21 @@ public void moveNext() {
4854
if (this.index >= this.textLength) {
4955
this.current = null;
5056
} else {
51-
this.current = this.createCharAndUpdateIndex(this.sourceText.charAt(this.index), 1);
57+
this.current = parsePythonCharacter();
5258
}
5359
}
5460

61+
private SourceCharacter parsePythonCharacter() {
62+
char ch = sourceText.charAt(index);
63+
if (!source.isRawString() && ch == '\\') {
64+
if (index + 1 >= textLength) {
65+
return createCharAndUpdateIndex('\\', 1);
66+
}
67+
return parsePythonEscapeSequence();
68+
}
69+
return createCharAndUpdateIndex(ch, 1);
70+
}
71+
5572
SourceCharacter createCharAndUpdateIndex(char ch, int length) {
5673
int startIndex = this.index;
5774
this.index += length;
@@ -77,4 +94,55 @@ public void resetTo(int index) {
7794
this.index = index;
7895
this.moveNext();
7996
}
97+
98+
private SourceCharacter parsePythonEscapeSequence() {
99+
char charAfterBackslash = sourceText.charAt(index + 1);
100+
switch (charAfterBackslash) {
101+
case '\n':
102+
// \NEWLINE is ignored in python. We skip both characters
103+
if (this.index + 2 >= this.textLength) {
104+
return null;
105+
}
106+
this.index += 2;
107+
this.moveNext();
108+
return getCurrent();
109+
case '\\':
110+
return createCharAndUpdateIndex('\\', 2);
111+
case '\'':
112+
return createCharAndUpdateIndex('\'', 2);
113+
case '"':
114+
return createCharAndUpdateIndex('"', 2);
115+
case 'a':
116+
return createCharAndUpdateIndex('\u0007', 2);
117+
case 'b':
118+
return createCharAndUpdateIndex('\b', 2);
119+
case 'f':
120+
return createCharAndUpdateIndex('\f', 2);
121+
case 'n':
122+
return createCharAndUpdateIndex('\n', 2);
123+
case 'r':
124+
return createCharAndUpdateIndex('\r', 2);
125+
case 't':
126+
return createCharAndUpdateIndex('\t', 2);
127+
case 'v':
128+
return createCharAndUpdateIndex('\u000b', 2);
129+
case 'u':
130+
return createCharacterFromPattern(UNICODE_16_BIT_PATTERN, 16, 2);
131+
case 'U':
132+
return createCharacterFromPattern(UNICODE_32_BIT_PATTERN, 16, 2);
133+
case 'x':
134+
return createCharacterFromPattern(HEX_PATTERN, 16, 2);
135+
default:
136+
return createCharacterFromPattern(OCTAL_PATTERN, 8, 1);
137+
}
138+
}
139+
140+
private SourceCharacter createCharacterFromPattern(Pattern pattern, int radix, int initialLength) {
141+
Matcher matcher = pattern.matcher(sourceText.substring(index + 1));
142+
if (matcher.find()) {
143+
String value = matcher.group(1);
144+
return createCharAndUpdateIndex((char) Integer.parseInt(value, radix), value.length() + initialLength);
145+
}
146+
return createCharAndUpdateIndex('\\', 1);
147+
}
80148
}

0 commit comments

Comments
 (0)