SONARPY-920 Handle regex in non-raw strings (#976)

karim-ouerghemmi-sonarsource · web-flow · commit 2b0d850bc5e2 · 2021-11-03T09:39:50.000+01:00
* Handle escape sequences in non-raw strings

* Do not handle strings that contain a unicode name escape sequence

* Handle ignored newline escape sequences

* Update ITS expected results

* Adapt testcases

* Add more tests

* Small refactoring

* Protect from NoSuchElementException
diff --git a/its/ruling/src/test/resources/expected/python-S5361.json b/its/ruling/src/test/resources/expected/python-S5361.json
@@ -1,8 +1,30 @@
 {
+'project:biopython/Bio/PDB/ic_rebuild.py':[
+244,
+248,
+],
+'project:biopython/Bio/PDB/internal_coords.py':[
+3255,
+],
+'project:biopython/Bio/SearchIO/InterproscanIO/interproscan_xml.py':[
+55,
+],
+'project:numpy-1.16.4/numpy/distutils/line_endings.py':[
+19,
+52,
+53,
+],
 'project:tensorflow/python/debug/cli/command_parser.py':[
 264,
 ],
+'project:tensorflow/python/debug/lib/grpc_debug_test_server.py':[
+64,
+67,
+],
 'project:tensorflow/python/keras/layers/einsum_dense.py':[
 212,
 ],
+'project:tensorflow/tools/pip_package/setup.py':[
+177,
+],
 }
diff --git a/its/ruling/src/test/resources/expected/python-S5843.json b/its/ruling/src/test/resources/expected/python-S5843.json
@@ -1,4 +1,8 @@
 {
+'project:buildbot-0.8.6p1/buildbot/changes/mail.py':[
+212,
+214,
+],
 'project:buildbot-0.8.6p1/buildbot/steps/shell.py':[
 382,
 ],
@@ -31,4 +35,7 @@
 'project:tornado-2.3/demos/blog/markdown.py':[
 718,
 ],
+'project:tornado-2.3/tornado/escape.py':[
+235,
+],
 }
diff --git a/its/ruling/src/test/resources/expected/python-S5850.json b/its/ruling/src/test/resources/expected/python-S5850.json
@@ -1,8 +1,14 @@
 {
+'project:biopython/Bio/GenBank/__init__.py':[
+1283,
+],
 'project:buildbot-0.8.6p1/buildbot/monkeypatches/sqlalchemy2189.py':[
 38,
 84,
 ],
+'project:buildbot-0.8.6p1/buildbot/steps/shell.py':[
+686,
+],
 'project:django-2.2.3/django/db/models/sql/constants.py':[
 19,
 ],
diff --git a/its/ruling/src/test/resources/expected/python-S5857.json b/its/ruling/src/test/resources/expected/python-S5857.json
@@ -15,6 +15,9 @@
 32,
 32,
 ],
+'project:mypy-0.782/mypy/test/teststubtest.py':[
+625,
+],
 'project:numpy-1.16.4/numpy/linalg/lapack_lite/clapack_scrub.py':[
 104,
 ],
@@ -31,4 +34,7 @@
 722,
 807,
 ],
+'project:twisted-12.1.0/twisted/words/protocols/oscar.py':[
+81,
+],
 }
diff --git a/its/ruling/src/test/resources/expected/python-S6035.json b/its/ruling/src/test/resources/expected/python-S6035.json
@@ -2,6 +2,10 @@
 'project:buildbot-0.8.6p1/buildbot/__init__.py':[
 33,
 ],
+'project:buildbot-0.8.6p1/buildbot/changes/mail.py':[
+122,
+122,
+],
 'project:buildbot-slave-0.8.6p1/buildslave/__init__.py':[
 33,
 ],
diff --git a/python-checks/src/main/java/org/sonar/python/checks/regex/AbstractRegexCheck.java b/python-checks/src/main/java/org/sonar/python/checks/regex/AbstractRegexCheck.java
@@ -23,6 +23,7 @@
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
@@ -36,6 +37,7 @@
 import org.sonar.plugins.python.api.tree.Expression;
 import org.sonar.plugins.python.api.tree.QualifiedExpression;
 import org.sonar.plugins.python.api.tree.RegularArgument;
+import org.sonar.plugins.python.api.tree.StringElement;
 import org.sonar.plugins.python.api.tree.StringLiteral;
 import org.sonar.plugins.python.api.tree.Tree;
 import org.sonar.python.regex.PythonRegexIssueLocation;
@@ -100,13 +102,28 @@ private void checkCall(SubscriptionContext ctx) {
   }
 
   private Optional<RegexParseResult> regexForStringLiteral(StringLiteral literal, FlagSet flagSet) {
-    // TODO: for now we only handle strings with an "r" prefix. This will be extended.
-    if (literal.stringElements().size() == 1 && "r".equalsIgnoreCase(literal.stringElements().get(0).prefix())) {
+    if (shouldHandleStringLiteral(literal)) {
       return Optional.of(regexContext.regexForStringElement(literal.stringElements().get(0), flagSet));
     }
     return Optional.empty();
   }
 
+  /**
+   * Decides whether the given literal can be parsed as a regular expression.
+   * A literal is skipped when any of the following holds:
+   *  - It is not made of exactly one string element (concatenations are not supported for now).
+   *  - Its single element is an f-string containing expressions. There is currently no good mechanism to evaluate them.
+   *  - Its single element is not raw and contains a \N{UNICODE NAME} escape sequence. In Java 8 we cannot make use of
+   *    Character.codePointOf in the character parser (SONARPY-922).
+   */
+  private static boolean shouldHandleStringLiteral(StringLiteral literal) {
+    boolean isSingleElement = literal.stringElements().size() == 1;
+    if (!isSingleElement) {
+      return false;
+    }
+    StringElement element = literal.stringElements().get(0);
+    if (!element.formattedExpressions().isEmpty()) {
+      return false;
+    }
+    boolean isRaw = element.prefix().toLowerCase(Locale.ROOT).contains("r");
+    return isRaw || !element.value().contains("\\N{");
+  }
+
   private static Optional<StringLiteral> patternArgStringLiteral(CallExpression regexFunctionCall) {
     RegularArgument patternArgument = TreeUtils.nthArgumentOrKeyword(0, "pattern", regexFunctionCall.arguments());
     if (patternArgument == null) {
diff --git a/python-checks/src/test/java/org/sonar/python/checks/regex/AbstractRegexCheckTest.java b/python-checks/src/test/java/org/sonar/python/checks/regex/AbstractRegexCheckTest.java
@@ -58,8 +58,8 @@ public void checkRegex(RegexParseResult regexParseResult, CallExpression regexFu
 
     PythonVisitorContext fileContext = TestPythonVisitorRunner.createContext(FILE);
     SubscriptionVisitor.analyze(Collections.singletonList(check), fileContext);
-    assertThat(check.reportedRegexTrees).hasSize(10);
-    assertThat(fileContext.getIssues()).hasSize(10);
+    assertThat(check.reportedRegexTrees).hasSize(11);
+    assertThat(fileContext.getIssues()).hasSize(11);
   }
 
   @Test
diff --git a/python-checks/src/test/resources/checks/regex/abstractRegexCheck.py b/python-checks/src/test/resources/checks/regex/abstractRegexCheck.py
@@ -9,9 +9,14 @@
 re.fullmatch(r'.*', "foo") # Noncompliant
 re.split(r'.*', "foo") # Noncompliant
 re.findall(r'.*', "foo") # Noncompliant
-re.finditer(r'.*', "foo") # Noncompliant
+re.finditer('.*', "foo") # Noncompliant
+
+re.match('.*\N{GREEK SMALL LETTER FINAL SIGMA}', 'foo') # We do ignore not raw strings containing \N escape sequences
+re.match(r'.*\N{GREEK SMALL LETTER FINAL SIGMA}', 'foo') # Noncompliant
+
+some_var = 'foo'
+re.match(f'.*{some_var}', 'foo') # We do ignore f-strings that do contain an expression
 
-re.sub('.*', "x", "a") # We only look at raw strings for now
 re.sub(r'.*' r'.*', "x", "a") # We do not look at concats for now
 re.sub() # Required arguments not provided
 re.not_relevant_method(r'.*', "x", "a")
diff --git a/python-frontend/src/main/java/org/sonar/python/regex/PythonAnalyzerRegexSource.java b/python-frontend/src/main/java/org/sonar/python/regex/PythonAnalyzerRegexSource.java
@@ -22,6 +22,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Locale;
 import org.sonar.plugins.python.api.LocationInFile;
 import org.sonar.plugins.python.api.tree.StringElement;
 import org.sonar.plugins.python.api.tree.Token;
@@ -35,14 +36,17 @@ public class PythonAnalyzerRegexSource extends PythonRegexSource {
   private final int sourceStartOffset;
   private final int[] lineStartOffsets;
 
+  private final boolean isRawString;
+
   public PythonAnalyzerRegexSource(StringElement s) {
     // TODO: Do we need the quote? If yes, don't hardcode
     super(s.trimmedQuotesValue(), '"');
+    String prefix = s.prefix();
     Token firstToken = s.firstToken();
     sourceLine = firstToken.line();
-    // TODO: The +1 represents the prefix size. Right now we only scan patterns with the raw prefix.
-    sourceStartOffset = firstToken.column() + (s.isTripleQuoted() ? 3 : 1) + 1;
+    sourceStartOffset = firstToken.column() + (s.isTripleQuoted() ? 3 : 1) + prefix.length();
     lineStartOffsets = lineStartOffsets(getSourceText());
+    isRawString = prefix.toLowerCase(Locale.ROOT).contains("r");
   }
 
   @Override
@@ -56,6 +60,10 @@ public LocationInFile locationInFileFor(IndexRange range) {
     return new LocationInFile(null, startLineAndOffset[0], startLineAndOffset[1], endLineAndOffset[0], endLineAndOffset[1]);
   }
 
+  /**
+   * Returns whether the string element this source was built from is a raw string,
+   * i.e. whether its prefix contains the letter "r" (compared case-insensitively).
+   */
+  public boolean isRawString() {
+    return isRawString;
+  }
+
   private int[] lineAndOffset(int index) {
     int line;
     int offset;
diff --git a/python-frontend/src/main/java/org/sonar/python/regex/PythonStringCharacterParser.java b/python-frontend/src/main/java/org/sonar/python/regex/PythonStringCharacterParser.java
@@ -20,17 +20,23 @@
 package org.sonar.python.regex;
 
 import java.util.NoSuchElementException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import javax.annotation.Nullable;
 import org.sonarsource.analyzer.commons.regex.CharacterParser;
-import org.sonarsource.analyzer.commons.regex.RegexSource;
 import org.sonarsource.analyzer.commons.regex.ast.IndexRange;
 import org.sonarsource.analyzer.commons.regex.ast.SourceCharacter;
 
 public class PythonStringCharacterParser implements CharacterParser {
 
+  // Patterns for the numeric escape sequences, applied to the text that follows the backslash.
+  // Each one is anchored with \A so that it only matches at the very start of that text,
+  // and captures the digits (without the leading marker letter, if any) in group 1.
+  private static final Pattern UNICODE_16_BIT_PATTERN = Pattern.compile("\\Au([0-9A-Fa-f]{4})");
+  private static final Pattern UNICODE_32_BIT_PATTERN = Pattern.compile("\\AU([0-9A-Fa-f]{8})");
+  private static final Pattern HEX_PATTERN = Pattern.compile("\\Ax([0-9A-Fa-f]{2})");
+  private static final Pattern OCTAL_PATTERN = Pattern.compile("\\A([0-7]{1,3})");
+
   final String sourceText;
   final int textLength;
-  protected final RegexSource source;
+  protected final PythonAnalyzerRegexSource source;
   protected int index;
   @Nullable
   private SourceCharacter current;
@@ -48,10 +54,21 @@ public void moveNext() {
     if (this.index >= this.textLength) {
       this.current = null;
     } else {
-      this.current = this.createCharAndUpdateIndex(this.sourceText.charAt(this.index), 1);
+      this.current = parsePythonCharacter();
     }
   }
 
+  /**
+   * Reads the character located at the current index. In a raw string every character is taken literally.
+   * Otherwise, a backslash followed by at least one more character is handed over to the escape sequence parsing,
+   * while a backslash that is the very last character of the text is returned as a plain backslash.
+   */
+  private SourceCharacter parsePythonCharacter() {
+    char currentChar = sourceText.charAt(index);
+    boolean startsEscapeSequence = currentChar == '\\' && !source.isRawString();
+    if (!startsEscapeSequence) {
+      return createCharAndUpdateIndex(currentChar, 1);
+    }
+    boolean hasFollowingCharacter = index + 1 < textLength;
+    return hasFollowingCharacter ? parsePythonEscapeSequence() : createCharAndUpdateIndex('\\', 1);
+  }
+
   SourceCharacter createCharAndUpdateIndex(char ch, int length) {
     int startIndex = this.index;
     this.index += length;
@@ -77,4 +94,55 @@ public void resetTo(int index) {
     this.index = index;
     this.moveNext();
   }
+
+  /**
+   * Parses the escape sequence that starts with the backslash located at the current index.
+   * The caller guarantees that at least one character follows that backslash.
+   *
+   * Single-character escapes are mapped to the character they stand for, numeric escapes (octal, hexadecimal,
+   * 16-bit and 32-bit unicode) are decoded through their pattern, and anything else is read as a plain backslash.
+   *
+   * @return the decoded character, or null when only ignored line continuations remain until the end of the text
+   */
+  private SourceCharacter parsePythonEscapeSequence() {
+    char charAfterBackslash = sourceText.charAt(index + 1);
+    switch (charAfterBackslash) {
+      case '\n':
+        // \NEWLINE is ignored in python. We skip both characters and carry on with whatever follows.
+        // The next character is parsed directly instead of going through moveNext()/getCurrent(): when several
+        // line continuations end the text, the nested call leaves no current character to read.
+        this.index += 2;
+        return this.index < this.textLength ? parsePythonCharacter() : null;
+      case '\\':
+        return createCharAndUpdateIndex('\\', 2);
+      case '\'':
+        return createCharAndUpdateIndex('\'', 2);
+      case '"':
+        return createCharAndUpdateIndex('"', 2);
+      case 'a':
+        // Bell
+        return createCharAndUpdateIndex('\u0007', 2);
+      case 'b':
+        return createCharAndUpdateIndex('\b', 2);
+      case 'f':
+        return createCharAndUpdateIndex('\f', 2);
+      case 'n':
+        return createCharAndUpdateIndex('\n', 2);
+      case 'r':
+        return createCharAndUpdateIndex('\r', 2);
+      case 't':
+        return createCharAndUpdateIndex('\t', 2);
+      case 'v':
+        // Vertical tab
+        return createCharAndUpdateIndex('\u000b', 2);
+      case 'u':
+        return createCharacterFromPattern(UNICODE_16_BIT_PATTERN, 16, 2);
+      case 'U':
+        return createCharacterFromPattern(UNICODE_32_BIT_PATTERN, 16, 2);
+      case 'x':
+        return createCharacterFromPattern(HEX_PATTERN, 16, 2);
+      default:
+        // Either an octal escape (no marker letter, hence an initial length of 1) or an unknown escape,
+        // in which case the backslash is kept as a literal character.
+        return createCharacterFromPattern(OCTAL_PATTERN, 8, 1);
+    }
+  }
+
+  /**
+   * Decodes a numeric escape sequence (octal, hexadecimal or unicode) located right after the backslash
+   * at the current index.
+   *
+   * @param pattern pattern anchored on the character following the backslash; its first group captures the digits
+   * @param radix radix used to interpret the captured digits
+   * @param initialLength number of characters of the sequence that are not digits (the backslash and the optional marker letter)
+   * @return the decoded character, or a plain backslash (consuming a single character) when the text
+   *         does not form a valid escape sequence
+   */
+  private SourceCharacter createCharacterFromPattern(Pattern pattern, int radix, int initialLength) {
+    // Restrict the matcher to the text after the backslash instead of copying it with substring():
+    // with the default anchoring bounds, \A matches at the start of the region.
+    Matcher matcher = pattern.matcher(sourceText).region(index + 1, sourceText.length());
+    if (matcher.lookingAt()) {
+      String value = matcher.group(1);
+      // Up to 8 hexadecimal digits can be captured: parsing as a long avoids a NumberFormatException
+      // for values above Integer.MAX_VALUE.
+      long codePoint = Long.parseLong(value, radix);
+      if (codePoint <= Character.MAX_CODE_POINT) {
+        // NOTE(review): code points above U+FFFF are still truncated by the cast, a single char cannot hold them.
+        return createCharAndUpdateIndex((char) codePoint, value.length() + initialLength);
+      }
+      // Values outside of the Unicode range are not valid escapes: fall through and keep the backslash as is.
+    }
+    return createCharAndUpdateIndex('\\', 1);
+  }
 }
diff --git a/python-frontend/src/test/java/org/sonar/python/regex/PythonStringCharacterParserTest.java b/python-frontend/src/test/java/org/sonar/python/regex/PythonStringCharacterParserTest.java
@@ -19,6 +19,8 @@
  */
 package org.sonar.python.regex;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.NoSuchElementException;
 import org.junit.Test;
 import org.sonar.plugins.python.api.tree.StringElement;
@@ -61,6 +63,50 @@ public void test_reset_to() {
     assertThat(characterParser.getCurrent().getCharacter()).isEqualTo('a');
   }
 
+  @Test
+  public void test_escaping_has_no_meaning_in_raw_string() {
+    // With the r prefix, the backslash and the letter that follows are read as two separate characters.
+    assertThat(chars("r'\\n'")).containsExactly('\\', 'n');
+  }
+
+  @Test
+  public void test_different_escape_sequences() {
+    // Backslash followed by a newline: both characters are skipped, also at the end of the string.
+    assertThat(chars("'a\\\nb'")).containsExactly('a', 'b');
+    assertThat(chars("'a\\\n'")).containsExactly('a');
+    // Single-character escape sequences.
+    assertThat(chars("'\\\\'")).containsExactly('\\');
+    assertThat(chars("'\\''")).containsExactly('\'');
+    assertThat(chars("'\\\"'")).containsExactly('"');
+    assertThat(chars("'\\a'")).containsExactly('\u0007');
+    assertThat(chars("'\\b'")).containsExactly('\b');
+    assertThat(chars("'\\f'")).containsExactly('\f');
+    assertThat(chars("'\\n'")).containsExactly('\n');
+    assertThat(chars("'\\r'")).containsExactly('\r');
+    assertThat(chars("'\\t'")).containsExactly('\t');
+    assertThat(chars("'\\v'")).containsExactly('\u000b');
+    // Numeric escape sequences: 16-bit unicode, 32-bit unicode, hexadecimal and octal.
+    assertThat(chars("'\\u0041'")).containsExactly('A');
+    assertThat(chars("'\\U00000041'")).containsExactly('A');
+    assertThat(chars("'\\x41'")).containsExactly('A');
+    assertThat(chars("'\\101'")).containsExactly('A');
+
+    // A backslash followed by a character that starts no known escape sequence is kept as is.
+    assertThat(chars("'\\y'")).containsExactly('\\', 'y');
+  }
+
+  @Test
+  public void invalid_escape_sequences() {
+    // Too few hexadecimal digits after the marker letter: every character of the sequence is read literally.
+    assertThat(chars("'\\x4'")).containsExactly('\\', 'x', '4');
+    assertThat(chars("'\\u4'")).containsExactly('\\', 'u', '4');
+    assertThat(chars("'\\U4'")).containsExactly('\\', 'U', '4');
+  }
+
+  /**
+   * Runs the character parser over the given Python string literal and collects every character it yields, in order.
+   */
+  private List<Character> chars(String s) {
+    List<Character> collected = new ArrayList<>();
+    for (CharacterParser parser = getCharacterParser(s); parser.isNotAtEnd(); parser.moveNext()) {
+      collected.add(parser.getCurrent().getCharacter());
+    }
+    return collected;
+  }
+
   private CharacterParser getCharacterParser(String s) {
     PythonAnalyzerRegexSource regexSource = new PythonAnalyzerRegexSource(stringElement(s));
     return regexSource.createCharacterParser();

Original file line number	Diff line number	Diff line change
`@@ -58,8 +58,8 @@ public void checkRegex(RegexParseResult regexParseResult, CallExpression regexFu`
`58`	`58`
`59`	`59`	`PythonVisitorContext fileContext = TestPythonVisitorRunner.createContext(FILE);`
`60`	`60`	`SubscriptionVisitor.analyze(Collections.singletonList(check), fileContext);`
`61`		`- assertThat(check.reportedRegexTrees).hasSize(10);`
`62`		`- assertThat(fileContext.getIssues()).hasSize(10);`
	`61`	`+ assertThat(check.reportedRegexTrees).hasSize(11);`
	`62`	`+ assertThat(fileContext.getIssues()).hasSize(11);`
`63`	`63`	`}`
`64`	`64`
`65`	`65`	`@Test`