SONARPY-928 Nested character classes are not supported by Python (#984)

nils-werner-sonarsource · web-flow · commit 213aeceb9f73 · 2021-11-05T11:18:24.000+01:00
diff --git a/its/ruling/src/test/resources/expected/python-S5856.json b/its/ruling/src/test/resources/expected/python-S5856.json
@@ -1,21 +1,8 @@
 {
-'project:biopython/Bio/motifs/pfm.py':[
-338,
-],
-'project:mypy-0.782/test-data/stdlib-samples/3.2/glob.py':[
-76,
-77,
-],
-'project:numpy-1.16.4/numpy/distutils/mingw32ccompiler.py':[
-53,
-],
-'project:tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py':[
-75,
-],
 'project:tornado-2.3/demos/appengine/markdown.py':[
-826,
+822,
 ],
 'project:tornado-2.3/demos/blog/markdown.py':[
-826,
+822,
 ],
 }
diff --git a/pom.xml b/pom.xml
@@ -91,7 +91,7 @@
     <mockito.version>3.9.0</mockito.version>
     <sonar.version>8.9.0.43852</sonar.version>
     <sonar.orchestrator.version>3.35.1.2719</sonar.orchestrator.version>
-    <sonar-analyzer-commons.version>1.21.0.821</sonar-analyzer-commons.version>
+    <sonar-analyzer-commons.version>1.21.0.829</sonar-analyzer-commons.version>
     <sonarlint-core.version>6.0.0.32513</sonarlint-core.version>
     <sslr.version>1.23</sslr.version>
     <protobuf.version>3.17.3</protobuf.version>
diff --git a/python-checks/src/test/resources/checks/regex/duplicatesInCharacterClassCheck.py b/python-checks/src/test/resources/checks/regex/duplicatesInCharacterClassCheck.py
@@ -31,12 +31,6 @@ def non_compliant(input):
     re.match(r"[\"\".]", input)  # Noncompliant
     re.match(r"[\x{F600}-\x{F637}\x{F608}]", input)  # Noncompliant
     re.match(r"[\Qxx\E]", input)  # Noncompliant
-    re.match(r"[[a][a]]", input)  # Noncompliant
-    re.match(r"[[abc][b]]", input)  # Noncompliant
-    re.match(r"[[^a]b]", input)  # Noncompliant
-    re.match(r"[[^a]z]", input)  # Noncompliant
-    re.match(r"[a[^z]]", input)  # Noncompliant
-    re.match(r"[z[^a]]", input)  # Noncompliant
     re.match(r"[\s\Sx]", input)  # Noncompliant
     re.match(r"(?U)[\s\Sx]", input)  # Noncompliant
     re.match(r"[\w\d]", input)  # Noncompliant
@@ -54,14 +48,15 @@ def non_compliant(input):
     re.match(r"(?i)[äÄ]", input) # Noncompliant
     re.match(r"(?i)[Ä-Üä]", input) # Noncompliant
     re.match(r"(?i)[a-Öö]", input) # Noncompliant
+    re.match(r"[[^\s\S]x]", input) # Noncompliant
+    re.match(r"(?U)[[^\W]a]", input)  # Noncompliant
 
 
 def compliant(input):
     re.match(r"a-z\d", input)
     re.match(r"[0-9][0-9]?", input)
     re.match(r"[xX]", input)
     re.match(r"[\s\S]", input)
-    re.match(r"[[^\s\S]x]", input)
     re.match(r"(?U)[\s\S]", input)
     re.match(r"(?U)[\S\u0085\u2028\u2029]", input)
     re.match(r"[\d\D]", input)
@@ -85,8 +80,6 @@ def compliant(input):
     re.match(r"[z-a9-0]", input)  # Illegal character class should not make the check explode
     re.match(r"[aa", input)  # Check should not run on syntactically invalid regexen
     re.match(r"(?U)[\wä]", input)  # False negative because we don't support Unicode characters in \w and \W
-    re.match(r"(?U)[[^\W]a]", input)  # False negative because once we negate a character class whose contents we don't
-    # fully understand, we ignore it to avoid false positives
     re.match(r"[[a-z&&b-e]c]", input)  # FN because we don't support intersections
     re.match(r"(?i)[A-_d-{]", input)  # FN because we ignore case insensitivity unless both ends of the ranges are letters
     re.match(r"(?i)[A-z_]", input)  # FN because A-z gets misinterpreted as A-Za-z due to the way we handle case insensitivity
diff --git a/python-checks/src/test/resources/checks/regex/invalidRegexCheck.py b/python-checks/src/test/resources/checks/regex/invalidRegexCheck.py
@@ -25,6 +25,12 @@ def unsupported_feature(input):
 
 
 def false_positives():
-    re.compile(r"\s*([ACGT])\s*[[]*[|]*\s*([0-9.\s]+)\s*[]]*\s*")  # Noncompliant
-    re.compile(r'^\s+\[([\s*[0-9]*)\] ([a-zA-Z0-9_]*)')  # Noncompliant
-    re.compile(r'([^,[\]]*)(\[([^\]]+)\])?$')  # Noncompliant
+    re.compile(r'''
+          # Match tail of: [text][id]
+          [ ]?          # one optional space
+          (?:\n[ ]*)?   # one optional newline followed by spaces
+          \[
+            (?P<id>.*?)
+          \]
+        ''', re.X | re.S)
+    # Noncompliant@-5