SONARPY-889 Rule S5869 Character classes in regular expressions should not contain the same character twice (#972)

nils-werner-sonarsource · web-flow · commit d8773c7e676c · 2021-11-02T13:27:58.000+01:00
* SONARPY-889 Rule S5869 Character classes in regular expressions should not contain the same character twice * Update ITs
diff --git a/its/ruling/src/test/resources/expected/python-S5869.json b/its/ruling/src/test/resources/expected/python-S5869.json
@@ -0,0 +1,63 @@
+{
+'project:biopython/Bio/PDB/parse_pdb_header.py':[
+206,
+],
+'project:biopython/Bio/Phylo/PAML/_parse_codeml.py':[
+165,
+],
+'project:biopython/Bio/SCOP/__init__.py':[
+128,
+],
+'project:buildbot-0.8.6p1/buildbot/process/mtrlogobserver.py':[
+85,
+86,
+],
+'project:django-2.2.3/django/core/signing.py':[
+49,
+],
+'project:django-2.2.3/django/test/client.py':[
+36,
+],
+'project:mypy-0.782/mypy/test/data.py':[
+455,
+],
+'project:numpy-1.16.4/numpy/distutils/command/build_src.py':[
+720,
+720,
+756,
+],
+'project:numpy-1.16.4/numpy/distutils/cpuinfo.py':[
+420,
+],
+'project:numpy-1.16.4/numpy/distutils/fcompiler/__init__.py':[
+976,
+],
+'project:numpy-1.16.4/numpy/distutils/from_template.py':[
+87,
+88,
+131,
+211,
+],
+'project:numpy-1.16.4/numpy/distutils/misc_util.py':[
+438,
+],
+'project:numpy-1.16.4/numpy/f2py/crackfortran.py':[
+303,
+2555,
+],
+'project:tensorflow/tools/tensorflow_builder/config_detector/data/cuda_compute_capability.py':[
+74,
+],
+'project:tornado-2.3/demos/appengine/markdown.py':[
+273,
+803,
+803,
+820,
+],
+'project:tornado-2.3/demos/blog/markdown.py':[
+273,
+803,
+803,
+820,
+],
+}
diff --git a/python-checks/src/main/java/org/sonar/python/checks/CheckList.java b/python-checks/src/main/java/org/sonar/python/checks/CheckList.java
@@ -50,6 +50,7 @@
 import org.sonar.python.checks.hotspots.UnsafeHttpMethodsCheck;
 import org.sonar.python.checks.hotspots.UnverifiedHostnameCheck;
 import org.sonar.python.checks.regex.AnchorPrecedenceCheck;
+import org.sonar.python.checks.regex.DuplicatesInCharacterClassCheck;
 import org.sonar.python.checks.regex.EmptyStringRepetitionCheck;
 import org.sonar.python.checks.regex.GraphemeClustersInClassesCheck;
 import org.sonar.python.checks.regex.RegexComplexityCheck;
@@ -110,6 +111,7 @@ public static Iterable<Class> getChecks() {
       DuplicateArgumentCheck.class,
       DuplicatedMethodFieldNamesCheck.class,
       DuplicatedMethodImplementationCheck.class,
+      DuplicatesInCharacterClassCheck.class,
       DynamicCodeExecutionCheck.class,
       ElseAfterLoopsWithoutBreakCheck.class,
       EmailSendingCheck.class,
diff --git a/python-checks/src/main/java/org/sonar/python/checks/regex/DuplicatesInCharacterClassCheck.java b/python-checks/src/main/java/org/sonar/python/checks/regex/DuplicatesInCharacterClassCheck.java
@@ -0,0 +1,34 @@
+/*
+ * SonarQube Python Plugin
+ * Copyright (C) 2011-2021 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+package org.sonar.python.checks.regex;
+
+import org.sonar.check.Rule;
+import org.sonar.plugins.python.api.tree.CallExpression;
+import org.sonarsource.analyzer.commons.regex.RegexParseResult;
+import org.sonarsource.analyzer.commons.regex.finders.DuplicatesInCharacterClassFinder;
+
+@Rule(key = "S5869")
+public class DuplicatesInCharacterClassCheck extends AbstractRegexCheck {
+
+  @Override
+  public void checkRegex(RegexParseResult regexParseResult, CallExpression regexFunctionCall) {
+    new DuplicatesInCharacterClassFinder(this::addIssue).visit(regexParseResult);
+  }
+}
diff --git a/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S5869.html b/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S5869.html
@@ -0,0 +1,19 @@
+<p>Character classes in regular expressions are a convenient way to match one of several possible characters by listing the allowed characters or
+ranges of characters. If the same character is listed twice in the same character class or if the character class contains overlapping ranges, this
+has no effect.</p>
+<p>Thus duplicate characters in a character class are either a simple oversight or a sign that a range in the character class matches more than is
+intended or that the author misunderstood how character classes work and wanted to match more than one character. A common example of the latter
+mistake is trying to use a range like <code>[0-99]</code> to match numbers of up to two digits, when in fact it is equivalent to <code>[0-9]</code>.
+Another common cause is forgetting to escape the <code>-</code> character, creating an unintended range that overlaps with other characters in the
+character class.</p>
+<h2>Noncompliant Code Example</h2>
+<pre>
+r"[0-99]" # Noncompliant, this won't actually match strings with two digits
+r"[0-9.-_]" # Noncompliant, .-_ is a range that already contains 0-9 (as well as various other characters such as capital letters)
+</pre>
+<h2>Compliant Solution</h2>
+<pre>
+r"[0-9]{1,2}"
+r"[0-9.\-_]"
+</pre>
+
diff --git a/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S5869.json b/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S5869.json
@@ -0,0 +1,17 @@
+{
+  "title": "Character classes in regular expressions should not contain the same character twice",
+  "type": "CODE_SMELL",
+  "status": "ready",
+  "remediation": {
+    "func": "Constant\/Issue",
+    "constantCost": "5min"
+  },
+  "tags": [
+    "regex"
+  ],
+  "defaultSeverity": "Major",
+  "ruleSpecification": "RSPEC-5869",
+  "sqKey": "S5869",
+  "scope": "Main",
+  "quickfix": "unknown"
+}
diff --git a/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/Sonar_way_profile.json b/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/Sonar_way_profile.json
@@ -140,6 +140,7 @@
     "S5857",
     "S5864",
     "S5868",
+    "S5869",
     "S5886",
     "S5890",
     "S6002",
diff --git a/python-checks/src/test/java/org/sonar/python/checks/regex/DuplicatesInCharacterClassCheckTest.java b/python-checks/src/test/java/org/sonar/python/checks/regex/DuplicatesInCharacterClassCheckTest.java
@@ -0,0 +1,33 @@
+/*
+ * SonarQube Python Plugin
+ * Copyright (C) 2011-2021 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+package org.sonar.python.checks.regex;
+
+import org.junit.Test;
+import org.sonar.python.checks.utils.PythonCheckVerifier;
+
+import static org.junit.Assert.*;
+
+public class DuplicatesInCharacterClassCheckTest {
+
+  @Test
+  public void test() {
+    PythonCheckVerifier.verify("src/test/resources/checks/regex/duplicatesInCharacterClassCheck.py", new DuplicatesInCharacterClassCheck());
+  }
+}
diff --git a/python-checks/src/test/resources/checks/regex/duplicatesInCharacterClassCheck.py b/python-checks/src/test/resources/checks/regex/duplicatesInCharacterClassCheck.py
@@ -0,0 +1,101 @@
+import re
+
+
+def non_compliant(input):  # Fixture: every regex below is marked as an expected S5869 issue by its trailing comment.
+    re.match(r"[0-99]", input)  # Noncompliant {{Remove duplicates in this character class.}}
+    #           ^^^
+    #              ^@-1< {{Additional duplicate}}
+    re.match(r"[90-9]", input)  # Noncompliant
+    re.match(r"[0-73-9]", input)  # Noncompliant
+    re.match(r"[0-93-57]", input)  # Noncompliant
+    re.match(r"[4-92-68]", input)  # Noncompliant
+    re.match(r"[0-33-9]", input)  # Noncompliant
+    re.match(r"[0-70-9]", input)  # Noncompliant
+    re.match(r"[3-90-7]", input)  # Noncompliant
+    re.match(r"[3-50-9]", input)  # Noncompliant
+    re.match(r"[xxx]", input)  # Noncompliant
+    re.match(r"[A-z_]", input)  # Noncompliant
+    re.match(r"(?i)[A-Za-z]", input)  # Noncompliant
+    re.match(r"(?i)[A-_d]", input)  # Noncompliant
+    re.match(r"(?iu)[Ä-Üä]", input)  # Noncompliant
+    re.match(r"(?iu)[a-Öö]", input)  # Noncompliant
+    re.match(r"[  ]", input)  # Noncompliant
+    re.match(r"(?i)[  ]", input)  # Noncompliant
+    re.match(r"(?iu)[  ]", input)  # Noncompliant
+    re.match(r"(?i)[A-_D]", input)  # Noncompliant
+    re.match(r"(?iu)[A-_D]", input)  # Noncompliant
+    re.match(r"(?i)[xX]", input)  # Noncompliant
+    re.match(r"(?iu)[äÄ]", input)  # Noncompliant
+    re.match(r"(?iU)[äÄ]", input)  # Noncompliant
+    re.match(r"(?iu)[xX]", input)  # Noncompliant
+    re.match(r"[\"\".]", input)  # Noncompliant
+    re.match(r"[\x{F600}-\x{F637}\x{F608}]", input)  # Noncompliant
+    re.match(r"[\Qxx\E]", input)  # Noncompliant
+    re.match(r"[[a][a]]", input)  # Noncompliant
+    re.match(r"[[abc][b]]", input)  # Noncompliant
+    re.match(r"[[^a]b]", input)  # Noncompliant
+    re.match(r"[[^a]z]", input)  # Noncompliant
+    re.match(r"[a[^z]]", input)  # Noncompliant
+    re.match(r"[z[^a]]", input)  # Noncompliant
+    re.match(r"[\s\Sx]", input)  # Noncompliant
+    re.match(r"(?U)[\s\Sx]", input)  # Noncompliant
+    re.match(r"[\w\d]", input)  # Noncompliant
+    re.match(r"[\wa]", input)  # Noncompliant
+    re.match(r"[\d1]", input)  # Noncompliant
+    re.match(r"[\d1-3]", input)  # Noncompliant
+    re.match(r"(?U)[\wa]", input)  # Noncompliant
+    re.match(r"[A-Za-z]", input, re.IGNORECASE)  # Noncompliant
+    re.match(r"[0-9\d]", input)  # Noncompliant
+    re.match(r"[0-9\d]", input)  # Noncompliant
+    re.match(r"[0-9\\\d]", input)  # Noncompliant
+    re.match(r"(?(?=1)[0-99])", input)  # Noncompliant
+    re.match(r"(?(?=1)1|[0-99])", input)  # Noncompliant
+    # UNICODE flag is always enabled
+    re.match(r"(?i)[äÄ]", input) # Noncompliant
+    re.match(r"(?i)[Ä-Üä]", input) # Noncompliant
+    re.match(r"(?i)[a-Öö]", input) # Noncompliant
+
+
+def compliant(input):  # Fixture: no regex below carries an issue marker; several are documented false negatives (see inline notes).
+    re.match(r"a-z\d", input)
+    re.match(r"[0-9][0-9]?", input)
+    re.match(r"[xX]", input)
+    re.match(r"[\s\S]", input)
+    re.match(r"[[^\s\S]x]", input)
+    re.match(r"(?U)[\s\S]", input)
+    re.match(r"(?U)[\S\u0085\u2028\u2029]", input)
+    re.match(r"[\d\D]", input)
+    re.match(r"(?U)[\d\D]", input)
+    re.match(r"[\w\W]", input)
+    re.match(r"(?U)[\w\W]", input)
+    re.match(r"[\wä]", input)
+    re.match(r"(?i)[äÄ]", input, re.ASCII)
+    re.match(r"(?i)[Ä-Üä]", input, re.ASCII)
+    re.match(r"(?u)[äÄ]", input)
+    re.match(r"(?u)[xX]", input)
+    re.match(r"[ab-z]", input)
+    re.match(r"[[a][b]]", input)
+    re.match(r"[[^a]a]", input)
+    re.match(r"[Z-ax]", input, re.IGNORECASE)
+    re.match(r"(?i)[a-Öö]", input, re.ASCII)
+    re.match(r"[0-9\Q.-_\E]", input)  # This used to falsely interpret .-_ as a range and complain that it overlaps with 0-9
+    re.match(r"[A-Z\Q-_.\E]", input)
+    re.match(r"[\x00\x01]]", input)  # This used to falsely complain about x and 0 being duplicates
+    re.match(r"[\x00-\x01\x02-\x03]]", input)
+    re.match(r"[z-a9-0]", input)  # Illegal character class should not make the check explode
+    re.match(r"[aa", input)  # Check should not run on syntactically invalid regexen
+    re.match(r"(?U)[\wä]", input)  # False negative because we don't support Unicode characters in \w and \W
+    re.match(r"(?U)[[^\W]a]", input)  # False negative because once we negate a character class whose contents we don't
+    # fully understand, we ignore it to avoid false positives
+    re.match(r"[[a-z&&b-e]c]", input)  # FN because we don't support intersections
+    re.match(r"(?i)[A-_d-{]", input)  # FN because we ignore case insensitivity unless both ends of the ranges are letters
+    re.match(r"(?i)[A-z_]", input)  # FN because A-z gets misinterpreted as A-Za-z due to the way we handle case insensitivity
+    re.match(r"[\p{Armenian}x]", input)  # FN because we don't support \p at the moment
+    re.match(r"[\abc]", input)
+    re.match(r'[\s\'"\:\{\}\[\],&\*\#\?]', input)
+    re.match(r"[0-9\\d]", input)  # Compliant
+
+
+def emoji(input):  # Fixture: classes built from emoji literals and \ud800-style escapes; neither is expected to be reported.
+    re.match(r"[😂😊]", input)  # Compliant
+    re.match(r"[^\ud800\udc00-\udbff\udfff]", input)  # Compliant