Skip to content

Commit d8773c7

Browse files
SONARPY-889 Rule S5869 Character classes in regular expressions should not contain the same character twice (#972)
* SONARPY-889 Rule S5869 Character classes in regular expressions should not contain the same character twice * Update ITs
1 parent 83d078c commit d8773c7

File tree

8 files changed

+270
-0
lines changed

8 files changed

+270
-0
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
{
2+
'project:biopython/Bio/PDB/parse_pdb_header.py':[
3+
206,
4+
],
5+
'project:biopython/Bio/Phylo/PAML/_parse_codeml.py':[
6+
165,
7+
],
8+
'project:biopython/Bio/SCOP/__init__.py':[
9+
128,
10+
],
11+
'project:buildbot-0.8.6p1/buildbot/process/mtrlogobserver.py':[
12+
85,
13+
86,
14+
],
15+
'project:django-2.2.3/django/core/signing.py':[
16+
49,
17+
],
18+
'project:django-2.2.3/django/test/client.py':[
19+
36,
20+
],
21+
'project:mypy-0.782/mypy/test/data.py':[
22+
455,
23+
],
24+
'project:numpy-1.16.4/numpy/distutils/command/build_src.py':[
25+
720,
26+
720,
27+
756,
28+
],
29+
'project:numpy-1.16.4/numpy/distutils/cpuinfo.py':[
30+
420,
31+
],
32+
'project:numpy-1.16.4/numpy/distutils/fcompiler/__init__.py':[
33+
976,
34+
],
35+
'project:numpy-1.16.4/numpy/distutils/from_template.py':[
36+
87,
37+
88,
38+
131,
39+
211,
40+
],
41+
'project:numpy-1.16.4/numpy/distutils/misc_util.py':[
42+
438,
43+
],
44+
'project:numpy-1.16.4/numpy/f2py/crackfortran.py':[
45+
303,
46+
2555,
47+
],
48+
'project:tensorflow/tools/tensorflow_builder/config_detector/data/cuda_compute_capability.py':[
49+
74,
50+
],
51+
'project:tornado-2.3/demos/appengine/markdown.py':[
52+
273,
53+
803,
54+
803,
55+
820,
56+
],
57+
'project:tornado-2.3/demos/blog/markdown.py':[
58+
273,
59+
803,
60+
803,
61+
820,
62+
],
63+
}

python-checks/src/main/java/org/sonar/python/checks/CheckList.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import org.sonar.python.checks.hotspots.UnsafeHttpMethodsCheck;
5151
import org.sonar.python.checks.hotspots.UnverifiedHostnameCheck;
5252
import org.sonar.python.checks.regex.AnchorPrecedenceCheck;
53+
import org.sonar.python.checks.regex.DuplicatesInCharacterClassCheck;
5354
import org.sonar.python.checks.regex.EmptyStringRepetitionCheck;
5455
import org.sonar.python.checks.regex.GraphemeClustersInClassesCheck;
5556
import org.sonar.python.checks.regex.RegexComplexityCheck;
@@ -110,6 +111,7 @@ public static Iterable<Class> getChecks() {
110111
DuplicateArgumentCheck.class,
111112
DuplicatedMethodFieldNamesCheck.class,
112113
DuplicatedMethodImplementationCheck.class,
114+
DuplicatesInCharacterClassCheck.class,
113115
DynamicCodeExecutionCheck.class,
114116
ElseAfterLoopsWithoutBreakCheck.class,
115117
EmailSendingCheck.class,
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* SonarQube Python Plugin
3+
* Copyright (C) 2011-2021 SonarSource SA
4+
* mailto:info AT sonarsource DOT com
5+
*
6+
* This program is free software; you can redistribute it and/or
7+
* modify it under the terms of the GNU Lesser General Public
8+
* License as published by the Free Software Foundation; either
9+
* version 3 of the License, or (at your option) any later version.
10+
*
11+
* This program is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14+
* Lesser General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Lesser General Public License
17+
* along with this program; if not, write to the Free Software Foundation,
18+
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19+
*/
20+
package org.sonar.python.checks.regex;
21+
22+
import org.sonar.check.Rule;
23+
import org.sonar.plugins.python.api.tree.CallExpression;
24+
import org.sonarsource.analyzer.commons.regex.RegexParseResult;
25+
import org.sonarsource.analyzer.commons.regex.finders.DuplicatesInCharacterClassFinder;
26+
27+
/**
 * Python check registered under rule key S5869 ("Character classes in regular expressions
 * should not contain the same character twice").
 *
 * <p>This class contains no detection logic of its own: it hands each parsed regular expression
 * to a {@link DuplicatesInCharacterClassFinder} and lets the finder report through this check's
 * {@code addIssue} method.
 */
@Rule(key = "S5869")
28+
public class DuplicatesInCharacterClassCheck extends AbstractRegexCheck {
29+
30+
/**
 * Runs a fresh {@link DuplicatesInCharacterClassFinder} over one parsed regular expression.
 *
 * @param regexParseResult parse result that the finder visits
 * @param regexFunctionCall call expression passed in alongside the parse result; not used here
 */
@Override
31+
public void checkRegex(RegexParseResult regexParseResult, CallExpression regexFunctionCall) {
32+
// A new finder is created per call, with this::addIssue as its reporting callback.
new DuplicatesInCharacterClassFinder(this::addIssue).visit(regexParseResult);
33+
}
34+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<p>Character classes in regular expressions are a convenient way to match one of several possible characters by listing the allowed characters or
2+
ranges of characters. If the same character is listed twice in the same character class or if the character class contains overlapping ranges, this
3+
has no effect.</p>
4+
<p>Thus duplicate characters in a character class are either a simple oversight or a sign that a range in the character class matches more than is
5+
intended or that the author misunderstood how character classes work and wanted to match more than one character. A common example of the latter
6+
mistake is trying to use a range like <code>[0-99]</code> to match numbers of up to two digits, when in fact it is equivalent to <code>[0-9]</code>.
7+
Another common cause is forgetting to escape the <code>-</code> character, creating an unintended range that overlaps with other characters in the
8+
character class.</p>
9+
<h2>Noncompliant Code Example</h2>
10+
<pre>
11+
r"[0-99]" # Noncompliant, this won't actually match strings with two digits
12+
r"[0-9.-_]" # Noncompliant, .-_ is a range that already contains 0-9 (as well as various other characters such as capital letters)
13+
</pre>
14+
<h2>Compliant Solution</h2>
15+
<pre>
16+
r"[0-9]{1,2}"
17+
r"[0-9.\-_]"
18+
</pre>
19+
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"title": "Character classes in regular expressions should not contain the same character twice",
3+
"type": "CODE_SMELL",
4+
"status": "ready",
5+
"remediation": {
6+
"func": "Constant\/Issue",
7+
"constantCost": "5min"
8+
},
9+
"tags": [
10+
"regex"
11+
],
12+
"defaultSeverity": "Major",
13+
"ruleSpecification": "RSPEC-5869",
14+
"sqKey": "S5869",
15+
"scope": "Main",
16+
"quickfix": "unknown"
17+
}

python-checks/src/main/resources/org/sonar/l10n/py/rules/python/Sonar_way_profile.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@
140140
"S5857",
141141
"S5864",
142142
"S5868",
143+
"S5869",
143144
"S5886",
144145
"S5890",
145146
"S6002",
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* SonarQube Python Plugin
3+
* Copyright (C) 2011-2021 SonarSource SA
4+
* mailto:info AT sonarsource DOT com
5+
*
6+
* This program is free software; you can redistribute it and/or
7+
* modify it under the terms of the GNU Lesser General Public
8+
* License as published by the Free Software Foundation; either
9+
* version 3 of the License, or (at your option) any later version.
10+
*
11+
* This program is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14+
* Lesser General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Lesser General Public License
17+
* along with this program; if not, write to the Free Software Foundation,
18+
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19+
*/
20+
package org.sonar.python.checks.regex;
21+
22+
import org.junit.Test;
23+
import org.sonar.python.checks.utils.PythonCheckVerifier;
24+
25+
import static org.junit.Assert.*;
26+
27+
/**
 * Unit test for {@link DuplicatesInCharacterClassCheck}, driven by the Python fixture file
 * {@code duplicatesInCharacterClassCheck.py}.
 */
public class DuplicatesInCharacterClassCheckTest {
28+
29+
/** Runs the check against the fixture through {@link PythonCheckVerifier#verify}. */
@Test
30+
public void test() {
31+
// The fixture path is relative; presumably resolved against the module's working directory.
PythonCheckVerifier.verify("src/test/resources/checks/regex/duplicatesInCharacterClassCheck.py", new DuplicatesInCharacterClassCheck());
32+
}
33+
}
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import re
2+
3+
4+
def non_compliant(input):
# Fixture: every re.match call in this function carries a trailing expectation marker, i.e. the
# S5869 check is expected to report it. Only the first case also pins the issue message, the
# highlighted span and a secondary location.
5+
re.match(r"[0-99]", input) # Noncompliant {{Remove duplicates in this character class.}}
6+
# ^^^
7+
# ^@-1< {{Additional duplicate}}
8+
re.match(r"[90-9]", input) # Noncompliant
9+
re.match(r"[0-73-9]", input) # Noncompliant
10+
re.match(r"[0-93-57]", input) # Noncompliant
11+
re.match(r"[4-92-68]", input) # Noncompliant
12+
re.match(r"[0-33-9]", input) # Noncompliant
13+
re.match(r"[0-70-9]", input) # Noncompliant
14+
re.match(r"[3-90-7]", input) # Noncompliant
15+
re.match(r"[3-50-9]", input) # Noncompliant
16+
re.match(r"[xxx]", input) # Noncompliant
17+
re.match(r"[A-z_]", input) # Noncompliant
18+
re.match(r"(?i)[A-Za-z]", input) # Noncompliant
19+
re.match(r"(?i)[A-_d]", input) # Noncompliant
20+
re.match(r"(?iu)[Ä-Üä]", input) # Noncompliant
21+
re.match(r"(?iu)[a-Öö]", input) # Noncompliant
22+
re.match(r"[ ]", input) # Noncompliant
23+
re.match(r"(?i)[ ]", input) # Noncompliant
24+
re.match(r"(?iu)[ ]", input) # Noncompliant
25+
re.match(r"(?i)[A-_D]", input) # Noncompliant
26+
re.match(r"(?iu)[A-_D]", input) # Noncompliant
27+
re.match(r"(?i)[xX]", input) # Noncompliant
28+
re.match(r"(?iu)[äÄ]", input) # Noncompliant
29+
re.match(r"(?iU)[äÄ]", input) # Noncompliant
30+
re.match(r"(?iu)[xX]", input) # Noncompliant
31+
re.match(r"[\"\".]", input) # Noncompliant
32+
re.match(r"[\x{F600}-\x{F637}\x{F608}]", input) # Noncompliant
33+
re.match(r"[\Qxx\E]", input) # Noncompliant
34+
re.match(r"[[a][a]]", input) # Noncompliant
35+
re.match(r"[[abc][b]]", input) # Noncompliant
36+
re.match(r"[[^a]b]", input) # Noncompliant
37+
re.match(r"[[^a]z]", input) # Noncompliant
38+
re.match(r"[a[^z]]", input) # Noncompliant
39+
re.match(r"[z[^a]]", input) # Noncompliant
40+
re.match(r"[\s\Sx]", input) # Noncompliant
41+
re.match(r"(?U)[\s\Sx]", input) # Noncompliant
42+
re.match(r"[\w\d]", input) # Noncompliant
43+
re.match(r"[\wa]", input) # Noncompliant
44+
re.match(r"[\d1]", input) # Noncompliant
45+
re.match(r"[\d1-3]", input) # Noncompliant
46+
re.match(r"(?U)[\wa]", input) # Noncompliant
47+
re.match(r"[A-Za-z]", input, re.IGNORECASE) # Noncompliant
48+
re.match(r"[0-9\d]", input) # Noncompliant
49+
re.match(r"[0-9\d]", input) # Noncompliant
50+
re.match(r"[0-9\\\d]", input) # Noncompliant
51+
re.match(r"(?(?=1)[0-99])", input) # Noncompliant
52+
re.match(r"(?(?=1)1|[0-99])", input) # Noncompliant
53+
# UNICODE flag is always enabled
54+
re.match(r"(?i)[äÄ]", input) # Noncompliant
55+
re.match(r"(?i)[Ä-Üä]", input) # Noncompliant
56+
re.match(r"(?i)[a-Öö]", input) # Noncompliant
57+
58+
59+
def compliant(input):
# Fixture: none of the re.match calls in this function carries an expectation marker, so the
# S5869 check is expected to stay silent on all of them. The inline notes below record why some
# cases are listed: former false positives, malformed patterns, and known false negatives (FN).
60+
re.match(r"a-z\d", input)
61+
re.match(r"[0-9][0-9]?", input)
62+
re.match(r"[xX]", input)
63+
re.match(r"[\s\S]", input)
64+
re.match(r"[[^\s\S]x]", input)
65+
re.match(r"(?U)[\s\S]", input)
66+
re.match(r"(?U)[\S\u0085\u2028\u2029]", input)
67+
re.match(r"[\d\D]", input)
68+
re.match(r"(?U)[\d\D]", input)
69+
re.match(r"[\w\W]", input)
70+
re.match(r"(?U)[\w\W]", input)
71+
re.match(r"[\wä]", input)
72+
re.match(r"(?i)[äÄ]", input, re.ASCII)
73+
re.match(r"(?i)[Ä-Üä]", input, re.ASCII)
74+
re.match(r"(?u)[äÄ]", input)
75+
re.match(r"(?u)[xX]", input)
76+
re.match(r"[ab-z]", input)
77+
re.match(r"[[a][b]]", input)
78+
re.match(r"[[^a]a]", input)
79+
re.match(r"[Z-ax]", input, re.IGNORECASE)
80+
re.match(r"(?i)[a-Öö]", input, re.ASCII)
81+
re.match(r"[0-9\Q.-_\E]", input) # This used to falsely interpret .-_ as a range and complain that it overlaps with 0-9
82+
re.match(r"[A-Z\Q-_.\E]", input)
83+
re.match(r"[\x00\x01]]", input) # This used to falsely complain about x and 0 being duplicates
84+
re.match(r"[\x00-\x01\x02-\x03]]", input)
85+
re.match(r"[z-a9-0]", input) # Illegal character class should not make the check explode
86+
re.match(r"[aa", input) # Check should not run on syntactically invalid regexen
87+
re.match(r"(?U)[\wä]", input) # False negative because we don't support Unicode characters in \w and \W
88+
re.match(r"(?U)[[^\W]a]", input) # False negative because once we negate a character class whose contents we don't
89+
# fully understand, we ignore it to avoid false positives
90+
re.match(r"[[a-z&&b-e]c]", input) # FN because we don't support intersections
91+
re.match(r"(?i)[A-_d-{]", input) # FN because we ignore case insensitivity unless both ends of the ranges are letters
92+
re.match(r"(?i)[A-z_]", input) # FN because A-z gets misinterpreted as A-Za-z due to the way we handle case insensitivity
93+
re.match(r"[\p{Armenian}x]", input) # FN because we don't support \p at the moment
94+
re.match(r"[\abc]", input)
95+
re.match(r'[\s\'"\:\{\}\[\],&\*\#\?]', input)
96+
re.match(r"[0-9\\d]", input) # Compliant
97+
98+
99+
def emoji(input):
# Fixture: character classes built from emoji literals and \uXXXX surrogate-range escapes.
# Both cases are marked as expected to raise no issue.
100+
re.match(r"[😂😊]", input) # Compliant
101+
re.match(r"[^\ud800\udc00-\udbff\udfff]", input) # Compliant

0 commit comments

Comments
 (0)