Skip to content

Commit b9dc756

Browse files
SONARPY-888 Rule S5868 Unicode Grapheme Clusters should be avoided inside regex character classes (#967)
1 parent 525b00c commit b9dc756

File tree

7 files changed

+130
-0
lines changed

7 files changed

+130
-0
lines changed

python-checks/src/main/java/org/sonar/python/checks/CheckList.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
import org.sonar.python.checks.hotspots.UnverifiedHostnameCheck;
5252
import org.sonar.python.checks.regex.AnchorPrecedenceCheck;
5353
import org.sonar.python.checks.regex.EmptyStringRepetitionCheck;
54+
import org.sonar.python.checks.regex.GraphemeClustersInClassesCheck;
5455
import org.sonar.python.checks.regex.SingleCharacterAlternationCheck;
5556
import org.sonar.python.checks.regex.RedundantRegexAlternativesCheck;
5657
import org.sonar.python.checks.regex.RegexLookaheadCheck;
@@ -130,6 +131,7 @@ public static Iterable<Class> getChecks() {
130131
FunctionReturnTypeCheck.class,
131132
FunctionUsingLoopVariableCheck.class,
132133
GenericExceptionRaisedCheck.class,
134+
GraphemeClustersInClassesCheck.class,
133135
HardCodedCredentialsCheck.class,
134136
HardcodedIPCheck.class,
135137
HashingDataCheck.class,
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* SonarQube Python Plugin
3+
* Copyright (C) 2011-2021 SonarSource SA
4+
* mailto:info AT sonarsource DOT com
5+
*
6+
* This program is free software; you can redistribute it and/or
7+
* modify it under the terms of the GNU Lesser General Public
8+
* License as published by the Free Software Foundation; either
9+
* version 3 of the License, or (at your option) any later version.
10+
*
11+
* This program is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14+
* Lesser General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Lesser General Public License
17+
* along with this program; if not, write to the Free Software Foundation,
18+
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19+
*/
20+
package org.sonar.python.checks.regex;
21+
22+
import org.sonar.check.Rule;
23+
import org.sonar.plugins.python.api.tree.CallExpression;
24+
import org.sonarsource.analyzer.commons.regex.RegexParseResult;
25+
import org.sonarsource.analyzer.commons.regex.finders.GraphemeInClassFinder;
26+
27+
@Rule(key = "S5868")
28+
public class GraphemeClustersInClassesCheck extends AbstractRegexCheck {
29+
30+
@Override
31+
public void checkRegex(RegexParseResult regexParseResult, CallExpression regexFunctionCall) {
32+
new GraphemeInClassFinder(this::addIssue).visit(regexParseResult);
33+
}
34+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<p>When placing Unicode <a href="https://unicode.org/glossary/#grapheme_cluster">Grapheme Clusters</a> (characters which need to be encoded as
2+
multiple <a href="https://unicode.org/glossary/#code_point">Code Points</a>) inside a character class of a regular expression, this will likely lead
3+
to unintended behavior.</p>
4+
<p>For instance, the grapheme cluster <code>c̈</code> requires two code points: one for <code>'c'</code>, followed by one for the <em>umlaut</em>
5+
modifier <code>'\u{0308}'</code>. If placed within a character class, such as <code>[c̈]</code>, the regex will treat the character class as the
6+
enumeration <code>[c\u{0308}]</code> instead. It will, therefore, match every <code>'c'</code> and every <em>umlaut</em> that isn’t expressed as a
7+
single codepoint, which is extremely unlikely to be the intended behavior.</p>
8+
<p>This rule raises an issue every time Unicode Grapheme Clusters are used within a character class of a regular expression.</p>
9+
<h2>Noncompliant Code Example</h2>
10+
<pre>
11+
re.sub(r"[c̈d̈]", "X", "cc̈d̈d"); # Noncompliant, returns "XXXXXX" instead of the expected "cXXd".
12+
</pre>
13+
<h2>Compliant Solution</h2>
14+
<pre>
15+
re.sub(r"c̈|d̈", "X", "cc̈d̈d"); # returns "cXXd"
16+
</pre>
17+
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"title": "Unicode Grapheme Clusters should be avoided inside regex character classes",
3+
"type": "BUG",
4+
"status": "ready",
5+
"remediation": {
6+
"func": "Constant\/Issue",
7+
"constantCost": "5min"
8+
},
9+
"tags": [
10+
"regex"
11+
],
12+
"defaultSeverity": "Major",
13+
"ruleSpecification": "RSPEC-5868",
14+
"sqKey": "S5868",
15+
"scope": "Main",
16+
"quickfix": "unknown"
17+
}

python-checks/src/main/resources/org/sonar/l10n/py/rules/python/Sonar_way_profile.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@
137137
"S5850",
138138
"S5855",
139139
"S5864",
140+
"S5868",
140141
"S5886",
141142
"S5890",
142143
"S6002",
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
* SonarQube Python Plugin
3+
* Copyright (C) 2011-2021 SonarSource SA
4+
* mailto:info AT sonarsource DOT com
5+
*
6+
* This program is free software; you can redistribute it and/or
7+
* modify it under the terms of the GNU Lesser General Public
8+
* License as published by the Free Software Foundation; either
9+
* version 3 of the License, or (at your option) any later version.
10+
*
11+
* This program is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14+
* Lesser General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Lesser General Public License
17+
* along with this program; if not, write to the Free Software Foundation,
18+
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19+
*/
20+
package org.sonar.python.checks.regex;
21+
22+
import org.junit.Test;
23+
import org.sonar.python.checks.utils.PythonCheckVerifier;
24+
25+
public class GraphemeClustersInClassesCheckTest {
26+
27+
@Test
28+
public void test() {
29+
PythonCheckVerifier.verify("src/test/resources/checks/regex/graphemeClustersInClassesCheck.py", new GraphemeClustersInClassesCheck());
30+
}
31+
32+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import re
2+
3+
4+
def non_compliant(input):
5+
re.match(r'[aaaèaaa]', input) # Noncompliant {{Extract 1 Grapheme Cluster(s) from this character class.}}
6+
# ^^^^^^^^^^
7+
re.match(r'[0Ṩ0]', input) # Noncompliant {{Extract 1 Grapheme Cluster(s) from this character class.}}
8+
re.match(r'aaa[è]aaa', input) # Noncompliant
9+
# two secondary per line: one for the regex location, and one for the cluster location
10+
re.match(r'[èaaèaaè]', input) # Noncompliant {{Extract 3 Grapheme Cluster(s) from this character class.}}
11+
re.match(r'[èa-dä]', input) # Noncompliant
12+
re.match(r'[èa]aaa[dè]', input) # Noncompliant 2
13+
re.match(r'[ä]', input) # Noncompliant
14+
re.match(r'[c̈]', input) # Noncompliant
15+
re.match(r'[e⃝]', input) # Noncompliant
16+
17+
18+
def compliant(input):
19+
re.match(r'[é]', input) # Compliant, a single char
20+
re.match(r'[e\u0300]', input) # Compliant, escaped unicode
21+
re.match(r'[e\x{0300}]', input) # Compliant, escaped unicode
22+
re.match(r'[e\u20DD̀]', input) # Compliant, (letter, escaped unicode, mark) can not be combined
23+
re.match(r'[\u0300e]', input) # Compliant, escaped unicode, letter
24+
re.match(r'[̀̀]', input) # Compliant, two marks
25+
re.match(r'[̀̀]', input) # Compliant, one mark
26+
27+
re.match(r'/ä/', input) # Compliant, not in a class

0 commit comments

Comments
 (0)