Skip to content

Commit 6d4c9c5

Browse files
authored
SONARPY-1572: Fix parse error for challenging unicode characters (#2022)
1 parent fbfb393 commit 6d4c9c5

File tree

2 files changed

+15
-2
lines changed

2 files changed

+15
-2
lines changed

python-frontend/src/main/java/org/sonar/python/lexer/PythonLexer.java

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@
2929
import org.sonar.python.api.PythonTokenType;
3030

3131
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.and;
32+
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.or;
3233
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.commentRegexp;
3334
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.o2n;
3435
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.regexp;
@@ -39,6 +40,7 @@ public final class PythonLexer {
3940
private static final String BYTES_PREFIX = "([bB][Rr]?|[rR][bB]?)";
4041
private static final String IMAGINARY_SUFFIX = "(j|J)";
4142
private static final String LONG_INTEGER_SUFFIX = "(l|L)";
43+
private static final String UNICODE_CHAR = "[^\u0000-\u007F]";
4244
private static final String IDENTIFIER_START = "[\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nl}_]";
4345
private static final String IDENTIFIER_CONTINUE = "[" + IDENTIFIER_START + "\\p{Mn}\\p{Mc}\\p{Nd}\\p{Pc}]";
4446

@@ -79,7 +81,7 @@ private static void addCommonChannels(Lexer.Builder builder, LexerState lexerSta
7981

8082
//https://docs.python.org/3.6/reference/lexical_analysis.html#formatted-string-literals
8183
.withChannel(new FStringChannel(lexerState))
82-
84+
8385
// http://docs.python.org/release/3.2/reference/lexical_analysis.html#string-and-bytes-literals
8486
.withChannel(regexp(PythonTokenType.STRING, BYTES_PREFIX + SINGLE_QUOTE_STRING))
8587
.withChannel(regexp(PythonTokenType.STRING, BYTES_PREFIX + DOUBLE_QUOTES_STRING))
@@ -101,7 +103,9 @@ private static void addCommonChannels(Lexer.Builder builder, LexerState lexerSta
101103
.withChannel(regexp(PythonTokenType.NUMBER, "0(_?0)*+" + LONG_INTEGER_SUFFIX + "?+"))
102104

103105
// http://docs.python.org/reference/lexical_analysis.html#identifiers
104-
.withChannel(new IdentifierAndKeywordChannel(and(IDENTIFIER_START, o2n(IDENTIFIER_CONTINUE)), true, PythonKeyword.values()))
106+
.withChannel(new IdentifierAndKeywordChannel(and(
107+
or(IDENTIFIER_START, UNICODE_CHAR),
108+
o2n(or(IDENTIFIER_CONTINUE, UNICODE_CHAR))), true, PythonKeyword.values()))
105109

106110
// http://docs.python.org/reference/lexical_analysis.html#operators
107111
// http://docs.python.org/reference/lexical_analysis.html#delimiters
Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
ä = 1
2+
µ = 2
3+
= 3
4+
x󠄀 = 4
5+
មុ = 1
6+
Q̇_per_meter = 4
7+
8+
A᧚ = 3
9+
A፩ = 8

0 commit comments

Comments
 (0)