Skip to content

Commit 94d3595

Browse files
SONARPY-624 Fix parsing error on non-ASCII characters in identifiers (#653)
1 parent 7a92904 commit 94d3595

File tree

2 files changed

+9
-1
lines changed

2 files changed

+9
-1
lines changed

python-frontend/src/main/java/org/sonar/python/lexer/PythonLexer.java

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,8 @@ public final class PythonLexer {
4141
private static final String IMAGINARY_SUFFIX = "(j|J)";
4242
private static final String LONG_INTEGER_SUFFIX = "(l|L)";
4343
private static final String FORMATTED_STRING_PREFIX = "([fF][rR]?|[rR][fF]?)";
44+
private static final String IDENTIFIER_START = "[\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nl}_]";
45+
private static final String IDENTIFIER_CONTINUE = "[" + IDENTIFIER_START + "\\p{Mn}\\p{Mc}\\p{Nd}\\p{Pc}]";
4446

4547
private PythonLexer() {
4648
}
@@ -87,7 +89,7 @@ public static Lexer create(Charset charset, LexerState lexerState) {
8789
.withChannel(regexp(PythonTokenType.NUMBER, "0(_?0)*+" + LONG_INTEGER_SUFFIX + "?+"))
8890

8991
// http://docs.python.org/reference/lexical_analysis.html#identifiers
90-
.withChannel(new IdentifierAndKeywordChannel(and("[a-zA-Z_]", o2n("\\w")), true, PythonKeyword.values()))
92+
.withChannel(new IdentifierAndKeywordChannel(and(IDENTIFIER_START, o2n(IDENTIFIER_CONTINUE)), true, PythonKeyword.values()))
9193

9294
// http://docs.python.org/reference/lexical_analysis.html#operators
9395
// http://docs.python.org/reference/lexical_analysis.html#delimiters

python-frontend/src/test/java/org/sonar/python/lexer/PythonLexerTest.java

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -304,4 +304,10 @@ public void mixed_tabs_spaces() {
304304

305305
}
306306

307+
@Test
308+
public void non_ascii_characters() {
309+
assertThat(lexer.lex("_hello123"), hasToken(GenericTokenType.IDENTIFIER));
310+
assertThat(lexer.lex("こんにちは"), hasToken(GenericTokenType.IDENTIFIER));
311+
assertThat(lexer.lex("_你好"), hasToken(GenericTokenType.IDENTIFIER));
312+
}
307313
}

0 commit comments

Comments
 (0)