SONARPY-1572: Fix parse error for challenging unicode characters (#2022)

joke1196 · web-flow · commit 6d4c9c5d3dcc · 2024-09-30T15:04:10.000+02:00
diff --git a/python-frontend/src/main/java/org/sonar/python/lexer/PythonLexer.java b/python-frontend/src/main/java/org/sonar/python/lexer/PythonLexer.java
@@ -29,6 +29,7 @@
 import org.sonar.python.api.PythonTokenType;
 
 import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.and;
+import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.or;
 import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.commentRegexp;
 import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.o2n;
 import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.regexp;
@@ -39,6 +40,7 @@ public final class PythonLexer {
   private static final String BYTES_PREFIX = "([bB][Rr]?|[rR][bB]?)";
   private static final String IMAGINARY_SUFFIX = "(j|J)";
   private static final String LONG_INTEGER_SUFFIX = "(l|L)";
+  private static final String UNICODE_CHAR = "[^\u0000-\u007F]"; // any character outside 7-bit ASCII; NOTE(review): this also admits non-letter code points (e.g. non-ASCII spaces/punctuation) - confirm the leniency is intended
   private static final String IDENTIFIER_START = "[\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nl}_]";
   private static final String IDENTIFIER_CONTINUE = "[" + IDENTIFIER_START + "\\p{Mn}\\p{Mc}\\p{Nd}\\p{Pc}]";
 
@@ -79,7 +81,7 @@ private static void addCommonChannels(Lexer.Builder builder, LexerState lexerSta
 
       //https://docs.python.org/3.6/reference/lexical_analysis.html#formatted-string-literals
       .withChannel(new FStringChannel(lexerState))
-      
+
       // http://docs.python.org/release/3.2/reference/lexical_analysis.html#string-and-bytes-literals
       .withChannel(regexp(PythonTokenType.STRING, BYTES_PREFIX + SINGLE_QUOTE_STRING))
       .withChannel(regexp(PythonTokenType.STRING, BYTES_PREFIX + DOUBLE_QUOTES_STRING))
@@ -101,7 +103,9 @@ private static void addCommonChannels(Lexer.Builder builder, LexerState lexerSta
       .withChannel(regexp(PythonTokenType.NUMBER, "0(_?0)*+" + LONG_INTEGER_SUFFIX + "?+"))
 
       // http://docs.python.org/reference/lexical_analysis.html#identifiers
-      .withChannel(new IdentifierAndKeywordChannel(and(IDENTIFIER_START, o2n(IDENTIFIER_CONTINUE)), true, PythonKeyword.values()))
+      .withChannel(new IdentifierAndKeywordChannel(and(
+        or(IDENTIFIER_START, UNICODE_CHAR),
+        o2n(or(IDENTIFIER_CONTINUE, UNICODE_CHAR))), true, PythonKeyword.values()))
 
       // http://docs.python.org/reference/lexical_analysis.html#operators
       // http://docs.python.org/reference/lexical_analysis.html#delimiters
diff --git a/python-frontend/src/test/resources/parser/python/own/tricky_unicode_symbols.py b/python-frontend/src/test/resources/parser/python/own/tricky_unicode_symbols.py
@@ -0,0 +1,9 @@
+ä = 1
+µ = 2
+蟒 = 3
+x󠄀 = 4
+មុ = 1
+Q̇_per_meter = 4
+
+A᧚ = 3
+A፩ = 8