29
29
import org .sonar .python .api .PythonTokenType ;
30
30
31
31
import static com .sonar .sslr .impl .channel .RegexpChannelBuilder .and ;
32
+ import static com .sonar .sslr .impl .channel .RegexpChannelBuilder .or ;
32
33
import static com .sonar .sslr .impl .channel .RegexpChannelBuilder .commentRegexp ;
33
34
import static com .sonar .sslr .impl .channel .RegexpChannelBuilder .o2n ;
34
35
import static com .sonar .sslr .impl .channel .RegexpChannelBuilder .regexp ;
@@ -39,6 +40,7 @@ public final class PythonLexer {
39
40
// Regex for a string-literal prefix: b/B optionally followed by r/R, or r/R optionally followed by b/B
// (so a lone r/R also matches). Prepended to the quoted-string patterns of the STRING channels.
private static final String BYTES_PREFIX = "([bB][Rr]?|[rR][bB]?)" ;
40
41
// Regex group matching a single 'j' or 'J'.
// NOTE(review): presumably appended to the numeric patterns for imaginary literals; usage is not visible here - confirm.
private static final String IMAGINARY_SUFFIX = "(j|J)" ;
41
42
// Regex group matching a single 'l' or 'L'; the NUMBER channels append it as an optional (possessive "?+") suffix.
private static final String LONG_INTEGER_SUFFIX = "(l|L)" ;
43
+ // Matches any single character outside the ASCII range U+0000..U+007F.
+ // Uses regex-level \xhh escapes rather than Java source-level unicode escapes, so the pattern string holds
+ // no raw control characters and the range cannot be broken by stray whitespace inside the class.
+ private static final String UNICODE_CHAR = "[^\\x00-\\x7F]";
42
44
// Character class for the first character of an identifier: Unicode general categories
// Lu, Ll, Lt, Lm, Lo and Nl, plus the underscore. Each category must be written as \p{..} with no
// space after the backslash; "\ p{Lu}" would be an escaped space followed by literal characters.
private static final String IDENTIFIER_START = "[\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nl}_]";
43
45
// Character class for the characters after the first one in an identifier: everything accepted by
// IDENTIFIER_START plus Unicode general categories Mn, Mc, Nd and Pc.
// IDENTIFIER_START is itself a bracketed class; nesting it here forms a union in java.util.regex.
private static final String IDENTIFIER_CONTINUE = "[" + IDENTIFIER_START + "\\p{Mn}\\p{Mc}\\p{Nd}\\p{Pc}]";
44
46
@@ -79,7 +81,7 @@ private static void addCommonChannels(Lexer.Builder builder, LexerState lexerSta
79
81
80
82
//https://docs.python.org/3.6/reference/lexical_analysis.html#formatted-string-literals
81
83
.withChannel (new FStringChannel (lexerState ))
82
-
84
+
83
85
// http://docs.python.org/release/3.2/reference/lexical_analysis.html#string-and-bytes-literals
84
86
.withChannel (regexp (PythonTokenType .STRING , BYTES_PREFIX + SINGLE_QUOTE_STRING ))
85
87
.withChannel (regexp (PythonTokenType .STRING , BYTES_PREFIX + DOUBLE_QUOTES_STRING ))
@@ -101,7 +103,9 @@ private static void addCommonChannels(Lexer.Builder builder, LexerState lexerSta
101
103
.withChannel (regexp (PythonTokenType .NUMBER , "0(_?0)*+" + LONG_INTEGER_SUFFIX + "?+" ))
102
104
103
105
// http://docs.python.org/reference/lexical_analysis.html#identifiers
104
- .withChannel (new IdentifierAndKeywordChannel (and (IDENTIFIER_START , o2n (IDENTIFIER_CONTINUE )), true , PythonKeyword .values ()))
106
+ .withChannel (new IdentifierAndKeywordChannel (and (
107
+ or (IDENTIFIER_START , UNICODE_CHAR ),
108
+ o2n (or (IDENTIFIER_CONTINUE , UNICODE_CHAR ))), true , PythonKeyword .values ()))
105
109
106
110
// http://docs.python.org/reference/lexical_analysis.html#operators
107
111
// http://docs.python.org/reference/lexical_analysis.html#delimiters
0 commit comments