Skip to content

Commit 263f1b5

Browse files
authored
Merge pull request #1646 from guwirth/optimize-lexer
improve number lexer
2 parents 4fe19d2 + ed29517 commit 263f1b5

File tree

2 files changed

+50
-43
lines changed

2 files changed

+50
-43
lines changed

cxx-squid/src/main/java/org/sonar/cxx/lexer/CxxLexer.java

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@
2727
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.ANY_CHAR;
2828
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.and;
2929
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.commentRegexp;
30+
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.g;
3031
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.o2n;
3132
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.opt;
33+
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.or;
3234
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.regexp;
3335
import com.sonar.sslr.impl.channel.UnknownCharacterChannel;
3436
import org.sonar.cxx.CxxConfiguration;
@@ -42,13 +44,17 @@
4244
public final class CxxLexer {
4345

4446
private static final String HEX_PREFIX = "0[xX]";
45-
private static final String EXPONENT = "([Ee][+-]?+[0-9_]([']?+[0-9_]++)*+)";
46-
private static final String BINARY_EXPONENT = "([pP][+-]?+[0-9]([']?+[0-9]++)*+)"; // since C++17
47+
private static final String BIN_PREFIX = "0[bB]";
48+
private static final String EXPONENT = "[Ee][+-]?+[0-9_]([']?+[0-9_]++)*+";
49+
private static final String BINARY_EXPONENT = "[pP][+-]?+[0-9]([']?+[0-9]++)*+"; // since C++17
4750
//private static final String INTEGER_SUFFIX = "(((U|u)(i64|LL|ll|L|l)?)|((i64|LL|ll|L|l)(u|U)?))";
4851
//private static final String FLOAT_SUFFIX = "(f|l|F|L)";
4952
// ud-suffix: identifier (including INTEGER_SUFFIX, FLOAT_SUFFIX)
50-
private static final String UD_SUFFIX = "([_a-zA-Z]([_a-zA-Z0-9]*+))";
51-
private static final String HEXDIGIT_SEQUENCE = "([0-9a-fA-F]([']?+[0-9a-fA-F]++)*+)";
53+
private static final String UD_SUFFIX = "[_a-zA-Z][_a-zA-Z0-9]*+";
54+
private static final String DECDIGIT_SEQUENCE = "[0-9]([']?+[0-9]++)*+";
55+
private static final String HEXDIGIT_SEQUENCE = "[0-9a-fA-F]([']?+[0-9a-fA-F]++)*+";
56+
private static final String BINDIGIT_SEQUENCE = "[01]([']?+[01]++)*+";
57+
private static final String POINT = "\\.";
5258

5359
private CxxLexer() {
5460
}
@@ -77,25 +83,22 @@ public static Lexer create(CxxConfiguration conf, Preprocessor... preprocessors)
7783
.withChannel(new CharacterLiteralsChannel())
7884
// C++ Standard, Section 2.14.5 "String literals"
7985
.withChannel(new StringLiteralsChannel())
80-
// C++ Standard, Section 2.14.4 "Floating literals"
81-
.withChannel(regexp(CxxTokenType.NUMBER, "[0-9]([']?+[0-9]++)*+\\.([0-9]([']?+[0-9]++)*+)*+"
82-
+ opt(EXPONENT) + opt(UD_SUFFIX)))
83-
.withChannel(regexp(CxxTokenType.NUMBER, "\\.[0-9]([']?+[0-9]++)*+"
84-
+ opt(EXPONENT) + opt(UD_SUFFIX)))
85-
.withChannel(regexp(CxxTokenType.NUMBER, "[0-9]([']?+[0-9]++)*+"
86-
+ EXPONENT + opt(UD_SUFFIX)))
87-
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE
88-
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
89-
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE + "."
90-
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
91-
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + opt(HEXDIGIT_SEQUENCE) + "." + HEXDIGIT_SEQUENCE
92-
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
86+
9387
// C++ Standard, Section 2.14.2 "Integer literals"
94-
.withChannel(regexp(CxxTokenType.NUMBER, "[1-9]([']?+[0-9]++)*+" + opt(UD_SUFFIX))) // Decimal literals
95-
.withChannel(regexp(CxxTokenType.NUMBER, "0[bB][01]([']?+[01]++)*+" + opt(UD_SUFFIX))) // Binary Literals
96-
.withChannel(regexp(CxxTokenType.NUMBER, "0([']?+[0-7]++)++" + opt(UD_SUFFIX))) // Octal Literals
97-
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE + opt(UD_SUFFIX))) // Hex Literals
98-
.withChannel(regexp(CxxTokenType.NUMBER, "0" + opt(UD_SUFFIX))) // Decimal zero
88+
// C++ Standard, Section 2.14.4 "Floating literals"
89+
.withChannel(
90+
regexp(CxxTokenType.NUMBER,
91+
and(
92+
or(
93+
g(POINT, DECDIGIT_SEQUENCE, opt(g(EXPONENT))),
94+
g(HEX_PREFIX, opt(g(HEXDIGIT_SEQUENCE)), opt(POINT), opt(g(HEXDIGIT_SEQUENCE)), opt(g(BINARY_EXPONENT))),
95+
g(BIN_PREFIX, BINDIGIT_SEQUENCE),
96+
g(DECDIGIT_SEQUENCE, opt(POINT), opt(g(DECDIGIT_SEQUENCE)), opt(g(EXPONENT)))
97+
),
98+
opt(g(UD_SUFFIX))
99+
)
100+
)
101+
)
99102

100103
// C++ Standard, Section 2.14.7 "Pointer literals"
101104
.withChannel(regexp(CxxTokenType.NUMBER, CxxKeyword.NULLPTR.getValue() + "\\b"))

cxx-squid/src/main/java/org/sonar/cxx/preprocessor/CppLexer.java

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@
2525
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.ANY_CHAR;
2626
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.and;
2727
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.commentRegexp;
28+
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.g;
2829
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.o2n;
2930
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.opt;
31+
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.or;
3032
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.regexp;
3133
import com.sonar.sslr.impl.channel.UnknownCharacterChannel;
3234
import org.sonar.cxx.CxxConfiguration;
@@ -39,13 +41,17 @@
3941
public final class CppLexer {
4042

4143
private static final String HEX_PREFIX = "0[xX]";
42-
private static final String EXPONENT = "([eE][+-]?+[0-9_]([']?+[0-9_]++)*+)";
43-
private static final String BINARY_EXPONENT = "([pP][+-]?+[0-9]([']?+[0-9]++)*+)"; // since C++17
44+
private static final String BIN_PREFIX = "0[bB]";
45+
private static final String EXPONENT = "[eE][+-]?+[0-9_]([']?+[0-9_]++)*+";
46+
private static final String BINARY_EXPONENT = "[pP][+-]?+[0-9]([']?+[0-9]++)*+"; // since C++17
4447
//private static final String INTEGER_SUFFIX = "(((U|u)(LL|ll|L|l)?)|((LL|ll|L|l)(u|U)?))";
4548
//private static final String FLOAT_SUFFIX = "(f|l|F|L)";
4649
// ud-suffix: identifier (including INTEGER_SUFFIX, FLOAT_SUFFIX)
47-
private static final String UD_SUFFIX = "([_a-zA-Z]([_a-zA-Z0-9]*+))";
48-
private static final String HEXDIGIT_SEQUENCE = "([0-9a-fA-F]([']?+[0-9a-fA-F]++)*+)";
50+
private static final String UD_SUFFIX = "[_a-zA-Z][_a-zA-Z0-9]*+";
51+
private static final String DECDIGIT_SEQUENCE = "[0-9]([']?+[0-9]++)*+";
52+
private static final String HEXDIGIT_SEQUENCE = "[0-9a-fA-F]([']?+[0-9a-fA-F]++)*+";
53+
private static final String BINDIGIT_SEQUENCE = "[01]([']?+[01]++)*+";
54+
private static final String POINT = "\\.";
4955

5056
private CppLexer() {
5157
}
@@ -67,24 +73,22 @@ public static Lexer create(CxxConfiguration conf) {
6773
.withChannel(commentRegexp("/\\*", ANY_CHAR + "*?", "\\*/"))
6874
.withChannel(new CharacterLiteralsChannel())
6975
.withChannel(new StringLiteralsChannel())
70-
// C++ Standard, Section 2.14.4 "Floating literals"
71-
.withChannel(regexp(CxxTokenType.NUMBER, "[0-9]([']?+[0-9]++)*+\\.([0-9]([']?+[0-9]++)*+)*+"
72-
+ opt(EXPONENT) + opt(UD_SUFFIX)))
73-
.withChannel(regexp(CxxTokenType.NUMBER, "\\.[0-9]([']?+[0-9]++)*+"
74-
+ opt(EXPONENT) + opt(UD_SUFFIX)))
75-
.withChannel(regexp(CxxTokenType.NUMBER, "[0-9]([']?+[0-9]++)*+" + EXPONENT + opt(UD_SUFFIX)))
76-
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE
77-
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
78-
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE + "."
79-
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
80-
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + opt(HEXDIGIT_SEQUENCE) + "." + HEXDIGIT_SEQUENCE
81-
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
76+
8277
// C++ Standard, Section 2.14.2 "Integer literals"
83-
.withChannel(regexp(CxxTokenType.NUMBER, "[1-9]([']?+[0-9]++)*+" + opt(UD_SUFFIX))) // Decimal literals
84-
.withChannel(regexp(CxxTokenType.NUMBER, "0[bB][01]([']?+[01]++)*+" + opt(UD_SUFFIX))) // Binary Literals
85-
.withChannel(regexp(CxxTokenType.NUMBER, "0([']?+[0-7]++)++" + opt(UD_SUFFIX))) // Octal Literals
86-
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE + opt(UD_SUFFIX))) // Hex Literals
87-
.withChannel(regexp(CxxTokenType.NUMBER, "0" + opt(UD_SUFFIX))) // Decimal zero
78+
// C++ Standard, Section 2.14.4 "Floating literals"
79+
.withChannel(
80+
regexp(CxxTokenType.NUMBER,
81+
and(
82+
or(
83+
g(POINT, DECDIGIT_SEQUENCE, opt(g(EXPONENT))),
84+
g(HEX_PREFIX, opt(g(HEXDIGIT_SEQUENCE)), opt(POINT), opt(g(HEXDIGIT_SEQUENCE)), opt(g(BINARY_EXPONENT))),
85+
g(BIN_PREFIX, BINDIGIT_SEQUENCE),
86+
g(DECDIGIT_SEQUENCE, opt(POINT), opt(g(DECDIGIT_SEQUENCE)), opt(g(EXPONENT)))
87+
),
88+
opt(g(UD_SUFFIX))
89+
)
90+
)
91+
)
8892

8993
.withChannel(new KeywordChannel(and("#", o2n("\\s"), "[a-z]", o2n("\\w")), CppKeyword.values()))
9094
.withChannel(new IdentifierAndKeywordChannel(and("[a-zA-Z_]", o2n("\\w")), true))

0 commit comments

Comments
 (0)