Skip to content

Commit 0b33ab7

Browse files
committed
Address remaining review comments:
- Remove 'unknown' token type. - Remove UINT_MAX as a valid token index; always start at 0. - Update IsWord to use Pavel's more efficient StringRef implementation. - Allow '$' anywhere in an identifer string. - Adjust unit tests to handle changes mentioned above. - Clean up test of invalid identifiers (remove if-then-else; split invalid identifiers from unrecognized strings).
1 parent 29e9f26 commit 0b33ab7

File tree

3 files changed

+50
-83
lines changed

3 files changed

+50
-83
lines changed

lldb/include/lldb/ValueObject/DILLexer.h

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ class Token {
3030
identifier,
3131
l_paren,
3232
r_paren,
33-
unknown,
3433
};
3534

3635
Token(Kind kind, std::string spelling, uint32_t start)
@@ -72,10 +71,7 @@ class DILLexer {
7271

7372
/// Advance the current token position by N.
7473
void Advance(uint32_t N = 1) {
75-
// UINT_MAX means uninitialized, no "current" position, so move to start.
76-
if (m_tokens_idx == UINT_MAX)
77-
m_tokens_idx = 0;
78-
else if (m_tokens_idx + N >= m_lexed_tokens.size())
74+
if (m_tokens_idx + N >= m_lexed_tokens.size())
7975
// N is too large; advance to the end of the lexed tokens.
8076
m_tokens_idx = m_lexed_tokens.size() - 1;
8177
else
@@ -99,7 +95,7 @@ class DILLexer {
9995
/// to a particular position. Used for either committing 'look ahead' parsing
10096
/// or rolling back tentative parsing.
10197
void ResetTokenIdx(uint32_t new_value) {
102-
assert(new_value == UINT_MAX || new_value < m_lexed_tokens.size());
98+
assert(new_value < m_lexed_tokens.size());
10399
m_tokens_idx = new_value;
104100
}
105101

@@ -108,7 +104,7 @@ class DILLexer {
108104
private:
109105
DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
110106
: m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
111-
m_tokens_idx(UINT_MAX), m_eof_token(Token(Token::eof, "", 0)) {}
107+
m_tokens_idx(0), m_eof_token(Token(Token::eof, "", 0)) {}
112108

113109
static llvm::Expected<Token> Lex(llvm::StringRef expr,
114110
llvm::StringRef &remainder);

lldb/source/ValueObject/DILLexer.cpp

Lines changed: 9 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ llvm::StringRef Token::GetTokenName(Kind kind) {
2929
return "l_paren";
3030
case Kind::r_paren:
3131
return "r_paren";
32-
case Kind::unknown:
33-
return "unknown";
3432
}
3533
}
3634

@@ -44,43 +42,14 @@ static bool IsDigit(char c) { return '0' <= c && c <= '9'; }
4442
// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores.
4543
static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
4644
llvm::StringRef &remainder) {
47-
llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
48-
llvm::StringRef::iterator start = cur_pos;
49-
bool dollar_start = false;
50-
51-
// Must not start with a digit.
52-
if (cur_pos == expr.end() || IsDigit(*cur_pos))
53-
return std::nullopt;
54-
55-
// First character *may* be a '$', for a register name or convenience
56-
// variable.
57-
if (*cur_pos == '$') {
58-
dollar_start = true;
59-
++cur_pos;
60-
}
61-
62-
// Contains only letters, digits or underscores
63-
for (; cur_pos != expr.end(); ++cur_pos) {
64-
char c = *cur_pos;
65-
if (!IsLetter(c) && !IsDigit(c) && c != '_')
66-
break;
67-
}
68-
69-
// If first char is '$', make sure there's at least one mare char, or it's
70-
// invalid.
71-
if (dollar_start && (cur_pos - start <= 1)) {
72-
cur_pos = start;
73-
return std::nullopt;
74-
}
75-
76-
if (cur_pos == start)
45+
// Find the longest prefix consisting of letters, digits, underscors and
46+
// '$'. If it doesn't start with a digit, then it's a word.
47+
llvm::StringRef candidate = remainder.take_while(
48+
[](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; });
49+
if (candidate.empty() || IsDigit(candidate[0]))
7750
return std::nullopt;
78-
79-
llvm::StringRef word = expr.substr(start - expr.begin(), cur_pos - start);
80-
if (remainder.consume_front(word))
81-
return word;
82-
83-
return std::nullopt;
51+
remainder = remainder.drop_front(candidate.size());
52+
return candidate;
8453
}
8554

8655
llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
@@ -100,10 +69,10 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
10069
llvm::StringRef &remainder) {
10170
// Skip over whitespace (spaces).
10271
remainder = remainder.ltrim();
103-
llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
72+
llvm::StringRef::iterator cur_pos = remainder.begin();
10473

10574
// Check to see if we've reached the end of our input string.
106-
if (remainder.empty() || cur_pos == expr.end())
75+
if (remainder.empty())
10776
return Token(Token::eof, "", (uint32_t)expr.size());
10877

10978
uint32_t position = cur_pos - expr.begin();

lldb/unittests/ValueObject/DILLexerTests.cpp

Lines changed: 38 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,15 @@ ExtractTokenData(llvm::StringRef input_expr) {
2727
if (lexer.NumLexedTokens() == 0)
2828
return llvm::createStringError("No lexed tokens");
2929

30-
lexer.ResetTokenIdx(UINT_MAX);
30+
lexer.ResetTokenIdx(0);
3131
std::vector<std::pair<Token::Kind, std::string>> data;
3232
do {
33-
lexer.Advance();
3433
Token tok = lexer.GetCurrentToken();
3534
data.push_back(std::make_pair(tok.GetKind(), tok.GetSpelling()));
35+
lexer.Advance();
3636
} while (data.back().first != Token::eof);
37+
// Don't return the eof token.
38+
data.pop_back();
3739
return data;
3840
}
3941

@@ -42,11 +44,8 @@ TEST(DILLexerTests, SimpleTest) {
4244
llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
4345
ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
4446
DILLexer lexer(*maybe_lexer);
45-
Token token = Token(Token::unknown, "", 0);
46-
EXPECT_EQ(token.GetKind(), Token::unknown);
47+
Token token = lexer.GetCurrentToken();
4748

48-
lexer.Advance();
49-
token = lexer.GetCurrentToken();
5049
EXPECT_EQ(token.GetKind(), Token::identifier);
5150
EXPECT_EQ(token.GetSpelling(), "simple_var");
5251
lexer.Advance();
@@ -69,9 +68,7 @@ TEST(DILLexerTests, LookAheadTest) {
6968
llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
7069
ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
7170
DILLexer lexer(*maybe_lexer);
72-
Token token = Token(Token::unknown, "", 0);
73-
lexer.Advance();
74-
token = lexer.GetCurrentToken();
71+
Token token = lexer.GetCurrentToken();
7572

7673
// Current token is '('; check the next 4 tokens, to make
7774
// sure they are the identifier 'anonymous', the identifier 'namespace'
@@ -110,49 +107,54 @@ TEST(DILLexerTests, LookAheadTest) {
110107
TEST(DILLexerTests, MultiTokenLexTest) {
111108
EXPECT_THAT_EXPECTED(
112109
ExtractTokenData("This string has (several ) ::identifiers"),
113-
llvm::HasValue(
114-
testing::ElementsAre(testing::Pair(Token::identifier, "This"),
115-
testing::Pair(Token::identifier, "string"),
116-
testing::Pair(Token::identifier, "has"),
117-
testing::Pair(Token::l_paren, "("),
118-
testing::Pair(Token::identifier, "several"),
119-
testing::Pair(Token::r_paren, ")"),
120-
testing::Pair(Token::coloncolon, "::"),
121-
testing::Pair(Token::identifier, "identifiers"),
122-
testing::Pair(Token::eof, ""))));
110+
llvm::HasValue(testing::ElementsAre(
111+
testing::Pair(Token::identifier, "This"),
112+
testing::Pair(Token::identifier, "string"),
113+
testing::Pair(Token::identifier, "has"),
114+
testing::Pair(Token::l_paren, "("),
115+
testing::Pair(Token::identifier, "several"),
116+
testing::Pair(Token::r_paren, ")"),
117+
testing::Pair(Token::coloncolon, "::"),
118+
testing::Pair(Token::identifier, "identifiers"))));
123119
}
124120

125121
TEST(DILLexerTests, IdentifiersTest) {
122+
// These strings should lex into identifier tokens.
126123
std::vector<std::string> valid_identifiers = {
127-
"$My_name1", "$pc", "abcd", "_", "_a", "_a_",
124+
"$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$",
128125
"a_b", "this", "self", "a", "MyName", "namespace"};
129-
std::vector<std::string> invalid_identifiers = {"234", "2a", "2",
130-
"$", "1MyName", ""};
126+
127+
// The lexer can lex these strings, but they should not be identifiers.
128+
std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"};
129+
130+
// The lexer is expected to fail attempting to lex these strings (it cannot
131+
// create valid tokens out of them).
132+
std::vector<std::string> invalid_tok_strings = {"234", "2a", "2", "1MyName"};
131133

132134
// Verify that all of the valid identifiers come out as identifier tokens.
133135
for (auto &str : valid_identifiers) {
134136
SCOPED_TRACE(str);
135137
EXPECT_THAT_EXPECTED(ExtractTokenData(str),
136138
llvm::HasValue(testing::ElementsAre(
137-
testing::Pair(Token::identifier, str),
138-
testing::Pair(Token::eof, ""))));
139+
testing::Pair(Token::identifier, str))));
140+
}
141+
142+
// Verify that the lexer fails on invalid token strings.
143+
for (auto &str : invalid_tok_strings) {
144+
SCOPED_TRACE(str);
145+
auto maybe_lexer = DILLexer::Create(str);
146+
EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed());
139147
}
140148

141149
// Verify that none of the invalid identifiers come out as identifier tokens.
142150
for (auto &str : invalid_identifiers) {
143151
SCOPED_TRACE(str);
144152
llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str);
145-
if (!maybe_lexer) {
146-
llvm::consumeError(maybe_lexer.takeError());
147-
// In this case, it's ok for lexing to return an error.
148-
} else {
149-
DILLexer lexer(*maybe_lexer);
150-
Token token = Token(Token::unknown, "", 0);
151-
// We didn't get an error; make sure we did not get an identifier token.
152-
lexer.Advance();
153-
token = lexer.GetCurrentToken();
154-
EXPECT_TRUE(token.IsNot(Token::identifier));
155-
EXPECT_TRUE(token.IsOneOf(Token::unknown, Token::eof));
156-
}
153+
EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
154+
DILLexer lexer(*maybe_lexer);
155+
Token token = lexer.GetCurrentToken();
156+
EXPECT_TRUE(token.IsNot(Token::identifier));
157+
EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren,
158+
Token::r_paren));
157159
}
158160
}

0 commit comments

Comments
 (0)