Skip to content
Merged
156 changes: 156 additions & 0 deletions lldb/include/lldb/ValueObject/DILLexer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
#define LLDB_VALUEOBJECT_DILLEXER_H_
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe our headers don't have the ending underscore.

Suggested change
#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
#define LLDB_VALUEOBJECT_DILLEXER_H_
#ifndef LLDB_VALUEOBJECT_DILLEXER_H
#define LLDB_VALUEOBJECT_DILLEXER_H


#include "llvm/ADT/StringRef.h"
#include <cstdint>
#include <limits.h>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It fits in nicer with the other headers (though maybe you don't need it if you remove the UINT_MAX thing below)

Suggested change
#include <limits.h>
#include <climits>

#include <memory>
#include <string>
#include <vector>

namespace lldb_private {

namespace dil {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
namespace lldb_private {
namespace dil {
namespace lldb_private::dil {


enum class TokenKind {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optional idea: Make this a non-class enum inside the (DIL)Token class so that it can be referred to as Token::coloncolon

coloncolon,
eof,
identifier,
invalid,
kw_namespace,
l_paren,
none,
r_paren,
unknown,
};

/// Class defining the tokens generated by the DIL lexer and used by the
/// DIL parser.
class DILToken {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we already have a dill:: namespace, maybe we can drop DIL prefix?

public:
/// Construct a token from its kind, its spelling (the exact characters
/// that were lexed) and its starting offset within the expression string.
/// The spelling is moved into the member to avoid an extra copy.
DILToken(dil::TokenKind kind, std::string spelling, uint32_t start)
    : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
: m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
: m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}


/// Default-construct an empty token: kind 'none', empty spelling, position 0.
DILToken() : m_kind(dil::TokenKind::none), m_spelling(""), m_start_pos(0) {}

/// Overwrite just the token's kind.
void setKind(dil::TokenKind kind) { m_kind = kind; }
/// Return the token's kind.
dil::TokenKind getKind() const { return m_kind; }

/// Return a copy of the exact characters that were lexed for this token.
std::string getSpelling() const { return m_spelling; }

/// Return the length, in characters, of the token's spelling.
uint32_t getLength() const { return m_spelling.size(); }

/// Return true iff this token has the given kind.
bool is(dil::TokenKind kind) const { return m_kind == kind; }

/// Return true iff this token does not have the given kind.
bool isNot(dil::TokenKind kind) const { return m_kind != kind; }

/// Return true iff this token's kind is either of the two given kinds.
bool isOneOf(dil::TokenKind kind1, dil::TokenKind kind2) const {
return is(kind1) || is(kind2);
}

/// Variadic overload: return true iff this token's kind is any of the
/// given kinds.
template <typename... Ts> bool isOneOf(dil::TokenKind kind, Ts... Ks) const {
return is(kind) || isOneOf(Ks...);
}

/// Return the token's starting character position within the entire
/// expression string.
uint32_t getLocation() const { return m_start_pos; }

/// Overwrite the token's kind, spelling, and starting position in one call.
void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) {
m_kind = kind;
m_spelling = spelling;
m_start_pos = start;
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we use the assignment operator instead (token = Token(kind, spelling, start)) ?


static const std::string getTokenName(dil::TokenKind kind);

private:
dil::TokenKind m_kind;
std::string m_spelling;
uint32_t m_start_pos; // within entire expression string
};

/// Class for doing the simple lexing required by DIL.
class DILLexer {
public:
DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if we accept StringRef as input, we shouldn't copy the data and work with the provided string view (and assume we don't outlive it).
If you need the lexer to own the text (and you probably don't), then accept std::string and move from it.

m_cur_pos = m_expr.begin();
// Use UINT_MAX to indicate invalid/uninitialized value.
m_tokens_idx = UINT_MAX;
}

bool Lex(DILToken &result, bool look_ahead = false);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
bool Lex(DILToken &result, bool look_ahead = false);
std::optional<DILToken> Lex(bool look_ahead = false);


bool Is_Word(std::string::iterator start, uint32_t &length);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
bool Is_Word(std::string::iterator start, uint32_t &length);
bool IsWord(std::string::iterator start, uint32_t &length);


uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }

/// Update 'result' with the other parameter values, create a
/// duplicate token, and push the duplicate token onto the vector of
/// lexed tokens.
void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should all of these APIs really be public?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few of them can be private (I'll take care of that). Many of them could be protected (mostly just called from the parser, which I can designate as a friend), except that if I make them protected then my unittest, which needs to access them, can't access them. I've been trying and trying to find a way to allow my unit tests access these methods when they're protected, but I can't seem to make it work. Do you know the magic secret for this?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are workarounds, but no magic secrets. If something is called by the parser (which is the only user of the class anyway), then it should be public (btw, friends can see even private members, not just the public ones). And unit tests should generally test using the public APIs (as that's what the users will use). There are exceptions to that, but they usually involve situations where the public API requires arguments that would be difficult/impossible to mock in a test, which shouldn't be the case here.

Separating the public API from the private implementation details would really help this class, as I really don't know which one of these functions is meant to be called from the outside.

std::string tok_str, uint32_t tok_pos);

/// Return the lexed token N+1 positions ahead of the 'current' token
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like there are too many ways to navigate the token stream here. You can either call GetCurrentToken+IncrementTokenIdx, or GetNextToken(which I guess increments the index automatically), or LookAhead+AcceptLookAhead.

I think it would be better to start with something simple (we can add more or revamp the existing API if it turns out to be clunky). What would you say to something like:

const Token &LookAhead(uint32_t N /* add `=1` if you want*/);
const Token &GetCurrentToken() { return LookAhead(0); } // just a fancy name for a look ahead of zero
void Advance(uint32_t N = 1); // advance the token stream

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parser really needs a way to save & restore/reset the token index, because there are places in the parser where it does tentative parsing & then decides to rollback. It does so by saving the current token index, then doing the tentative parsing (which can advance the index some number of tokens), and then (for rolling back) setting the current token index back to the saved value.

So I don't think the simple API you've outlined above would be sufficient.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with tentative parse and roll back APIs. I'm commenting on the other APIs which advance through the token stream linearly (but in a very baroque fashion). IOW, my proposal was to replace GetCurrentToken, IncrementTokenIdx, GetNextToken, LookAhead and AcceptLookAhead with the three functions above (exact names TBD), and keep GetCurrentTokenIdx and ResetTokenIdx as they are.

/// being handled by the DIL parser.
const DILToken &LookAhead(uint32_t N);

const DILToken &AcceptLookAhead(uint32_t N);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this API might be simpler. The lexer doesn't actually need to re-lex; the results will always be the same. We only need to roll back occasionally, but we'll always be processing the same sequence of tokens the second time.

So the lexer can always add the tokens to the m_lexed_tokens vector and we only need GetCurrentTokenIdx() and ResetTokenIdx() to do the rollback.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would definitely be nice, and if it's true, I'd consider taking this even further: given that we're going to be processing about a line of text at most, we might be able to just eagerly lex the whole string and avoid the whole on-demand parsing business. If so, maybe we don't even need the lexer class, and the lexing could consist of a single function like std::vector<Token> Lex(string)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Re werat's comment: This is already what the lexer does. It never re-lexes anything. It stores the lexed tokens in a vector as it lexes them, and keeps track, via index into the vector, which token is the 'current' one (as far as the parser is concerned). Accepting the lookahead basically involves just updating the index.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's good to hear, but it still makes the implementation complicated. Separating lexing from the traversal through the lexed tokens would make things much easier to follow. I understand traditional lexers can't do that, but they are parsing megabytes of text. We're going to be parsing approximately one line of code here.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yesterday I also thought of another advantage of early/eager parsing -- we can report errors (things like invalid tokens and such) early. Then the other functions ("get next token" and stuff) shouldn't have any failure modes except running off the end of token stream (which I think we could handle by pretending the stream ends with an infinite amount of EOF tokens)


/// Return the index for the 'current' token being handled by the DIL parser.
/// NOTE(review): this is UINT_MAX (the "invalid/uninitialized" sentinel set
/// by the constructor) until the index is first advanced -- callers must
/// treat that value specially.
uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }

/// Return the current token to be handled by the DIL parser.
/// NOTE(review): indexes m_lexed_tokens without a bounds check; if
/// m_tokens_idx still holds the UINT_MAX sentinel (nothing consumed yet)
/// this is out-of-bounds -- confirm callers always lex a token first.
DILToken &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }

/// Update the index for the 'current' token, to point to the next lexed
/// token.
bool IncrementTokenIdx() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't Lex() do this automatically?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really. Lex() is called from LookAhead, when we definitely do not want to automatically increment the token index.

if (m_tokens_idx >= m_lexed_tokens.size() - 1)
return false;

m_tokens_idx++;
return true;
}

/// Set the index for the 'current' token (to be handled by the parser)
/// to a particular position. Used for either committing 'look ahead' parsing
/// or rolling back tentative parsing.
/// Returns false, leaving the index unchanged, if new_value is not a valid
/// position within the vector of lexed tokens.
bool ResetTokenIdx(uint32_t new_value) {
  // Compare against size() directly: the previous 'size() - 1' form
  // underflowed (size_t wrap-around) when no tokens had been lexed yet,
  // accepting any new_value while the vector was empty.
  if (new_value >= m_lexed_tokens.size())
    return false;

  m_tokens_idx = new_value;
  return true;
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
bool ResetTokenIdx(uint32_t new_value) {
if (new_value > m_lexed_tokens.size() - 1)
return false;
m_tokens_idx = new_value;
return true;
}
void ResetTokenIdx(uint32_t new_value) {
assert(new_value < m_lexed_tokens.size());
m_tokens_idx = new_value;
}

(AIUI, the only usage of this function will be to restore a previous (and valid) position, so any error here is definitely a bug)


private:
// The input string we are lexing & parsing.
std::string m_expr;

// The current position of the lexer within m_expr (the character position,
// within the string, of the next item to be lexed).
std::string::iterator m_cur_pos;

// Holds all of the tokens lexed so far. Tokens are only ever appended;
// nothing is ever re-lexed or discarded.
std::vector<DILToken> m_lexed_tokens;

// Index into m_lexed_tokens; indicates which token the DIL parser is
// currently trying to parse/handle. UINT_MAX is the sentinel for
// "invalid/uninitialized" (see the constructor).
uint32_t m_tokens_idx;

// "invalid" token; to be returned by lexer when 'look ahead' fails.
DILToken m_invalid_token;
};

} // namespace dil

} // namespace lldb_private

#endif // LLDB_VALUEOBJECT_DILLEXER_H_
1 change: 1 addition & 0 deletions lldb/source/ValueObject/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
add_lldb_library(lldbValueObject
DILLexer.cpp
ValueObject.cpp
ValueObjectCast.cpp
ValueObjectChild.cpp
Expand Down
205 changes: 205 additions & 0 deletions lldb/source/ValueObject/DILLexer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
//===-- DILLexer.cpp ------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// This implements the recursive descent parser for the Data Inspection
// Language (DIL), and its helper functions, which will eventually underlie the
// 'frame variable' command. The language that this parser recognizes is
// described in lldb/docs/dil-expr-lang.ebnf
//
//===----------------------------------------------------------------------===//

#include "lldb/ValueObject/DILLexer.h"
#include "llvm/ADT/StringMap.h"

namespace lldb_private {

namespace dil {

// For fast keyword lookup. More keywords will be added later.
// NOTE(review): a namespace-scope object with a non-trivial constructor
// runs at static-initialization time, which the LLVM coding standards
// forbid ("do not use static constructors"); consider a function-local
// static or an llvm::StringSwitch instead -- see review discussion.
const llvm::StringMap<dil::TokenKind> Keywords = {
{"namespace", dil::TokenKind::kw_namespace},
};
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This violates https://llvm.org/docs/CodingStandards.html#do-not-use-static-constructors

You could make it a function-local static variable (so that it's initialized on first use), but unless we're going to have many, many keywords, I probably wouldn't bother with that and use llvm::StringSwitch instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have on the order of 25 keywords. llvm::StringSwitch will be as fast as StringMap.find()?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe, maybe not, but I doubt anyone will notice the extra few nanoseconds. I am also not convinced that we should have that many keywords, as every keyword is a variable name that's now impossible/hard to access.


const std::string DILToken::getTokenName(dil::TokenKind kind) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

const on a return type is useless, and if you don't plan on this function returning anything other than a constant string, then the it can return a StringRef

switch (kind) {
case dil::TokenKind::coloncolon:
return "coloncolon";
case dil::TokenKind::eof:
return "eof";
case dil::TokenKind::identifier:
return "identifier";
case dil::TokenKind::kw_namespace:
return "namespace";
case dil::TokenKind::l_paren:
return "l_paren";
case dil::TokenKind::r_paren:
return "r_paren";
case dil::TokenKind::unknown:
return "unknown";
default:
return "token_name";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we drop the default clause? All it does is make it easy to forget to modify this function when adding a new token type.

}
}

// Return true iff 'c' is an ASCII letter ('a'..'z' or 'A'..'Z').
// Returning the boolean expression directly replaces the previous
// 'if (...) return true; return false;' anti-idiom.
static bool Is_Letter(char c) {
  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}

static bool Is_Digit(char c) { return ('0' <= c && c <= '9'); }
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
static bool Is_Letter(char c) {
if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
return true;
return false;
}
static bool Is_Digit(char c) { return ('0' <= c && c <= '9'); }
static bool IsLetter(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}
static bool IsDigit(char c) { return '0' <= c && c <= '9'; }


// A word starts with a letter, underscore, or dollar sign, followed by
// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores.
bool DILLexer::Is_Word(std::string::iterator start, uint32_t &length) {
bool done = false;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it doesn't look like this is used for anything

Copy link
Contributor Author

@cmtice cmtice Jan 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's used on lines 74 & 77.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but you break out of the loop as soon as you set it to true. (I don't think the meaning of the code would change if you just deleted all references to the variable.

bool dollar_start = false;

// Must not start with a digit.
if (m_cur_pos == m_expr.end() || Is_Digit(*m_cur_pos))
return false;

// First character *may* be a '$', for a register name or convenience
// variable.
if (*m_cur_pos == '$') {
dollar_start = true;
++m_cur_pos;
length++;
}

// Contains only letters, digits or underscores
for (; m_cur_pos != m_expr.end() && !done; ++m_cur_pos) {
char c = *m_cur_pos;
if (!Is_Letter(c) && !Is_Digit(c) && c != '_') {
done = true;
break;
} else
length++;
}

if (dollar_start && length > 1) // Must have something besides just '$'
return true;

if (!dollar_start && length > 0)
return true;

// Not a valid word, so re-set the lexing position.
m_cur_pos = start;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAICT, this is the only use of the start argument, which makes for a very weird API. Perhaps the function could make a note of the starting position internally, and then return the range it found to the caller (I'd suggest a return type of `llvm::iterator_range<std::string::iterator>`, with the empty range meaning "no word found")

return false;
}

/// Update 'result' in place with the given kind, spelling, and position,
/// then append a copy of the resulting token to the vector of lexed tokens.
void DILLexer::UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
                                 std::string tok_str, uint32_t tok_pos) {
  // The previous implementation default-constructed a temporary, copied
  // 'result' into it, and moved the temporary into the vector; pushing
  // 'result' directly has the same effect with one fewer copy.
  result.setValues(tok_kind, std::move(tok_str), tok_pos);
  m_lexed_tokens.push_back(result);
}

bool DILLexer::Lex(DILToken &result, bool look_ahead) {
bool retval = true;

if (!look_ahead) {
// We're being asked for the 'next' token, and not a part of a LookAhead.
// Check to see if we've already lexed it and pushed it onto our tokens
// vector; if so, return the next token from the vector, rather than doing
// more lexing.
if ((m_tokens_idx != UINT_MAX) &&
(m_tokens_idx < m_lexed_tokens.size() - 1)) {
result = m_lexed_tokens[m_tokens_idx + 1];
return retval;
}
}

// Skip over whitespace (spaces).
while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
m_cur_pos++;

// Check to see if we've reached the end of our input string.
if (m_cur_pos == m_expr.end()) {
UpdateLexedTokens(result, dil::TokenKind::eof, "", m_expr.length());
return retval;
}

uint32_t position = m_cur_pos - m_expr.begin();
;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
;

std::string::iterator start = m_cur_pos;
uint32_t length = 0;
if (Is_Word(start, length)) {
dil::TokenKind kind;
std::string word = m_expr.substr(position, length);
auto iter = Keywords.find(word);
if (iter != Keywords.end())
kind = iter->second;
else
kind = dil::TokenKind::identifier;

UpdateLexedTokens(result, kind, word, position);
return true;
}

switch (*m_cur_pos) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An iterator is sort of a natural representation of a position in a string, but it also makes it hard to use many of the nice (safe) string APIs. For example, if we had a StringRef m_rest to represent the unparsed portion of the string, then this block could be written as:

constexpr std::pair<TokenKind, const char *> operators = {
  {TokenKind::l_paren, "("},
  {TokenKind::r_paren, ")"},
  {TokenKind::coloncolon, "::"},
  {TokenKind::colon, ":"},
};
size_t position = m_rest.data() - m_input.data();
for (auto [kind, str]: operators) {
  if (m_rest.consume_front(str)) {
    UpdateLexedTokens(result, kind, str, position);
    return true;
  }
}

(Notice how I do not have to worry about running beyond the end of the string, how the "current position" is updated automatically, and how the tokens which are prefixes of one another are handled by putting the longer string first.)

There are also other places where this could be useful. E.g. skipping over whitespace could be implemented as m_rest = m_rest.ltrim() and similar.

case '(':
m_cur_pos++;
UpdateLexedTokens(result, dil::TokenKind::l_paren, "(", position);
return true;
case ')':
m_cur_pos++;
UpdateLexedTokens(result, dil::TokenKind::r_paren, ")", position);
return true;
case ':':
if (position + 1 < m_expr.size() && m_expr[position + 1] == ':') {
m_cur_pos += 2;
UpdateLexedTokens(result, dil::TokenKind::coloncolon, "::", position);
return true;
}
break;
default:
break;
}
// Empty Token
result.setValues(dil::TokenKind::none, "", m_expr.length());
return false;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the meaning of an empty token?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Roughly the same as unknown. I will change the TokenKind above to unknown.

}

const DILToken &DILLexer::LookAhead(uint32_t N) {
uint32_t extra_lexed_tokens = m_lexed_tokens.size() - m_tokens_idx - 1;

if (N + 1 < extra_lexed_tokens)
return m_lexed_tokens[m_tokens_idx + N + 1];

uint32_t remaining_tokens =
(m_tokens_idx + N + 1) - m_lexed_tokens.size() + 1;

bool done = false;
bool look_ahead = true;
while (!done && remaining_tokens > 0) {
DILToken tok;
Lex(tok, look_ahead);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we check the return value here?

if (tok.getKind() == dil::TokenKind::eof)
done = true;
remaining_tokens--;
};

if (remaining_tokens > 0) {
m_invalid_token.setValues(dil::TokenKind::invalid, "", 0);
return m_invalid_token;
}

return m_lexed_tokens[m_tokens_idx + N + 1];
}

/// Commit a previous look-ahead: advance the 'current' token index by N+1
/// positions. Returns the new current token, or m_invalid_token if that
/// position has not been lexed.
const DILToken &DILLexer::AcceptLookAhead(uint32_t N) {
  // The new index must be a valid position in m_lexed_tokens (i.e. strictly
  // less than size()); the previous '>' comparison was off by one and let
  // an index equal to size() fall through to an out-of-bounds access.
  if (m_tokens_idx + N + 1 >= m_lexed_tokens.size())
    return m_invalid_token;

  m_tokens_idx += N + 1;
  return m_lexed_tokens[m_tokens_idx];
}

} // namespace dil

} // namespace lldb_private
1 change: 1 addition & 0 deletions lldb/unittests/ValueObject/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
add_lldb_unittest(LLDBValueObjectTests
DumpValueObjectOptionsTests.cpp
DILLexerTests.cpp

LINK_LIBS
lldbValueObject
Expand Down
Loading
Loading