Skip to content
Merged
123 changes: 123 additions & 0 deletions lldb/include/lldb/ValueObject/DILLexer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLDB_VALUEOBJECT_DILLEXER_H
#define LLDB_VALUEOBJECT_DILLEXER_H

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace lldb_private::dil {

/// A single lexical token produced by the DIL lexer and consumed by the DIL
/// parser. A token records its kind, its spelling, and the offset at which it
/// starts within the full expression string.
class Token {
public:
  /// The kinds of token the lexer can produce.
  enum Kind {
    coloncolon,
    eof,
    identifier,
    l_paren,
    r_paren,
  };

  /// Builds a token of the given kind. \p spelling is taken by value and moved
  /// into the token; \p start is the token's offset in the expression.
  Token(Kind kind, std::string spelling, uint32_t start)
      : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}

  Kind GetKind() const { return m_kind; }

  /// Returns a copy of the token's spelling.
  std::string GetSpelling() const { return m_spelling; }

  /// Returns the offset of the token's first character.
  uint32_t GetLocation() const { return m_start_pos; }

  /// True when this token is exactly of kind \p kind.
  bool Is(Kind kind) const { return kind == m_kind; }

  /// True when this token is of any kind other than \p kind.
  bool IsNot(Kind kind) const { return !Is(kind); }

  /// True when this token's kind matches either of the two given kinds.
  bool IsOneOf(Kind kind1, Kind kind2) const {
    return m_kind == kind1 || m_kind == kind2;
  }

  /// True when this token's kind matches any of the three or more given
  /// kinds. Peels off the first kind and recurses on the rest.
  template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const {
    if (Is(kind))
      return true;
    return IsOneOf(Ks...);
  }

  /// Returns a printable name for \p kind.
  static llvm::StringRef GetTokenName(Kind kind);

private:
  Kind m_kind;
  std::string m_spelling;
  uint32_t m_start_pos; // within entire expression string
};

/// Class for doing the simple lexing required by DIL.
class DILLexer {
public:
/// Lexes all the tokens in expr and calls the private constructor
/// with the lexed tokens.
static llvm::Expected<DILLexer> Create(llvm::StringRef expr);

/// Return the current token to be handled by the DIL parser.
const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }

/// Advance the current token position by N.
void Advance(uint32_t N = 1) {
if (m_tokens_idx + N >= m_lexed_tokens.size())
// N is too large; advance to the end of the lexed tokens.
m_tokens_idx = m_lexed_tokens.size() - 1;
else
m_tokens_idx += N;
}

/// Return the lexed token N positions ahead of the 'current' token
/// being handled by the DIL parser.
const Token &LookAhead(uint32_t N) {
if (m_tokens_idx + N < m_lexed_tokens.size())
return m_lexed_tokens[m_tokens_idx + N];

// Last token should be an 'eof' token.
return m_lexed_tokens.back();
}

/// Return the index for the 'current' token being handled by the DIL parser.
uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }

/// Set the index for the 'current' token (to be handled by the parser)
/// to a particular position. Used for either committing 'look ahead' parsing
/// or rolling back tentative parsing.
void ResetTokenIdx(uint32_t new_value) {
assert(new_value < m_lexed_tokens.size());
m_tokens_idx = new_value;
}

uint32_t NumLexedTokens() { return m_lexed_tokens.size(); }

private:
DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
: m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
m_tokens_idx(0) {}

static llvm::Expected<Token> Lex(llvm::StringRef expr,
llvm::StringRef &remainder);

// The input string we are lexing & parsing.
llvm::StringRef m_expr;

// Holds all of the tokens lexed so far.
std::vector<Token> m_lexed_tokens;

// Index into m_lexed_tokens; indicates which token the DIL parser is
// currently trying to parse/handle.
uint32_t m_tokens_idx;
};

} // namespace lldb_private::dil

#endif // LLDB_VALUEOBJECT_DILLEXER_H
1 change: 1 addition & 0 deletions lldb/source/ValueObject/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
add_lldb_library(lldbValueObject
DILLexer.cpp
ValueObject.cpp
ValueObjectCast.cpp
ValueObjectChild.cpp
Expand Down
97 changes: 97 additions & 0 deletions lldb/source/ValueObject/DILLexer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
//===-- DILLexer.cpp ------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// This implements the lexer for the Data Inspection Language (DIL), and its
// helper functions. DIL will eventually underlie the 'frame variable'
// command. The language whose tokens this lexer recognizes is described in
// lldb/docs/dil-expr-lang.ebnf
//
//===----------------------------------------------------------------------===//

#include "lldb/ValueObject/DILLexer.h"
#include "lldb/Utility/Status.h"
#include "llvm/ADT/StringSwitch.h"

namespace lldb_private::dil {

/// Returns a printable name for the given token kind.
llvm::StringRef Token::GetTokenName(Kind kind) {
  // Deliberately no 'default' label: with every enumerator listed, the
  // compiler can warn here when a new Kind is added but not handled.
  switch (kind) {
  case Kind::coloncolon:
    return "coloncolon";
  case Kind::eof:
    return "eof";
  case Kind::identifier:
    return "identifier";
  case Kind::l_paren:
    return "l_paren";
  case Kind::r_paren:
    return "r_paren";
  }
  // Only reached if 'kind' holds a value that is not a declared enumerator.
  // Returning here keeps the function from flowing off its end, which would
  // be undefined behaviour for a value-returning function.
  return "unknown";
}

// Returns true if c is an ASCII letter ('a'..'z' or 'A'..'Z').
static bool IsLetter(char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}

// Returns true if c is an ASCII decimal digit ('0'..'9').
static bool IsDigit(char c) {
  return c >= '0' && c <= '9';
}

// A word is the longest run of letters ('a'..'z','A'..'Z'), digits
// ('0'..'9'), underscores and dollar signs at the front of `remainder`,
// provided that the run is non-empty and does not begin with a digit.
//
// On success the word is returned and `remainder` is advanced past it. On
// failure std::nullopt is returned and `remainder` is left untouched. The
// `expr` parameter is not used here.
static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
                                             llvm::StringRef &remainder) {
  auto is_word_char = [](char c) {
    return c == '_' || c == '$' || IsLetter(c) || IsDigit(c);
  };

  // Measure the leading run of word characters.
  size_t length = 0;
  while (length < remainder.size() && is_word_char(remainder[length]))
    ++length;

  // Nothing matched, or the run looks like a number rather than a word.
  if (length == 0 || IsDigit(remainder.front()))
    return std::nullopt;

  llvm::StringRef word = remainder.take_front(length);
  remainder = remainder.drop_front(length);
  return word;
}

/// Lexes the whole of expr into tokens, stopping after the 'eof' token has
/// been produced, and wraps them in a DILLexer. The first lexing error is
/// returned to the caller as-is.
llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
  std::vector<Token> lexed;
  llvm::StringRef unlexed = expr;
  for (;;) {
    llvm::Expected<Token> maybe_token = Lex(expr, unlexed);
    if (!maybe_token)
      return maybe_token.takeError();

    lexed.push_back(std::move(*maybe_token));
    // The 'eof' token is always the last one stored.
    if (lexed.back().GetKind() == Token::eof)
      break;
  }
  return DILLexer(expr, std::move(lexed));
}

/// Lexes one token from the front of remainder and advances remainder past
/// it. expr is the full expression; it is used to compute token offsets, so
/// remainder is expected to refer to a suffix of expr. Returns an error if
/// the text at the front of remainder does not start any known token.
llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                                    llvm::StringRef &remainder) {
  // Drop leading whitespace so remainder starts at the next token, if any.
  remainder = remainder.ltrim();

  // Out of input: produce the 'eof' token, located at the end of expr.
  if (remainder.empty())
    return Token(Token::eof, "", static_cast<uint32_t>(expr.size()));

  // Offset of the upcoming token within the full expression. This must be
  // taken before anything is consumed from remainder.
  const uint32_t token_start = remainder.begin() - expr.begin();

  if (std::optional<llvm::StringRef> word = IsWord(expr, remainder))
    return Token(Token::identifier, word->str(), token_start);

  // Punctuation tokens and their spellings.
  struct Punctuator {
    Token::Kind kind;
    const char *spelling;
  };
  constexpr Punctuator punctuators[] = {
      {Token::l_paren, "("},
      {Token::r_paren, ")"},
      {Token::coloncolon, "::"},
  };
  for (const Punctuator &punct : punctuators) {
    if (remainder.consume_front(punct.spelling))
      return Token(punct.kind, punct.spelling, token_start);
  }

  // Unrecognized character(s) in string; unable to lex it.
  return llvm::createStringError("Unable to lex input string");
}

} // namespace lldb_private::dil
2 changes: 2 additions & 0 deletions lldb/unittests/ValueObject/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
add_lldb_unittest(LLDBValueObjectTests
DumpValueObjectOptionsTests.cpp
DILLexerTests.cpp

LINK_LIBS
lldbValueObject
lldbPluginPlatformLinux
lldbPluginScriptInterpreterNone
LLVMTestingSupport

LINK_COMPONENTS
Support
Expand Down
156 changes: 156 additions & 0 deletions lldb/unittests/ValueObject/DILLexerTests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
//===-- DILLexerTests.cpp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "lldb/ValueObject/DILLexer.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
#include <string>

using llvm::StringRef;

using namespace lldb_private::dil;

// Test helper: lexes input_expr and returns the (kind, spelling) pair of
// every token before the 'eof' token, in order. A lexing failure is passed
// through as the error.
llvm::Expected<std::vector<std::pair<Token::Kind, std::string>>>
ExtractTokenData(llvm::StringRef input_expr) {
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  if (!maybe_lexer)
    return maybe_lexer.takeError();
  DILLexer lexer(*maybe_lexer);

  std::vector<std::pair<Token::Kind, std::string>> data;
  // Collect tokens until the 'eof' token is reached; it is not recorded.
  while (lexer.GetCurrentToken().IsNot(Token::eof)) {
    Token tok = lexer.GetCurrentToken();
    data.emplace_back(tok.GetKind(), tok.GetSpelling());
    lexer.Advance();
  }
  return data;
}

// A lone identifier lexes into one identifier token followed by 'eof'.
TEST(DILLexerTests, SimpleTest) {
  StringRef expr_text("simple_var");
  llvm::Expected<DILLexer> lexer_or_err = DILLexer::Create(expr_text);
  ASSERT_THAT_EXPECTED(lexer_or_err, llvm::Succeeded());
  DILLexer lexer(*lexer_or_err);

  // First token: the identifier itself.
  Token first = lexer.GetCurrentToken();
  EXPECT_EQ(first.GetKind(), Token::identifier);
  EXPECT_EQ(first.GetSpelling(), "simple_var");

  // Second token: end of input.
  lexer.Advance();
  Token second = lexer.GetCurrentToken();
  EXPECT_EQ(second.GetKind(), Token::eof);
}

// Exercises the kind predicates on a hand-built identifier token.
TEST(DILLexerTests, TokenKindTest) {
  const Token ident_tok(Token::identifier, "ident", 0);

  // Exact-kind check.
  EXPECT_TRUE(ident_tok.Is(Token::identifier));
  EXPECT_FALSE(ident_tok.Is(Token::l_paren));

  // Two-kind overload, matching on the second kind.
  EXPECT_TRUE(ident_tok.IsOneOf(Token::eof, Token::identifier));

  // Variadic overload with no matching kind.
  EXPECT_FALSE(ident_tok.IsOneOf(Token::l_paren, Token::r_paren,
                                 Token::coloncolon, Token::eof));
}

// Checks that LookAhead() inspects upcoming tokens without moving the
// cursor, and that Advance() then moves it by the requested amount.
TEST(DILLexerTests, LookAheadTest) {
  StringRef input_expr("(anonymous namespace)::some_var");
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
  DILLexer lexer(*maybe_lexer);
  Token token = lexer.GetCurrentToken();

  // Current token is '('; check the next 4 tokens, to make
  // sure they are the identifier 'anonymous', the identifier 'namespace'
  // ')' and '::', in that order.
  EXPECT_EQ(token.GetKind(), Token::l_paren);
  EXPECT_EQ(lexer.LookAhead(1).GetKind(), Token::identifier);
  EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous");
  EXPECT_EQ(lexer.LookAhead(2).GetKind(), Token::identifier);
  EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace");
  EXPECT_EQ(lexer.LookAhead(3).GetKind(), Token::r_paren);
  EXPECT_EQ(lexer.LookAhead(4).GetKind(), Token::coloncolon);

  // Our current index should still be 0, as we only looked ahead; we are still
  // officially on the '('.
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);

  // Accept the 'lookahead', so our current token is '::', which has the index
  // 4 in our vector of tokens (which starts at zero).
  lexer.Advance(4);
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::coloncolon);
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 4u);

  lexer.Advance();
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::identifier);
  EXPECT_EQ(token.GetSpelling(), "some_var");
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 5u);
  // 'some_var' starts right after the prefix below. The prefix length is
  // taken via StringRef so this file does not depend on strlen/<cstring>.
  EXPECT_EQ(token.GetLocation(), StringRef("(anonymous namespace)::").size());

  lexer.Advance();
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::eof);
}

// Lexes an expression mixing identifiers, parentheses and '::' with varying
// whitespace, and checks the full (kind, spelling) sequence.
TEST(DILLexerTests, MultiTokenLexTest) {
  auto expected_tokens =
      testing::ElementsAre(testing::Pair(Token::identifier, "This"),
                           testing::Pair(Token::identifier, "string"),
                           testing::Pair(Token::identifier, "has"),
                           testing::Pair(Token::l_paren, "("),
                           testing::Pair(Token::identifier, "several"),
                           testing::Pair(Token::r_paren, ")"),
                           testing::Pair(Token::coloncolon, "::"),
                           testing::Pair(Token::identifier, "identifiers"));

  EXPECT_THAT_EXPECTED(
      ExtractTokenData("This string has (several ) ::identifiers"),
      llvm::HasValue(expected_tokens));
}

// Checks which strings lex as identifiers, which lex as other tokens, and
// which cannot be lexed at all.
TEST(DILLexerTests, IdentifiersTest) {
  // These strings should lex into identifier tokens.
  const std::vector<std::string> valid_identifiers = {
      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",      "$",
      "a_b",       "this", "self", "a", "MyName", "namespace"};

  // The lexer can lex these strings, but they should not be identifiers.
  const std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"};

  // The lexer is expected to fail attempting to lex these strings (it cannot
  // create valid tokens out of them).
  const std::vector<std::string> invalid_tok_strings = {"234", "2a", "2",
                                                        "1MyName"};

  // Verify that all of the valid identifiers come out as identifier tokens.
  for (const std::string &str : valid_identifiers) {
    SCOPED_TRACE(str);
    EXPECT_THAT_EXPECTED(ExtractTokenData(str),
                         llvm::HasValue(testing::ElementsAre(
                             testing::Pair(Token::identifier, str))));
  }

  // Verify that the lexer fails on invalid token strings.
  for (const std::string &str : invalid_tok_strings) {
    SCOPED_TRACE(str);
    auto maybe_lexer = DILLexer::Create(str);
    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed());
  }

  // Verify that none of the invalid identifiers come out as identifier tokens.
  for (const std::string &str : invalid_identifiers) {
    SCOPED_TRACE(str);
    llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str);
    // Must be fatal: the lexer is dereferenced immediately below, which is
    // only valid when creation succeeded.
    ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
    DILLexer lexer(*maybe_lexer);
    Token token = lexer.GetCurrentToken();
    EXPECT_TRUE(token.IsNot(Token::identifier));
    EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren,
                              Token::r_paren));
  }
}