Address latest review comments:

cmtice · cmtice · commit ccf5203595ec · 2025-01-30T15:07:33.000-08:00
- Remove 'namespace' as a keyword (make it a normal identifier)
- Remove 'invalid' and 'none' token types.
- Remove unnecessary SetKind and GetLength methods from Tokens.
- Re-arrange Lexer:
  - Give it a static Create method, which pre-lexes all the tokens
  - Make Lex method static
  - Pull IsWord method out of Lexer class
  - Make the Lexer constructor private.
- Remove LexAll, GetLocation, UpdateLexedTokens, AcceptLookAhead, GetNextToken,
  and IncrementTokenIdx methods from Lexer class.
- Add new 'Advance' method (to help replace some of the removed methods).
- Update indexing in LookAhead (LookAhead(0) now means the 'current' token).
- Remove m_cur_pos data member from Lexer class.
- Replace m_invalid_token with m_eof_token.
- Use 'remainder' StringRef to help with lexing.
- Update the unit tests to handle all the code changes in the Lexer.
- Update the unit tests to use ASSERT_THAT_EXPECTED to check llvm::Expected
  return values.
- Update the unit tests to use "testing::ElementsAre(testing::Pair ..." to
  verify all the lexed tokens; also added helper function ExtractTokenData, and
  deleted function VerifyExpectedTokens.
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -18,9 +18,7 @@
 #include <string>
 #include <vector>
 
-namespace lldb_private {
-
-namespace dil {
+namespace lldb_private::dil {
 
 /// Class defining the tokens generated by the DIL lexer and used by the
 /// DIL parser.
@@ -30,27 +28,18 @@ class Token {
     coloncolon,
     eof,
     identifier,
-    invalid,
-    kw_namespace,
     l_paren,
-    none,
     r_paren,
     unknown,
   };
 
   Token(Kind kind, std::string spelling, uint32_t start)
       : m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
 
-  Token() : m_kind(Kind::none), m_spelling(""), m_start_pos(0) {}
-
-  void SetKind(Kind kind) { m_kind = kind; }
-
   Kind GetKind() const { return m_kind; }
 
   std::string GetSpelling() const { return m_spelling; }
 
-  uint32_t GetLength() const { return m_spelling.size(); }
-
   bool Is(Kind kind) const { return m_kind == kind; }
 
   bool IsNot(Kind kind) const { return m_kind != kind; }
@@ -74,85 +63,69 @@ class Token {
 /// Class for doing the simple lexing required by DIL.
 class DILLexer {
 public:
-  DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr) {
-    m_cur_pos = m_expr.begin();
-    // Use UINT_MAX to indicate invalid/uninitialized value.
-    m_tokens_idx = UINT_MAX;
-    m_invalid_token = Token(Token::invalid, "", 0);
-  }
-
-  llvm::Expected<bool> LexAll();
-
-  /// Return the lexed token N+1 positions ahead of the 'current' token
-  /// being handled by the DIL parser.
-  const Token &LookAhead(uint32_t N);
-
-  const Token &AcceptLookAhead(uint32_t N);
-
-  const Token &GetNextToken();
-
-  /// Return the index for the 'current' token being handled by the DIL parser.
-  uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }
+  /// Lexes all the tokens in expr and calls the private constructor
+  /// with the lexed tokens.
+  static llvm::Expected<DILLexer> Create(llvm::StringRef expr);
 
   /// Return the current token to be handled by the DIL parser.
   const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }
 
-  uint32_t NumLexedTokens() { return m_lexed_tokens.size(); }
+  /// Advance the current token position by N.
+  void Advance(uint32_t N = 1) {
+    // UINT_MAX means uninitialized, no "current" position, so move to start.
+    if (m_tokens_idx == UINT_MAX)
+      m_tokens_idx = 0;
+    else if (m_tokens_idx + N >= m_lexed_tokens.size())
+      // N is too large; advance to the end of the lexed tokens.
+      m_tokens_idx = m_lexed_tokens.size() - 1;
+    else
+      m_tokens_idx += N;
+  }
 
-  /// Update the index for the 'current' token, to point to the next lexed
-  /// token.
-  bool IncrementTokenIdx() {
-    if (m_tokens_idx >= m_lexed_tokens.size() - 1)
-      return false;
+  /// Return the lexed token N positions ahead of the 'current' token
+  /// being handled by the DIL parser.
+  const Token &LookAhead(uint32_t N) {
+    if (m_tokens_idx + N < m_lexed_tokens.size())
+      return m_lexed_tokens[m_tokens_idx + N];
 
-    m_tokens_idx++;
-    return true;
+    return m_eof_token;
   }
 
+  /// Return the index for the 'current' token being handled by the DIL parser.
+  uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }
+
   /// Set the index for the 'current' token (to be handled by the parser)
   /// to a particular position. Used for either committing 'look ahead' parsing
   /// or rolling back tentative parsing.
-  bool ResetTokenIdx(uint32_t new_value) {
-    if (new_value > m_lexed_tokens.size() - 1)
-      return false;
-
+  void ResetTokenIdx(uint32_t new_value) {
+    assert(new_value == UINT_MAX || new_value < m_lexed_tokens.size());
     m_tokens_idx = new_value;
-    return true;
   }
 
-  uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }
+  uint32_t NumLexedTokens() { return m_lexed_tokens.size(); }
 
 private:
-  llvm::Expected<Token> Lex();
+  DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
+      : m_expr(dil_expr), m_lexed_tokens(lexed_tokens), m_tokens_idx(UINT_MAX),
+        m_eof_token(Token(Token::eof, "", 0)) {}
 
-  llvm::iterator_range<llvm::StringRef::iterator> IsWord();
-
-  /// Update 'result' with the other paremeter values, create a
-  /// duplicate token, and push the duplicate token onto the vector of
-  /// lexed tokens.
-  void UpdateLexedTokens(Token &result, Token::Kind tok_kind,
-                         std::string tok_str, uint32_t tok_pos);
+  static llvm::Expected<Token> Lex(llvm::StringRef expr,
+                                   llvm::StringRef &remainder);
 
   // The input string we are lexing & parsing.
   llvm::StringRef m_expr;
 
-  // The current position of the lexer within m_expr (the character position,
-  // within the string, of the next item to be lexed).
-  llvm::StringRef::iterator m_cur_pos;
-
   // Holds all of the tokens lexed so far.
   std::vector<Token> m_lexed_tokens;
 
   // Index into m_lexed_tokens; indicates which token the DIL parser is
   // currently trying to parse/handle.
   uint32_t m_tokens_idx;
 
-  // "invalid" token; to be returned by lexer when 'look ahead' fails.
-  Token m_invalid_token;
+  // "eof" token; to be returned by lexer when 'look ahead' fails.
+  Token m_eof_token;
 };
 
-} // namespace dil
-
-} // namespace lldb_private
+} // namespace lldb_private::dil
 
 #endif // LLDB_VALUEOBJECT_DILLEXER_H_
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
@@ -15,9 +15,7 @@
 #include "lldb/Utility/Status.h"
 #include "llvm/ADT/StringSwitch.h"
 
-namespace lldb_private {
-
-namespace dil {
+namespace lldb_private::dil {
 
 llvm::StringRef Token::GetTokenName(Kind kind) {
   switch (kind) {
@@ -27,14 +25,8 @@ llvm::StringRef Token::GetTokenName(Kind kind) {
     return "eof";
   case Kind::identifier:
     return "identifier";
-  case Kind::invalid:
-    return "invalid";
-  case Kind::kw_namespace:
-    return "namespace";
   case Kind::l_paren:
     return "l_paren";
-  case Kind::none:
-    return "none";
   case Kind::r_paren:
     return "r_paren";
   case Kind::unknown:
@@ -50,140 +42,91 @@ static bool IsDigit(char c) { return '0' <= c && c <= '9'; }
 
 // A word starts with a letter, underscore, or dollar sign, followed by
 // letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or  underscores.
-llvm::iterator_range<llvm::StringRef::iterator> DILLexer::IsWord() {
-  llvm::StringRef::iterator start = m_cur_pos;
+static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
+                                             llvm::StringRef &remainder) {
+  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
+  llvm::StringRef::iterator start = cur_pos;
   bool dollar_start = false;
 
   // Must not start with a digit.
-  if (m_cur_pos == m_expr.end() || IsDigit(*m_cur_pos))
-    return llvm::make_range(m_cur_pos, m_cur_pos);
+  if (cur_pos == expr.end() || IsDigit(*cur_pos))
+    return std::nullopt;
 
   // First character *may* be a '$', for a register name or convenience
   // variable.
-  if (*m_cur_pos == '$') {
+  if (*cur_pos == '$') {
     dollar_start = true;
-    ++m_cur_pos;
+    ++cur_pos;
   }
 
   // Contains only letters, digits or underscores
-  for (; m_cur_pos != m_expr.end(); ++m_cur_pos) {
-    char c = *m_cur_pos;
+  for (; cur_pos != expr.end(); ++cur_pos) {
+    char c = *cur_pos;
     if (!IsLetter(c) && !IsDigit(c) && c != '_')
       break;
   }
 
   // If first char is '$', make sure there's at least one mare char, or it's
   // invalid.
-  if (dollar_start && (m_cur_pos - start <= 1)) {
-    m_cur_pos = start;
-    return llvm::make_range(start, start); // Empty range
+  if (dollar_start && (cur_pos - start <= 1)) {
+    cur_pos = start;
+    return std::nullopt;
   }
 
-  return llvm::make_range(start, m_cur_pos);
-}
+  if (cur_pos == start)
+    return std::nullopt;
+
+  llvm::StringRef word = expr.substr(start - expr.begin(), cur_pos - start);
+  if (remainder.consume_front(word))
+    return word;
 
-void DILLexer::UpdateLexedTokens(Token &result, Token::Kind tok_kind,
-                                 std::string tok_str, uint32_t tok_pos) {
-  Token new_token(tok_kind, tok_str, tok_pos);
-  result = new_token;
-  m_lexed_tokens.push_back(std::move(new_token));
+  return std::nullopt;
 }
 
-llvm::Expected<bool> DILLexer::LexAll() {
-  bool done = false;
-  while (!done) {
-    auto tok_or_err = Lex();
-    if (!tok_or_err)
-      return tok_or_err.takeError();
-    Token token = *tok_or_err;
-    if (token.GetKind() == Token::eof) {
-      done = true;
+llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
+  std::vector<Token> tokens;
+  llvm::StringRef remainder = expr;
+  do {
+    if (llvm::Expected<Token> t = Lex(expr, remainder)) {
+      tokens.push_back(std::move(*t));
+    } else {
+      return t.takeError();
     }
-  }
-  return true;
+  } while (tokens.back().GetKind() != Token::eof);
+  return DILLexer(expr, std::move(tokens));
 }
 
-llvm::Expected<Token> DILLexer::Lex() {
-  Token result;
-
+llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
+                                    llvm::StringRef &remainder) {
   // Skip over whitespace (spaces).
-  while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
-    m_cur_pos++;
+  remainder = remainder.ltrim();
+  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
 
   // Check to see if we've reached the end of our input string.
-  if (m_cur_pos == m_expr.end()) {
-    UpdateLexedTokens(result, Token::eof, "", (uint32_t)m_expr.size());
-    return result;
+  if (remainder.empty() || cur_pos == expr.end())
+    return Token(Token::eof, "", (uint32_t)expr.size());
+
+  uint32_t position = cur_pos - expr.begin();
+  std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
+  if (maybe_word) {
+    llvm::StringRef word = *maybe_word;
+    return Token(Token::identifier, word.str(), position);
   }
 
-  uint32_t position = m_cur_pos - m_expr.begin();
-  llvm::StringRef::iterator start = m_cur_pos;
-  llvm::iterator_range<llvm::StringRef::iterator> word_range = IsWord();
-  if (!word_range.empty()) {
-    uint32_t length = word_range.end() - word_range.begin();
-    llvm::StringRef word(m_expr.substr(position, length));
-    // We will be adding more keywords here in the future...
-    Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
-                           .Case("namespace", Token::kw_namespace)
-                           .Default(Token::identifier);
-    UpdateLexedTokens(result, kind, word.str(), position);
-    return result;
-  }
-
-  m_cur_pos = start;
-  llvm::StringRef remainder(m_expr.substr(position, m_expr.end() - m_cur_pos));
-  std::vector<std::pair<Token::Kind, const char *>> operators = {
+  constexpr std::pair<Token::Kind, const char *> operators[] = {
       {Token::l_paren, "("},
       {Token::r_paren, ")"},
       {Token::coloncolon, "::"},
   };
   for (auto [kind, str] : operators) {
     if (remainder.consume_front(str)) {
-      m_cur_pos += strlen(str);
-      UpdateLexedTokens(result, kind, str, position);
-      return result;
+      cur_pos += strlen(str);
+      return Token(kind, str, position);
     }
   }
 
   // Unrecognized character(s) in string; unable to lex it.
-  Status error("Unable to lex input string");
-  return error.ToError();
-}
-
-const Token &DILLexer::LookAhead(uint32_t N) {
-  if (m_tokens_idx + N + 1 < m_lexed_tokens.size())
-    return m_lexed_tokens[m_tokens_idx + N + 1];
-
-  return m_invalid_token;
+  return llvm::createStringError("Unable to lex input string");
 }
 
-const Token &DILLexer::AcceptLookAhead(uint32_t N) {
-  if (m_tokens_idx + N + 1 > m_lexed_tokens.size())
-    return m_invalid_token;
-
-  m_tokens_idx += N + 1;
-  return m_lexed_tokens[m_tokens_idx];
-}
-
-const Token &DILLexer::GetNextToken() {
-  if (m_tokens_idx == UINT_MAX)
-    m_tokens_idx = 0;
-  else
-    m_tokens_idx++;
-
-  // Return the next token in the vector of lexed tokens.
-  if (m_tokens_idx < m_lexed_tokens.size())
-    return m_lexed_tokens[m_tokens_idx];
-
-  // We're already at/beyond the end of our lexed tokens. If the last token
-  // is an eof token, return it.
-  if (m_lexed_tokens[m_lexed_tokens.size() - 1].GetKind() == Token::eof)
-    return m_lexed_tokens[m_lexed_tokens.size() - 1];
-
-  // Return the invalid token.
-  return m_invalid_token;
-}
-
-} // namespace dil
-
-} // namespace lldb_private
+} // namespace lldb_private::dil
diff --git a/lldb/unittests/ValueObject/CMakeLists.txt b/lldb/unittests/ValueObject/CMakeLists.txt
@@ -6,6 +6,7 @@ add_lldb_unittest(LLDBValueObjectTests
     lldbValueObject
     lldbPluginPlatformLinux
     lldbPluginScriptInterpreterNone
+    LLVMTestingSupport
 
   LINK_COMPONENTS
     Support
diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp