Address remaining review comments:

cmtice · cmtice · commit 0b33ab7ea76e · 2025-02-03T12:36:26.000-08:00
- Remove 'unknown' token type.
- Remove UINT_MAX as a valid token index; always start at 0.
- Update IsWord to use Pavel's more efficient StringRef implementation.
- Allow '$' anywhere in an identifier string.
- Adjust unit tests to handle changes mentioned above.
- Clean up test of invalid identifiers (remove if-then-else; split invalid
  identifiers from unrecognized strings).
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -30,7 +30,6 @@ class Token {
     identifier,
     l_paren,
     r_paren,
-    unknown,
   };
 
   Token(Kind kind, std::string spelling, uint32_t start)
@@ -72,10 +71,7 @@ class DILLexer {
 
   /// Advance the current token position by N.
   void Advance(uint32_t N = 1) {
-    // UINT_MAX means uninitialized, no "current" position, so move to start.
-    if (m_tokens_idx == UINT_MAX)
-      m_tokens_idx = 0;
-    else if (m_tokens_idx + N >= m_lexed_tokens.size())
+    if (m_tokens_idx + N >= m_lexed_tokens.size())
       // N is too large; advance to the end of the lexed tokens.
       m_tokens_idx = m_lexed_tokens.size() - 1;
     else
@@ -99,7 +95,7 @@ class DILLexer {
   /// to a particular position. Used for either committing 'look ahead' parsing
   /// or rolling back tentative parsing.
   void ResetTokenIdx(uint32_t new_value) {
-    assert(new_value == UINT_MAX || new_value < m_lexed_tokens.size());
+    assert(new_value < m_lexed_tokens.size());
     m_tokens_idx = new_value;
   }
 
@@ -108,7 +104,7 @@ class DILLexer {
 private:
   DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
       : m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
-        m_tokens_idx(UINT_MAX), m_eof_token(Token(Token::eof, "", 0)) {}
+        m_tokens_idx(0), m_eof_token(Token(Token::eof, "", 0)) {}
 
   static llvm::Expected<Token> Lex(llvm::StringRef expr,
                                    llvm::StringRef &remainder);
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
@@ -29,8 +29,6 @@ llvm::StringRef Token::GetTokenName(Kind kind) {
     return "l_paren";
   case Kind::r_paren:
     return "r_paren";
-  case Kind::unknown:
-    return "unknown";
   }
 }
 
@@ -44,43 +42,14 @@ static bool IsDigit(char c) { return '0' <= c && c <= '9'; }
 // letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or  underscores.
 static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
                                              llvm::StringRef &remainder) {
-  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
-  llvm::StringRef::iterator start = cur_pos;
-  bool dollar_start = false;
-
-  // Must not start with a digit.
-  if (cur_pos == expr.end() || IsDigit(*cur_pos))
-    return std::nullopt;
-
-  // First character *may* be a '$', for a register name or convenience
-  // variable.
-  if (*cur_pos == '$') {
-    dollar_start = true;
-    ++cur_pos;
-  }
-
-  // Contains only letters, digits or underscores
-  for (; cur_pos != expr.end(); ++cur_pos) {
-    char c = *cur_pos;
-    if (!IsLetter(c) && !IsDigit(c) && c != '_')
-      break;
-  }
-
-  // If first char is '$', make sure there's at least one mare char, or it's
-  // invalid.
-  if (dollar_start && (cur_pos - start <= 1)) {
-    cur_pos = start;
-    return std::nullopt;
-  }
-
-  if (cur_pos == start)
+  // Find the longest prefix consisting of letters, digits, underscores and
+  // '$'. If it doesn't start with a digit, then it's a word.
+  llvm::StringRef candidate = remainder.take_while(
+      [](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; });
+  if (candidate.empty() || IsDigit(candidate[0]))
     return std::nullopt;
-
-  llvm::StringRef word = expr.substr(start - expr.begin(), cur_pos - start);
-  if (remainder.consume_front(word))
-    return word;
-
-  return std::nullopt;
+  remainder = remainder.drop_front(candidate.size());
+  return candidate;
 }
 
 llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
@@ -100,10 +69,10 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                                     llvm::StringRef &remainder) {
   // Skip over whitespace (spaces).
   remainder = remainder.ltrim();
-  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
+  llvm::StringRef::iterator cur_pos = remainder.begin();
 
   // Check to see if we've reached the end of our input string.
-  if (remainder.empty() || cur_pos == expr.end())
+  if (remainder.empty())
     return Token(Token::eof, "", (uint32_t)expr.size());
 
   uint32_t position = cur_pos - expr.begin();
diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp
@@ -27,13 +27,15 @@ ExtractTokenData(llvm::StringRef input_expr) {
   if (lexer.NumLexedTokens() == 0)
     return llvm::createStringError("No lexed tokens");
 
-  lexer.ResetTokenIdx(UINT_MAX);
+  lexer.ResetTokenIdx(0);
   std::vector<std::pair<Token::Kind, std::string>> data;
   do {
-    lexer.Advance();
     Token tok = lexer.GetCurrentToken();
     data.push_back(std::make_pair(tok.GetKind(), tok.GetSpelling()));
+    lexer.Advance();
   } while (data.back().first != Token::eof);
+  // Don't return the eof token.
+  data.pop_back();
   return data;
 }
 
@@ -42,11 +44,8 @@ TEST(DILLexerTests, SimpleTest) {
   llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
   ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
   DILLexer lexer(*maybe_lexer);
-  Token token = Token(Token::unknown, "", 0);
-  EXPECT_EQ(token.GetKind(), Token::unknown);
+  Token token = lexer.GetCurrentToken();
 
-  lexer.Advance();
-  token = lexer.GetCurrentToken();
   EXPECT_EQ(token.GetKind(), Token::identifier);
   EXPECT_EQ(token.GetSpelling(), "simple_var");
   lexer.Advance();
@@ -69,9 +68,7 @@ TEST(DILLexerTests, LookAheadTest) {
   llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
   ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
   DILLexer lexer(*maybe_lexer);
-  Token token = Token(Token::unknown, "", 0);
-  lexer.Advance();
-  token = lexer.GetCurrentToken();
+  Token token = lexer.GetCurrentToken();
 
   // Current token is '('; check the next 4 tokens, to make
   // sure they are the identifier 'anonymous', the identifier 'namespace'
@@ -110,49 +107,54 @@ TEST(DILLexerTests, LookAheadTest) {
 TEST(DILLexerTests, MultiTokenLexTest) {
   EXPECT_THAT_EXPECTED(
       ExtractTokenData("This string has (several ) ::identifiers"),
-      llvm::HasValue(
-          testing::ElementsAre(testing::Pair(Token::identifier, "This"),
-                               testing::Pair(Token::identifier, "string"),
-                               testing::Pair(Token::identifier, "has"),
-                               testing::Pair(Token::l_paren, "("),
-                               testing::Pair(Token::identifier, "several"),
-                               testing::Pair(Token::r_paren, ")"),
-                               testing::Pair(Token::coloncolon, "::"),
-                               testing::Pair(Token::identifier, "identifiers"),
-                               testing::Pair(Token::eof, ""))));
+      llvm::HasValue(testing::ElementsAre(
+          testing::Pair(Token::identifier, "This"),
+          testing::Pair(Token::identifier, "string"),
+          testing::Pair(Token::identifier, "has"),
+          testing::Pair(Token::l_paren, "("),
+          testing::Pair(Token::identifier, "several"),
+          testing::Pair(Token::r_paren, ")"),
+          testing::Pair(Token::coloncolon, "::"),
+          testing::Pair(Token::identifier, "identifiers"))));
 }
 
 TEST(DILLexerTests, IdentifiersTest) {
+  // These strings should lex into identifier tokens.
   std::vector<std::string> valid_identifiers = {
-      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",
+      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",      "$",
       "a_b",       "this", "self", "a", "MyName", "namespace"};
-  std::vector<std::string> invalid_identifiers = {"234", "2a",      "2",
-                                                  "$",   "1MyName", ""};
+
+  // The lexer can lex these strings, but they should not be identifiers.
+  std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"};
+
+  // The lexer is expected to fail attempting to lex these strings (it cannot
+  // create valid tokens out of them).
+  std::vector<std::string> invalid_tok_strings = {"234", "2a", "2", "1MyName"};
 
   // Verify that all of the valid identifiers come out as identifier tokens.
   for (auto &str : valid_identifiers) {
     SCOPED_TRACE(str);
     EXPECT_THAT_EXPECTED(ExtractTokenData(str),
                          llvm::HasValue(testing::ElementsAre(
-                             testing::Pair(Token::identifier, str),
-                             testing::Pair(Token::eof, ""))));
+                             testing::Pair(Token::identifier, str))));
+  }
+
+  // Verify that the lexer fails on invalid token strings.
+  for (auto &str : invalid_tok_strings) {
+    SCOPED_TRACE(str);
+    auto maybe_lexer = DILLexer::Create(str);
+    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed());
   }
 
   // Verify that none of the invalid identifiers come out as identifier tokens.
   for (auto &str : invalid_identifiers) {
     SCOPED_TRACE(str);
     llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str);
-    if (!maybe_lexer) {
-      llvm::consumeError(maybe_lexer.takeError());
-      // In this case, it's ok for lexing to return an error.
-    } else {
-      DILLexer lexer(*maybe_lexer);
-      Token token = Token(Token::unknown, "", 0);
-      // We didn't get an error; make sure we did not get an identifier token.
-      lexer.Advance();
-      token = lexer.GetCurrentToken();
-      EXPECT_TRUE(token.IsNot(Token::identifier));
-      EXPECT_TRUE(token.IsOneOf(Token::unknown, Token::eof));
-    }
+    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+    DILLexer lexer(*maybe_lexer);
+    Token token = lexer.GetCurrentToken();
+    EXPECT_TRUE(token.IsNot(Token::identifier));
+    EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren,
+                              Token::r_paren));
   }
 }