[TableGen] Remove explicit recursion in LexToken (#143697)

jayfoad · web-flow · commit b9f5b39e04d2 · 2025-07-16T21:54:49.000+01:00
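When profiling a Release+Asserts build of llvm-tblgen I noticed that it
was recursing hundreds of times to lex a sequence of hundreds of space
characters. Wrap the body of LexToken in a loop so that skipping
whitespace, skipping comments, and resuming after the end of an include
file continue with the next loop iteration instead of making a recursive
call to LexToken.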
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
@@ -174,129 +174,174 @@ int TGLexer::peekNextChar(int Index) const {
 }
 
 tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
-  TokStart = CurPtr;
-  // This always consumes at least one character.
-  int CurChar = getNextChar();
+  while (true) {
+    TokStart = CurPtr;
+    // This always consumes at least one character.
+    int CurChar = getNextChar();
 
-  switch (CurChar) {
-  default:
-    // Handle letters: [a-zA-Z_]
-    if (isValidIDChar(CurChar, /*First=*/true))
-      return LexIdentifier();
-
-    // Unknown character, emit an error.
-    return ReturnError(TokStart, "unexpected character");
-  case EOF:
-    // Lex next token, if we just left an include file.
-    // Note that leaving an include file means that the next
-    // symbol is located at the end of the 'include "..."'
-    // construct, so LexToken() is called with default
-    // false parameter.
-    if (processEOF())
-      return LexToken();
+    switch (CurChar) {
+    default:
+      // Handle letters: [a-zA-Z_]
+      if (isValidIDChar(CurChar, /*First=*/true))
+        return LexIdentifier();
 
-    // Return EOF denoting the end of lexing.
-    return tgtok::Eof;
-
-  case ':': return tgtok::colon;
-  case ';': return tgtok::semi;
-  case ',': return tgtok::comma;
-  case '<': return tgtok::less;
-  case '>': return tgtok::greater;
-  case ']': return tgtok::r_square;
-  case '{': return tgtok::l_brace;
-  case '}': return tgtok::r_brace;
-  case '(': return tgtok::l_paren;
-  case ')': return tgtok::r_paren;
-  case '=': return tgtok::equal;
-  case '?': return tgtok::question;
-  case '#':
-    if (FileOrLineStart) {
-      tgtok::TokKind Kind = prepIsDirective();
-      if (Kind != tgtok::Error)
-        return lexPreprocessor(Kind);
-    }
+      // Unknown character, emit an error.
+      return ReturnError(TokStart, "unexpected character");
+    case EOF:
+      // Lex next token, if we just left an include file.
+      if (processEOF()) {
+        // Leaving an include file means that the next symbol is located at the
+        // end of the 'include "..."' construct.
+        FileOrLineStart = false;
+        break;
+      }
 
-    return tgtok::paste;
+      // Return EOF denoting the end of lexing.
+      return tgtok::Eof;
+
+    case ':':
+      return tgtok::colon;
+    case ';':
+      return tgtok::semi;
+    case ',':
+      return tgtok::comma;
+    case '<':
+      return tgtok::less;
+    case '>':
+      return tgtok::greater;
+    case ']':
+      return tgtok::r_square;
+    case '{':
+      return tgtok::l_brace;
+    case '}':
+      return tgtok::r_brace;
+    case '(':
+      return tgtok::l_paren;
+    case ')':
+      return tgtok::r_paren;
+    case '=':
+      return tgtok::equal;
+    case '?':
+      return tgtok::question;
+    case '#':
+      if (FileOrLineStart) {
+        tgtok::TokKind Kind = prepIsDirective();
+        if (Kind != tgtok::Error)
+          return lexPreprocessor(Kind);
+      }
+
+      return tgtok::paste;
 
-  // The period is a separate case so we can recognize the "..."
-  // range punctuator.
-  case '.':
-    if (peekNextChar(0) == '.') {
-      ++CurPtr; // Eat second dot.
+      // The period is a separate case so we can recognize the "..."
+      // range punctuator.
+    case '.':
       if (peekNextChar(0) == '.') {
-        ++CurPtr; // Eat third dot.
-        return tgtok::dotdotdot;
+        ++CurPtr; // Eat second dot.
+        if (peekNextChar(0) == '.') {
+          ++CurPtr; // Eat third dot.
+          return tgtok::dotdotdot;
+        }
+        return ReturnError(TokStart, "invalid '..' punctuation");
       }
-      return ReturnError(TokStart, "invalid '..' punctuation");
-    }
-    return tgtok::dot;
+      return tgtok::dot;
 
-  case '\r':
-    llvm_unreachable("getNextChar() must never return '\r'");
+    case '\r':
+      llvm_unreachable("getNextChar() must never return '\r'");
 
-  case ' ':
-  case '\t':
-    // Ignore whitespace.
-    return LexToken(FileOrLineStart);
-  case '\n':
-    // Ignore whitespace, and identify the new line.
-    return LexToken(true);
-  case '/':
-    // If this is the start of a // comment, skip until the end of the line or
-    // the end of the buffer.
-    if (*CurPtr == '/')
-      SkipBCPLComment();
-    else if (*CurPtr == '*') {
-      if (SkipCComment())
-        return tgtok::Error;
-    } else // Otherwise, this is an error.
-      return ReturnError(TokStart, "unexpected character");
-    return LexToken(FileOrLineStart);
-  case '-': case '+':
-  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
-  case '7': case '8': case '9': {
-    int NextChar = 0;
-    if (isDigit(CurChar)) {
-      // Allow identifiers to start with a number if it is followed by
-      // an identifier. This can happen with paste operations like
-      // foo#8i.
-      int i = 0;
-      do {
-        NextChar = peekNextChar(i++);
-      } while (isDigit(NextChar));
-
-      if (NextChar == 'x' || NextChar == 'b') {
-        // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
-        // likely a number.
-        int NextNextChar = peekNextChar(i);
-        switch (NextNextChar) {
-        default:
-          break;
-        case '0': case '1':
-          if (NextChar == 'b')
-            return LexNumber();
-          [[fallthrough]];
-        case '2': case '3': case '4': case '5':
-        case '6': case '7': case '8': case '9':
-        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-          if (NextChar == 'x')
-            return LexNumber();
-          break;
+    case ' ':
+    case '\t':
+      // Ignore whitespace.
+      break;
+    case '\n':
+      // Ignore whitespace, and identify the new line.
+      FileOrLineStart = true;
+      break;
+    case '/':
+      // If this is the start of a // comment, skip until the end of the line or
+      // the end of the buffer.
+      if (*CurPtr == '/')
+        SkipBCPLComment();
+      else if (*CurPtr == '*') {
+        if (SkipCComment())
+          return tgtok::Error;
+      } else // Otherwise, this is an error.
+        return ReturnError(TokStart, "unexpected character");
+      break;
+    case '-':
+    case '+':
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9': {
+      int NextChar = 0;
+      if (isDigit(CurChar)) {
+        // Allow identifiers to start with a number if it is followed by
+        // an identifier.  This can happen with paste operations like
+        // foo#8i.
+        int i = 0;
+        do {
+          NextChar = peekNextChar(i++);
+        } while (isDigit(NextChar));
+
+        if (NextChar == 'x' || NextChar == 'b') {
+          // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
+          // likely a number.
+          int NextNextChar = peekNextChar(i);
+          switch (NextNextChar) {
+          default:
+            break;
+          case '0':
+          case '1':
+            if (NextChar == 'b')
+              return LexNumber();
+            [[fallthrough]];
+          case '2':
+          case '3':
+          case '4':
+          case '5':
+          case '6':
+          case '7':
+          case '8':
+          case '9':
+          case 'a':
+          case 'b':
+          case 'c':
+          case 'd':
+          case 'e':
+          case 'f':
+          case 'A':
+          case 'B':
+          case 'C':
+          case 'D':
+          case 'E':
+          case 'F':
+            if (NextChar == 'x')
+              return LexNumber();
+            break;
+          }
         }
       }
-    }
 
-    if (isValidIDChar(NextChar, /*First=*/true))
-      return LexIdentifier();
+      if (isValidIDChar(NextChar, /*First=*/true))
+        return LexIdentifier();
 
-    return LexNumber();
-  }
-  case '"': return LexString();
-  case '$': return LexVarName();
-  case '[': return LexBracket();
-  case '!': return LexExclaim();
+      return LexNumber();
+    }
+    case '"':
+      return LexString();
+    case '$':
+      return LexVarName();
+    case '[':
+      return LexBracket();
+    case '!':
+      return LexExclaim();
+    }
   }
 }