Skip to content

Commit b9f5b39

Browse files
authored
[TableGen] Remove explicit recursion in LexToken (#143697)
When profiling a Release+Asserts build of llvm-tblgen I noticed that it was recursing hundreds of times to lex a sequence of hundreds of space characters.
1 parent 8c28f49 commit b9f5b39

File tree

1 file changed

+156
-111
lines changed

1 file changed

+156
-111
lines changed

llvm/lib/TableGen/TGLexer.cpp

Lines changed: 156 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -174,129 +174,174 @@ int TGLexer::peekNextChar(int Index) const {
174174
}
175175

176176
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
177-
TokStart = CurPtr;
178-
// This always consumes at least one character.
179-
int CurChar = getNextChar();
177+
while (true) {
178+
TokStart = CurPtr;
179+
// This always consumes at least one character.
180+
int CurChar = getNextChar();
180181

181-
switch (CurChar) {
182-
default:
183-
// Handle letters: [a-zA-Z_]
184-
if (isValidIDChar(CurChar, /*First=*/true))
185-
return LexIdentifier();
186-
187-
// Unknown character, emit an error.
188-
return ReturnError(TokStart, "unexpected character");
189-
case EOF:
190-
// Lex next token, if we just left an include file.
191-
// Note that leaving an include file means that the next
192-
// symbol is located at the end of the 'include "..."'
193-
// construct, so LexToken() is called with default
194-
// false parameter.
195-
if (processEOF())
196-
return LexToken();
182+
switch (CurChar) {
183+
default:
184+
// Handle letters: [a-zA-Z_]
185+
if (isValidIDChar(CurChar, /*First=*/true))
186+
return LexIdentifier();
197187

198-
// Return EOF denoting the end of lexing.
199-
return tgtok::Eof;
200-
201-
case ':': return tgtok::colon;
202-
case ';': return tgtok::semi;
203-
case ',': return tgtok::comma;
204-
case '<': return tgtok::less;
205-
case '>': return tgtok::greater;
206-
case ']': return tgtok::r_square;
207-
case '{': return tgtok::l_brace;
208-
case '}': return tgtok::r_brace;
209-
case '(': return tgtok::l_paren;
210-
case ')': return tgtok::r_paren;
211-
case '=': return tgtok::equal;
212-
case '?': return tgtok::question;
213-
case '#':
214-
if (FileOrLineStart) {
215-
tgtok::TokKind Kind = prepIsDirective();
216-
if (Kind != tgtok::Error)
217-
return lexPreprocessor(Kind);
218-
}
188+
// Unknown character, emit an error.
189+
return ReturnError(TokStart, "unexpected character");
190+
case EOF:
191+
// Lex next token, if we just left an include file.
192+
if (processEOF()) {
193+
// Leaving an include file means that the next symbol is located at the
194+
// end of the 'include "..."' construct.
195+
FileOrLineStart = false;
196+
break;
197+
}
219198

220-
return tgtok::paste;
199+
// Return EOF denoting the end of lexing.
200+
return tgtok::Eof;
201+
202+
case ':':
203+
return tgtok::colon;
204+
case ';':
205+
return tgtok::semi;
206+
case ',':
207+
return tgtok::comma;
208+
case '<':
209+
return tgtok::less;
210+
case '>':
211+
return tgtok::greater;
212+
case ']':
213+
return tgtok::r_square;
214+
case '{':
215+
return tgtok::l_brace;
216+
case '}':
217+
return tgtok::r_brace;
218+
case '(':
219+
return tgtok::l_paren;
220+
case ')':
221+
return tgtok::r_paren;
222+
case '=':
223+
return tgtok::equal;
224+
case '?':
225+
return tgtok::question;
226+
case '#':
227+
if (FileOrLineStart) {
228+
tgtok::TokKind Kind = prepIsDirective();
229+
if (Kind != tgtok::Error)
230+
return lexPreprocessor(Kind);
231+
}
232+
233+
return tgtok::paste;
221234

222-
// The period is a separate case so we can recognize the "..."
223-
// range punctuator.
224-
case '.':
225-
if (peekNextChar(0) == '.') {
226-
++CurPtr; // Eat second dot.
235+
// The period is a separate case so we can recognize the "..."
236+
// range punctuator.
237+
case '.':
227238
if (peekNextChar(0) == '.') {
228-
++CurPtr; // Eat third dot.
229-
return tgtok::dotdotdot;
239+
++CurPtr; // Eat second dot.
240+
if (peekNextChar(0) == '.') {
241+
++CurPtr; // Eat third dot.
242+
return tgtok::dotdotdot;
243+
}
244+
return ReturnError(TokStart, "invalid '..' punctuation");
230245
}
231-
return ReturnError(TokStart, "invalid '..' punctuation");
232-
}
233-
return tgtok::dot;
246+
return tgtok::dot;
234247

235-
case '\r':
236-
llvm_unreachable("getNextChar() must never return '\r'");
248+
case '\r':
249+
llvm_unreachable("getNextChar() must never return '\r'");
237250

238-
case ' ':
239-
case '\t':
240-
// Ignore whitespace.
241-
return LexToken(FileOrLineStart);
242-
case '\n':
243-
// Ignore whitespace, and identify the new line.
244-
return LexToken(true);
245-
case '/':
246-
// If this is the start of a // comment, skip until the end of the line or
247-
// the end of the buffer.
248-
if (*CurPtr == '/')
249-
SkipBCPLComment();
250-
else if (*CurPtr == '*') {
251-
if (SkipCComment())
252-
return tgtok::Error;
253-
} else // Otherwise, this is an error.
254-
return ReturnError(TokStart, "unexpected character");
255-
return LexToken(FileOrLineStart);
256-
case '-': case '+':
257-
case '0': case '1': case '2': case '3': case '4': case '5': case '6':
258-
case '7': case '8': case '9': {
259-
int NextChar = 0;
260-
if (isDigit(CurChar)) {
261-
// Allow identifiers to start with a number if it is followed by
262-
// an identifier. This can happen with paste operations like
263-
// foo#8i.
264-
int i = 0;
265-
do {
266-
NextChar = peekNextChar(i++);
267-
} while (isDigit(NextChar));
268-
269-
if (NextChar == 'x' || NextChar == 'b') {
270-
// If this is [0-9]b[01] or [0-9]x[0-9A-Fa-f] this is most
271-
// likely a number.
272-
int NextNextChar = peekNextChar(i);
273-
switch (NextNextChar) {
274-
default:
275-
break;
276-
case '0': case '1':
277-
if (NextChar == 'b')
278-
return LexNumber();
279-
[[fallthrough]];
280-
case '2': case '3': case '4': case '5':
281-
case '6': case '7': case '8': case '9':
282-
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
283-
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
284-
if (NextChar == 'x')
285-
return LexNumber();
286-
break;
251+
case ' ':
252+
case '\t':
253+
// Ignore whitespace.
254+
break;
255+
case '\n':
256+
// Ignore whitespace, and identify the new line.
257+
FileOrLineStart = true;
258+
break;
259+
case '/':
260+
// If this is the start of a // comment, skip until the end of the line or
261+
// the end of the buffer.
262+
if (*CurPtr == '/')
263+
SkipBCPLComment();
264+
else if (*CurPtr == '*') {
265+
if (SkipCComment())
266+
return tgtok::Error;
267+
} else // Otherwise, this is an error.
268+
return ReturnError(TokStart, "unexpected character");
269+
break;
270+
case '-':
271+
case '+':
272+
case '0':
273+
case '1':
274+
case '2':
275+
case '3':
276+
case '4':
277+
case '5':
278+
case '6':
279+
case '7':
280+
case '8':
281+
case '9': {
282+
int NextChar = 0;
283+
if (isDigit(CurChar)) {
284+
// Allow identifiers to start with a number if it is followed by
285+
// an identifier. This can happen with paste operations like
286+
// foo#8i.
287+
int i = 0;
288+
do {
289+
NextChar = peekNextChar(i++);
290+
} while (isDigit(NextChar));
291+
292+
if (NextChar == 'x' || NextChar == 'b') {
293+
// If this is [0-9]b[01] or [0-9]x[0-9A-Fa-f] this is most
294+
// likely a number.
295+
int NextNextChar = peekNextChar(i);
296+
switch (NextNextChar) {
297+
default:
298+
break;
299+
case '0':
300+
case '1':
301+
if (NextChar == 'b')
302+
return LexNumber();
303+
[[fallthrough]];
304+
case '2':
305+
case '3':
306+
case '4':
307+
case '5':
308+
case '6':
309+
case '7':
310+
case '8':
311+
case '9':
312+
case 'a':
313+
case 'b':
314+
case 'c':
315+
case 'd':
316+
case 'e':
317+
case 'f':
318+
case 'A':
319+
case 'B':
320+
case 'C':
321+
case 'D':
322+
case 'E':
323+
case 'F':
324+
if (NextChar == 'x')
325+
return LexNumber();
326+
break;
327+
}
287328
}
288329
}
289-
}
290330

291-
if (isValidIDChar(NextChar, /*First=*/true))
292-
return LexIdentifier();
331+
if (isValidIDChar(NextChar, /*First=*/true))
332+
return LexIdentifier();
293333

294-
return LexNumber();
295-
}
296-
case '"': return LexString();
297-
case '$': return LexVarName();
298-
case '[': return LexBracket();
299-
case '!': return LexExclaim();
334+
return LexNumber();
335+
}
336+
case '"':
337+
return LexString();
338+
case '$':
339+
return LexVarName();
340+
case '[':
341+
return LexBracket();
342+
case '!':
343+
return LexExclaim();
344+
}
300345
}
301346
}
302347

0 commit comments

Comments
 (0)