Skip to content

Commit 67ce4ab

Browse files
authored
[llvm][mustache] Use single pass when tokenizing (#159196)
The old implementation used many string searches over the same portions of the strings. This version sacrifices some API niceness for perf wins.

| Metric | Baseline | Single-Pass | Change |
| --- | --- | --- | --- |
| Time (ms) | 36.09 | 35.78 | -0.86% |
| Cycles | 35.3M | 35.0M | -0.79% |
| Instructions | 86.7M | 85.8M | -1.03% |
| Branch Misses | 116K | 114K | -1.91% |
| Cache Misses | 244K | 232K | -4.98% |
1 parent 37825ad commit 67ce4ab

File tree

1 file changed

+71
-113
lines changed

1 file changed

+71
-113
lines changed

llvm/lib/Support/Mustache.cpp

Lines changed: 71 additions & 113 deletions
Original file line number | Diff line number | Diff line change
@@ -368,141 +368,99 @@ struct Tag {
368368
llvm_unreachable("Unknown json::Value::Kind");
369369
}
370370

371-
static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
372-
StringRef Close) {
373-
const StringLiteral TripleOpen("{{{");
374-
const StringLiteral TripleClose("}}}");
375-
376-
size_t NormalOpenPos = Template.find(Open, StartPos);
377-
size_t TripleOpenPos = Template.find(TripleOpen, StartPos);
378-
379-
Tag Result;
380-
381-
// Determine which tag comes first.
382-
if (TripleOpenPos != StringRef::npos &&
383-
(NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) {
384-
// Found a triple mustache tag.
385-
size_t EndPos =
386-
Template.find(TripleClose, TripleOpenPos + TripleOpen.size());
387-
if (EndPos == StringRef::npos)
388-
return Result; // No closing tag found.
389-
390-
Result.TagKind = Tag::Kind::Triple;
391-
Result.StartPosition = TripleOpenPos;
392-
size_t ContentStart = TripleOpenPos + TripleOpen.size();
393-
Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
394-
Result.FullMatch = Template.substr(
395-
TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos);
396-
} else if (NormalOpenPos != StringRef::npos) {
397-
// Found a normal mustache tag.
398-
size_t EndPos = Template.find(Close, NormalOpenPos + Open.size());
399-
if (EndPos == StringRef::npos)
400-
return Result; // No closing tag found.
401-
402-
Result.TagKind = Tag::Kind::Normal;
403-
Result.StartPosition = NormalOpenPos;
404-
size_t ContentStart = NormalOpenPos + Open.size();
405-
Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
406-
Result.FullMatch =
407-
Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos);
408-
}
409-
410-
return Result;
411-
}
412-
413-
static std::optional<std::pair<StringRef, StringRef>>
414-
processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) {
415-
LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
416-
<< ", Kind: " << tagKindToString(T.TagKind) << "\n");
417-
if (T.TagKind == Tag::Kind::Triple) {
418-
Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx);
419-
return std::nullopt;
420-
}
421-
StringRef Interpolated = T.Content;
422-
if (!Interpolated.trim().starts_with("=")) {
423-
char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
424-
Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx);
425-
return std::nullopt;
426-
}
427-
Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx);
428-
StringRef DelimSpec = Interpolated.trim();
429-
DelimSpec = DelimSpec.drop_front(1);
430-
DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
431-
DelimSpec = DelimSpec.trim();
432-
433-
std::pair<StringRef, StringRef> Ret = DelimSpec.split(' ');
434-
LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first
435-
<< ", NewClose: " << Ret.second << "\n");
436-
return Ret;
437-
}
438-
439371
// Simple tokenizer that splits the template into tokens.
440-
// The mustache spec allows {{{ }}} to unescape variables,
441-
// but we don't support that here. An unescape variable
442-
// is represented only by {{& variable}}.
443372
static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) {
444373
LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
445374
SmallVector<Token> Tokens;
446375
SmallString<8> Open("{{");
447376
SmallString<8> Close("}}");
448-
size_t Start = 0;
377+
size_t Cursor = 0;
378+
size_t TextStart = 0;
449379

450-
while (Start < Template.size()) {
451-
LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open
452-
<< "', Close:'" << Close << "'\n");
453-
Tag T = findNextTag(Template, Start, Open, Close);
380+
const StringLiteral TripleOpen("{{{");
381+
const StringLiteral TripleClose("}}}");
454382

455-
if (T.TagKind == Tag::Kind::None) {
456-
// No more tags, the rest is text.
457-
Tokens.emplace_back(Template.substr(Start));
458-
break;
383+
while (Cursor < Template.size()) {
384+
StringRef TemplateSuffix = Template.substr(Cursor);
385+
StringRef TagOpen, TagClose;
386+
Tag::Kind Kind;
387+
388+
// Determine which tag we've encountered.
389+
if (TemplateSuffix.starts_with(TripleOpen)) {
390+
Kind = Tag::Kind::Triple;
391+
TagOpen = TripleOpen;
392+
TagClose = TripleClose;
393+
} else if (TemplateSuffix.starts_with(Open)) {
394+
Kind = Tag::Kind::Normal;
395+
TagOpen = Open;
396+
TagClose = Close;
397+
} else {
398+
// Not at a tag, continue scanning.
399+
++Cursor;
400+
continue;
459401
}
460402

461-
// Add the text before the tag.
462-
if (T.StartPosition > Start) {
463-
StringRef Text = Template.substr(Start, T.StartPosition - Start);
464-
Tokens.emplace_back(Text);
403+
// Found a tag, first add the preceding text.
404+
if (Cursor > TextStart)
405+
Tokens.emplace_back(Template.slice(TextStart, Cursor));
406+
407+
// Find the closing tag.
408+
size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size());
409+
if (EndPos == StringRef::npos) {
410+
// No closing tag, the rest is text.
411+
Tokens.emplace_back(Template.substr(Cursor));
412+
TextStart = Cursor = Template.size();
413+
break;
465414
}
466415

467-
if (auto NewDelims = processTag(T, Tokens, Ctx)) {
468-
std::tie(Open, Close) = *NewDelims;
416+
// Extract tag content and full match.
417+
size_t ContentStart = Cursor + TagOpen.size();
418+
StringRef Content = Template.substr(ContentStart, EndPos - ContentStart);
419+
StringRef FullMatch =
420+
Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor);
421+
422+
// Process the tag (inlined logic from processTag).
423+
LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content
424+
<< ", Kind: " << tagKindToString(Kind) << "\n");
425+
if (Kind == Tag::Kind::Triple) {
426+
Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx);
427+
} else { // Normal Tag
428+
StringRef Interpolated = Content;
429+
if (!Interpolated.trim().starts_with("=")) {
430+
char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
431+
Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx);
432+
} else { // Set Delimiter
433+
Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx);
434+
StringRef DelimSpec = Interpolated.trim();
435+
DelimSpec = DelimSpec.drop_front(1);
436+
DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
437+
DelimSpec = DelimSpec.trim();
438+
439+
auto [NewOpen, NewClose] = DelimSpec.split(' ');
440+
LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
441+
<< ", NewClose: " << NewClose << "\n");
442+
Open = NewOpen;
443+
Close = NewClose;
444+
}
469445
}
470446

471-
// Move past the tag.
472-
Start = T.StartPosition + T.FullMatch.size();
447+
// Move past the tag for the next iteration.
448+
Cursor += FullMatch.size();
449+
TextStart = Cursor;
473450
}
474451

475-
// Fix up white spaces for:
476-
// - open sections
477-
// - inverted sections
478-
// - close sections
479-
// - comments
480-
//
481-
// This loop attempts to find standalone tokens and tries to trim out
482-
// the surrounding whitespace.
483-
// For example:
484-
// if you have the template string
485-
// {{#section}} \n Example \n{{/section}}
486-
// The output should would be
487-
// For example:
488-
// \n Example \n
452+
// Add any remaining text after the last tag.
453+
if (TextStart < Template.size())
454+
Tokens.emplace_back(Template.substr(TextStart));
455+
456+
// Fix up white spaces for standalone tags.
489457
size_t LastIdx = Tokens.size() - 1;
490458
for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
491459
Token &CurrentToken = Tokens[Idx];
492460
Token::Type CurrentType = CurrentToken.getType();
493-
// Check if token type requires cleanup.
494-
bool RequiresCleanUp = requiresCleanUp(CurrentType);
495-
496-
if (!RequiresCleanUp)
461+
if (!requiresCleanUp(CurrentType))
497462
continue;
498463

499-
// We adjust the token body if there's no text behind or ahead.
500-
// A token is considered to have no text ahead if the right of the previous
501-
// token is a newline followed by spaces.
502-
// A token is considered to have no text behind if the left of the next
503-
// token is spaces followed by a newline.
504-
// eg.
505-
// "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
506464
bool HasTextBehind = hasTextBehind(Idx, Tokens);
507465
bool HasTextAhead = hasTextAhead(Idx, Tokens);
508466

0 commit comments

Comments
 (0)