Skip to content

Commit fe932a8

Browse files
committed
[pseudo] Add first and follow set computation in Grammar.
These will be used when building parsing table for LR parsers. Separate from https://reviews.llvm.org/D118196. Differential Revision: https://reviews.llvm.org/D118990
1 parent 0e4b214 commit fe932a8

File tree

3 files changed

+177
-15
lines changed

3 files changed

+177
-15
lines changed

clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
// non-terminal or terminal, identified by a SymbolID.
2121
//
2222
// Notions about the BNF grammar:
23-
// - "_" is the augmented symbol, formed by start symbols.
23+
// - "_" is the start symbol of the augmented grammar;
2424
// - single-line comment is supported, starting with a #
2525
// - A rule describes how a nonterminal (left side of :=) is constructed, and
2626
// it is *per line* in the grammar file
@@ -38,6 +38,7 @@
3838

3939
#include "clang/Basic/TokenKinds.h"
4040
#include "llvm/ADT/ArrayRef.h"
41+
#include "llvm/ADT/DenseSet.h"
4142
#include "llvm/ADT/StringRef.h"
4243
#include <cstdint>
4344
#include <vector>
@@ -110,13 +111,16 @@ struct GrammarTable;
110111
// It is a building block for constructing a table-based parser.
111112
class Grammar {
112113
public:
113-
explicit Grammar(std::unique_ptr<GrammarTable> T) : T(std::move(T)) {}
114+
explicit Grammar(std::unique_ptr<GrammarTable>);
114115

115116
// Parses grammar from a BNF file.
116117
// Diagnostics emitted during parsing are stored in Diags.
117118
static std::unique_ptr<Grammar> parseBNF(llvm::StringRef BNF,
118119
std::vector<std::string> &Diags);
119120

121+
// Returns the SymbolID of the start symbol '_'.
122+
SymbolID startSymbol() const { return StartSymbol; };
123+
120124
// Returns all rules of the given non-terminal symbol.
121125
llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
122126
const Rule &lookupRule(RuleID RID) const;
@@ -136,7 +140,15 @@ class Grammar {
136140

137141
private:
138142
std::unique_ptr<GrammarTable> T;
143+
// The start symbol '_' of the augmented grammar.
144+
SymbolID StartSymbol;
139145
};
146+
// For each nonterminal X, computes the set of terminals that begin strings
147+
// derived from X. (Known as FIRST sets in grammar-based parsers).
148+
std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
149+
// For each nonterminal X, computes the set of terminals that could immediately
150+
// follow X. (Known as FOLLOW sets in grammar-based parsers).
151+
std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
140152

141153
// Storage for the underlying data of the Grammar.
142154
// It can be constructed dynamically (from compiling BNF file) or statically

clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
10+
#include "clang/Basic/TokenKinds.h"
1011
#include "llvm/ADT/ArrayRef.h"
1112
#include "llvm/ADT/STLExtras.h"
1213
#include "llvm/ADT/StringRef.h"
@@ -23,6 +24,16 @@ Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
2324
llvm::copy(Sequence, this->Sequence);
2425
}
2526

27+
Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
  // Locate the start symbol, which is always spelled "_", by binary search.
  // The search relies on T->Nonterminals being partitioned by name around "_".
  const auto &Nonterminals = T->Nonterminals;
  auto SortsBeforeStart = [](const GrammarTable::Nonterminal &NT) {
    return NT.Name < "_";
  };
  auto Start = llvm::partition_point(Nonterminals, SortsBeforeStart);
  assert(Start != Nonterminals.end() && Start->Name == "_" &&
         "symbol _ must exist in the grammar!");
  // The SymbolID of a nonterminal is its index in the nonterminal table.
  StartSymbol = Start - Nonterminals.begin();
}
36+
2637
llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
2738
assert(isNonterminal(SID));
2839
const auto &R = T->Nonterminals[SID].RuleRange;
@@ -72,6 +83,86 @@ std::string Grammar::dump() const {
7283
return OS.str();
7384
}
7485

86+
std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
87+
std::vector<llvm::DenseSet<SymbolID>> FirstSets(
88+
G.table().Nonterminals.size());
89+
auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
90+
assert(isNonterminal(Target));
91+
if (isToken(First))
92+
return FirstSets[Target].insert(First).second;
93+
bool Changed = false;
94+
for (SymbolID SID : FirstSets[First])
95+
Changed |= FirstSets[Target].insert(SID).second;
96+
return Changed;
97+
};
98+
99+
// A rule S := T ... implies elements in FIRST(S):
100+
// - if T is a terminal, FIRST(S) contains T
101+
// - if T is a nonterminal, FIRST(S) contains FIRST(T)
102+
// Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
103+
// end up being incomplete.
104+
// We iterate until we hit a fixed point.
105+
// (This isn't particularly efficient, but table building isn't on the
106+
// critical path).
107+
bool Changed = true;
108+
while (Changed) {
109+
Changed = false;
110+
for (const auto &R : G.table().Rules)
111+
// We only need to consider the first element because symbols are
112+
// non-nullable.
113+
Changed |= ExpandFirstSet(R.Target, R.seq().front());
114+
}
115+
return FirstSets;
116+
}
117+
118+
std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
119+
auto FirstSets = firstSets(G);
120+
std::vector<llvm::DenseSet<SymbolID>> FollowSets(
121+
G.table().Nonterminals.size());
122+
// Expand the follow set of a non-terminal symbol Y by adding all from the
123+
// given symbol set.
124+
auto ExpandFollowSet = [&FollowSets](SymbolID Y,
125+
const llvm::DenseSet<SymbolID> &ToAdd) {
126+
assert(isNonterminal(Y));
127+
bool Changed = false;
128+
for (SymbolID F : ToAdd)
129+
Changed |= FollowSets[Y].insert(F).second;
130+
return Changed;
131+
};
132+
// Follow sets is computed based on the following 3 rules, the computation
133+
// is completed at a fixed point where there is no more new symbols can be
134+
// added to any of the follow sets.
135+
//
136+
// Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol.
137+
FollowSets[G.startSymbol()].insert(tokenSymbol(tok::eof));
138+
bool Changed = true;
139+
while (Changed) {
140+
Changed = false;
141+
for (const auto &R : G.table().Rules) {
142+
// Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
143+
// FOLLOW(Y).
144+
for (size_t i = 0; i + 1 < R.seq().size(); ++i) {
145+
if (isToken(R.seq()[i]))
146+
continue;
147+
// We only need to consider the next symbol because symbols are
148+
// non-nullable.
149+
SymbolID Next = R.seq()[i + 1];
150+
if (isToken(Next))
151+
// First set for a terminal is itself.
152+
Changed |= ExpandFollowSet(R.seq()[i], {Next});
153+
else
154+
Changed |= ExpandFollowSet(R.seq()[i], FirstSets[Next]);
155+
}
156+
// Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
157+
// FOLLOW(Z).
158+
SymbolID Z = R.seq().back();
159+
if (isNonterminal(Z))
160+
Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
161+
}
162+
}
163+
return FollowSets;
164+
}
165+
75166
} // namespace pseudo
76167
} // namespace syntax
77168
} // namespace clang

clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp

Lines changed: 72 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ namespace {
1919
using testing::AllOf;
2020
using testing::ElementsAre;
2121
using testing::IsEmpty;
22+
using testing::Pair;
2223
using testing::UnorderedElementsAre;
2324

2425
MATCHER_P(TargetID, SID, "") { return arg.Target == SID; }
@@ -33,7 +34,7 @@ class GrammarTest : public ::testing::Test {
3334
G = Grammar::parseBNF(BNF, Diags);
3435
}
3536

36-
SymbolID lookup(llvm::StringRef Name) const {
37+
SymbolID id(llvm::StringRef Name) const {
3738
for (unsigned I = 0; I < NumTerminals; ++I)
3839
if (G->table().Terminals[I] == Name)
3940
return tokenSymbol(static_cast<tok::TokenKind>(I));
@@ -50,31 +51,28 @@ class GrammarTest : public ::testing::Test {
5051
};
5152

5253
TEST_F(GrammarTest, Basic) {
53-
build("expression := IDENTIFIER + expression # comment");
54+
build("_ := IDENTIFIER + _ # comment");
5455
EXPECT_THAT(Diags, IsEmpty());
5556

5657
auto ExpectedRule =
57-
AllOf(TargetID(lookup("expression")),
58-
Sequence(lookup("IDENTIFIER"), lookup("+"), lookup("expression")));
59-
auto ExpressionID = lookup("expression");
60-
EXPECT_EQ(G->symbolName(ExpressionID), "expression");
61-
EXPECT_THAT(G->rulesFor(ExpressionID), UnorderedElementsAre(ExpectedRule));
58+
AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_")));
59+
EXPECT_EQ(G->symbolName(id("_")), "_");
60+
EXPECT_THAT(G->rulesFor(id("_")), UnorderedElementsAre(ExpectedRule));
6261
const auto &Rule = G->lookupRule(/*RID=*/0);
6362
EXPECT_THAT(Rule, ExpectedRule);
6463
EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER");
6564
EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+");
66-
EXPECT_THAT(G->symbolName(Rule.seq()[2]), "expression");
65+
EXPECT_THAT(G->symbolName(Rule.seq()[2]), "_");
6766
}
6867

6968
TEST_F(GrammarTest, EliminatedOptional) {
7069
build("_ := CONST_opt INT ;_opt");
7170
EXPECT_THAT(Diags, IsEmpty());
7271
EXPECT_THAT(G->table().Rules,
73-
UnorderedElementsAre(
74-
Sequence(lookup("INT")),
75-
Sequence(lookup("CONST"), lookup("INT")),
76-
Sequence(lookup("CONST"), lookup("INT"), lookup(";")),
77-
Sequence(lookup("INT"), lookup(";"))));
72+
UnorderedElementsAre(Sequence(id("INT")),
73+
Sequence(id("CONST"), id("INT")),
74+
Sequence(id("CONST"), id("INT"), id(";")),
75+
Sequence(id("INT"), id(";"))));
7876
}
7977

8078
TEST_F(GrammarTest, Diagnostics) {
@@ -87,6 +85,7 @@ TEST_F(GrammarTest, Diagnostics) {
8785
invalid
8886
)cpp");
8987

88+
EXPECT_EQ(G->startSymbol(), id("_"));
9089
EXPECT_THAT(Diags, UnorderedElementsAre(
9190
"Rule '_ := ,_opt' has a nullable RHS",
9291
"Rule 'null := ' has a nullable RHS",
@@ -96,6 +95,66 @@ TEST_F(GrammarTest, Diagnostics) {
9695
"No rules for nonterminal: IDENFIFIE"));
9796
}
9897

98+
// Checks firstSets() and followSets() against hand-computed expectations for
// two small grammars: a left-recursive expression grammar and a
// right-recursive decl-specifier-seq grammar.
TEST_F(GrammarTest, FirstAndFollowSets) {
99+
build(
100+
R"bnf(
101+
_ := expr
102+
expr := expr - term
103+
expr := term
104+
term := IDENTIFIER
105+
term := ( expr )
106+
)bnf");
107+
ASSERT_TRUE(Diags.empty());
// Pairs every set with its index in the input vector, so the expectations
// below can name the nonterminal each set belongs to via id().
108+
auto ToPairs = [](std::vector<llvm::DenseSet<SymbolID>> Input) {
109+
std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets;
110+
for (SymbolID ID = 0; ID < Input.size(); ++ID)
111+
Sets.emplace_back(ID, std::move(Input[ID]));
112+
return Sets;
113+
};
114+
115+
EXPECT_THAT(
116+
ToPairs(firstSets(*G)),
117+
UnorderedElementsAre(
118+
Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
119+
Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
120+
Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("(")))));
121+
EXPECT_THAT(
122+
ToPairs(followSets(*G)),
123+
UnorderedElementsAre(
124+
Pair(id("_"), UnorderedElementsAre(id("EOF"))),
125+
Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))),
126+
Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")")))));
127+
// Second grammar: a sequence built by right recursion, where one
// decl-specifier may be directly followed by another.
128+
build(R"bnf(
129+
# A simplfied C++ decl-specifier-seq.
130+
_ := decl-specifier-seq
131+
decl-specifier-seq := decl-specifier decl-specifier-seq
132+
decl-specifier-seq := decl-specifier
133+
decl-specifier := simple-type-specifier
134+
decl-specifier := INLINE
135+
simple-type-specifier := INT
136+
)bnf");
137+
ASSERT_TRUE(Diags.empty());
138+
EXPECT_THAT(
139+
ToPairs(firstSets(*G)),
140+
UnorderedElementsAre(
141+
Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))),
142+
Pair(id("decl-specifier-seq"),
143+
UnorderedElementsAre(id("INLINE"), id("INT"))),
144+
Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))),
145+
Pair(id("decl-specifier"),
146+
UnorderedElementsAre(id("INLINE"), id("INT")))));
147+
EXPECT_THAT(
148+
ToPairs(followSets(*G)),
149+
UnorderedElementsAre(
150+
Pair(id("_"), UnorderedElementsAre(id("EOF"))),
151+
Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))),
152+
Pair(id("decl-specifier"),
153+
UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))),
154+
Pair(id("simple-type-specifier"),
155+
UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF")))));
156+
}
157+
99158
} // namespace
100159
} // namespace pseudo
101160
} // namespace syntax

0 commit comments

Comments
 (0)