Skip to content

Commit 85eaecb

Browse files
committed
[pseudo] Check follow-sets instead of tying reduce actions to lookahead tokens.
Previously, the action table stored a reduce action for each lookahead token it should allow. These tokens are the followSet(action.rule.target). In practice, the follow sets are large, so we spend a bunch of time binary searching around all these essentially-duplicate entries to check whether our lookahead token is there. However, the number of reduces for a given state is very small, so we're much better off linear scanning over them and performing a fast check for each. D128318 was an attempt at this, storing a bitmap for each reduce. However, it's even more compact just to use the follow sets directly, as there are fewer nonterminals than (state, rule) pairs. It's also faster. This specialized approach means unbundling Reduce from other actions in LRTable, so it's no longer useful to support it in Action. I suspect Action will soon go away, as we store each kind of action separately. This improves glrParse speed by 42% (3.30 -> 4.69 MB/s). It also reduces LR table size by 59% (343 -> 142kB). Differential Revision: https://reviews.llvm.org/D128472
1 parent c1b07d6 commit 85eaecb

File tree

8 files changed

+260
-143
lines changed

8 files changed

+260
-143
lines changed

clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838

3939
#include "clang-pseudo/grammar/Grammar.h"
4040
#include "llvm/ADT/ArrayRef.h"
41+
#include "llvm/ADT/BitVector.h"
42+
#include "llvm/Support/Capacity.h"
4143
#include <cstdint>
4244
#include <vector>
4345

@@ -62,6 +64,9 @@ class LRTable {
6264

6365
// Action represents the terminal and nonterminal actions; it combines the
6466
// entry of the ACTION and GOTO tables from the LR literature.
67+
//
68+
// FIXME: as we move away from a homogeneous table structure shared between
69+
// action types, this class becomes less useful. Remove it.
6570
class Action {
6671
public:
6772
enum Kind : uint8_t {
@@ -73,8 +78,6 @@ class LRTable {
7378
// A shift is a forward transition, and the value n is the next state that
7479
// the parser is to enter.
7580
Shift,
76-
// Reduce by a rule: pop the state stack.
77-
Reduce,
7881

7982
// NOTE: there are no typical accept actions in the LRtable, accept
8083
// actions are handled specifically in the parser -- if the parser
@@ -91,7 +94,6 @@ class LRTable {
9194

9295
static Action goTo(StateID S) { return Action(GoTo, S); }
9396
static Action shift(StateID S) { return Action(Shift, S); }
94-
static Action reduce(RuleID RID) { return Action(Reduce, RID); }
9597
static Action sentinel() { return Action(Sentinel, 0); }
9698

9799
StateID getShiftState() const {
@@ -102,10 +104,6 @@ class LRTable {
102104
assert(kind() == GoTo);
103105
return Value;
104106
}
105-
RuleID getReduceRule() const {
106-
assert(kind() == Reduce);
107-
return Value;
108-
}
109107
Kind kind() const { return static_cast<Kind>(K); }
110108

111109
bool operator==(const Action &L) const { return opaque() == L.opaque(); }
@@ -123,9 +121,6 @@ class LRTable {
123121
uint16_t Value : ValueBits;
124122
};
125123

126-
// Returns all available actions for the given state on a terminal.
127-
// Expected to be called by LR parsers.
128-
llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
129124
// Returns the state after we reduce a nonterminal.
130125
// Expected to be called by LR parsers.
131126
// REQUIRES: Nonterminal is valid here.
@@ -135,9 +130,26 @@ class LRTable {
135130
// If the terminal is invalid here, returns None.
136131
llvm::Optional<StateID> getShiftState(StateID State, SymbolID Terminal) const;
137132

138-
// Looks up available actions.
139-
// Returns empty if no available actions in the table.
140-
llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
133+
// Returns the possible reductions from a state.
134+
//
135+
// These are not keyed by a lookahead token. Instead, call canFollow() to
136+
// check whether a reduction should apply in the current context:
137+
// for (RuleID R : LR.getReduceRules(S)) {
138+
// if (!LR.canFollow(G.lookupRule(R).Target, NextToken))
139+
// continue;
140+
// // ...apply reduce...
141+
// }
142+
llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
143+
return llvm::makeArrayRef(&Reduces[ReduceOffset[State]],
144+
&Reduces[ReduceOffset[State + 1]]);
145+
}
146+
// Returns whether Terminal can follow Nonterminal in a valid source file.
147+
bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
148+
assert(isToken(Terminal));
149+
assert(isNonterminal(Nonterminal));
150+
return FollowSets.test(tok::NUM_TOKENS * Nonterminal +
151+
symbolToToken(Terminal));
152+
}
141153

142154
// Returns the state from which the LR parser should start to parse the input
143155
// tokens as the given StartSymbol.
@@ -151,9 +163,12 @@ class LRTable {
151163
StateID getStartState(SymbolID StartSymbol) const;
152164

153165
size_t bytes() const {
154-
return sizeof(*this) + Actions.capacity() * sizeof(Action) +
155-
Symbols.capacity() * sizeof(SymbolID) +
156-
StateOffset.capacity() * sizeof(uint32_t);
166+
return sizeof(*this) + llvm::capacity_in_bytes(Actions) +
167+
llvm::capacity_in_bytes(Symbols) +
168+
llvm::capacity_in_bytes(StateOffset) +
169+
llvm::capacity_in_bytes(Reduces) +
170+
llvm::capacity_in_bytes(ReduceOffset) +
171+
llvm::capacity_in_bytes(FollowSets);
157172
}
158173

159174
std::string dumpStatistics() const;
@@ -162,17 +177,25 @@ class LRTable {
162177
// Build a SLR(1) parsing table.
163178
static LRTable buildSLR(const Grammar &G);
164179

165-
class Builder;
180+
struct Builder;
166181
// Represents an entry in the table, used for building the LRTable.
167182
struct Entry {
168183
StateID State;
169184
SymbolID Symbol;
170185
Action Act;
171186
};
187+
struct ReduceEntry {
188+
StateID State;
189+
RuleID Rule;
190+
};
172191
// Build a specified table for testing purposes.
173-
static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
192+
static LRTable buildForTests(const Grammar &G, llvm::ArrayRef<Entry>,
193+
llvm::ArrayRef<ReduceEntry>);
174194

175195
private:
196+
// Looks up actions stored in the generic table.
197+
llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
198+
176199
// Conceptually the LR table is a multimap from (State, SymbolID) => Action.
177200
// Our physical representation is quite different for compactness.
178201

@@ -188,6 +211,17 @@ class LRTable {
188211
std::vector<Action> Actions;
189212
// A sorted table, storing the start state for each target parsing symbol.
190213
std::vector<std::pair<SymbolID, StateID>> StartStates;
214+
215+
// Given a state ID S, the half-open range of Reduces is
216+
// [ReduceOffset[S], ReduceOffset[S+1])
217+
std::vector<uint32_t> ReduceOffset;
218+
std::vector<RuleID> Reduces;
219+
// Conceptually this is a bool[SymbolID][Token], each entry describing whether
220+
// the grammar allows the (nonterminal) symbol to be followed by the token.
221+
//
222+
// This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
223+
// as an index: Nonterminal * NUM_TOKENS + Token.
224+
llvm::BitVector FollowSets;
191225
};
192226
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
193227

clang-tools-extra/pseudo/lib/GLR.cpp

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -251,9 +251,8 @@ class GLRReduce {
251251
private:
252252
// pop walks up the parent chain(s) for a reduction from Head by Rule.
253253
// Once we reach the end, record the bases and sequences.
254-
void pop(const GSS::Node *Head, RuleID RID) {
254+
void pop(const GSS::Node *Head, RuleID RID, const Rule &Rule) {
255255
LLVM_DEBUG(llvm::dbgs() << " Pop " << Params.G.dumpRule(RID) << "\n");
256-
const auto &Rule = Params.G.lookupRule(RID);
257256
Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID};
258257
TempSequence.resize_for_overwrite(Rule.Size);
259258
auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) {
@@ -286,11 +285,11 @@ class GLRReduce {
286285
// In trivial cases, we perform the complete reduce here!
287286
if (popAndPushTrivial())
288287
continue;
289-
for (const auto &A :
290-
Params.Table.getActions((*Heads)[NextPopHead]->State, Lookahead)) {
291-
if (A.kind() != LRTable::Action::Reduce)
292-
continue;
293-
pop((*Heads)[NextPopHead], A.getReduceRule());
288+
for (RuleID RID :
289+
Params.Table.getReduceRules((*Heads)[NextPopHead]->State)) {
290+
const auto &Rule = Params.G.lookupRule(RID);
291+
if (Params.Table.canFollow(Rule.Target, Lookahead))
292+
pop((*Heads)[NextPopHead], RID, Rule);
294293
}
295294
}
296295
}
@@ -367,21 +366,23 @@ class GLRReduce {
367366
// - the head must have only one reduction rule
368367
// - the reduction path must be a straight line (no multiple parents)
369368
// (Roughly this means there's no local ambiguity, so the LR algorithm works).
369+
//
370+
// Returns true if we successfully consumed the next unpopped head.
370371
bool popAndPushTrivial() {
371372
if (!Sequences.empty() || Heads->size() != NextPopHead + 1)
372373
return false;
373374
const GSS::Node *Head = Heads->back();
374375
llvm::Optional<RuleID> RID;
375-
for (auto &A : Params.Table.getActions(Head->State, Lookahead)) {
376-
if (A.kind() != LRTable::Action::Reduce)
377-
continue;
378-
if (RID)
376+
for (RuleID R : Params.Table.getReduceRules(Head->State)) {
377+
if (RID.hasValue())
379378
return false;
380-
RID = A.getReduceRule();
379+
RID = R;
381380
}
382381
if (!RID)
383382
return true; // no reductions available, but we've processed the head!
384383
const auto &Rule = Params.G.lookupRule(*RID);
384+
if (!Params.Table.canFollow(Rule.Target, Lookahead))
385+
return true; // reduction is not available
385386
const GSS::Node *Base = Head;
386387
TempSequence.resize_for_overwrite(Rule.Size);
387388
for (unsigned I = 0; I < Rule.Size; ++I) {

clang-tools-extra/pseudo/lib/grammar/LRTable.cpp

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "clang-pseudo/grammar/Grammar.h"
1111
#include "llvm/ADT/ArrayRef.h"
1212
#include "llvm/ADT/STLExtras.h"
13+
#include "llvm/ADT/StringExtras.h"
1314
#include "llvm/Support/ErrorHandling.h"
1415
#include "llvm/Support/FormatVariadic.h"
1516
#include "llvm/Support/raw_ostream.h"
@@ -21,8 +22,6 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) {
2122
switch (A.kind()) {
2223
case LRTable::Action::Shift:
2324
return OS << llvm::formatv("shift state {0}", A.getShiftState());
24-
case LRTable::Action::Reduce:
25-
return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule());
2625
case LRTable::Action::GoTo:
2726
return OS << llvm::formatv("go to state {0}", A.getGoToState());
2827
case LRTable::Action::Sentinel:
@@ -36,9 +35,11 @@ std::string LRTable::dumpStatistics() const {
3635
Statistics of the LR parsing table:
3736
number of states: {0}
3837
number of actions: {1}
39-
size of the table (bytes): {2}
38+
number of reduces: {2}
39+
size of the table (bytes): {3}
4040
)",
41-
StateOffset.size() - 1, Actions.size(), bytes())
41+
StateOffset.size() - 1, Actions.size(), Reduces.size(),
42+
bytes())
4243
.str();
4344
}
4445

@@ -52,19 +53,27 @@ std::string LRTable::dumpForTests(const Grammar &G) const {
5253
SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
5354
for (auto A : find(S, TokID)) {
5455
if (A.kind() == LRTable::Action::Shift)
55-
OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n",
56+
OS.indent(4) << llvm::formatv("{0}: shift state {1}\n",
5657
G.symbolName(TokID), A.getShiftState());
57-
else if (A.kind() == LRTable::Action::Reduce)
58-
OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n",
59-
G.symbolName(TokID), A.getReduceRule(),
60-
G.dumpRule(A.getReduceRule()));
6158
}
6259
}
60+
for (RuleID R : getReduceRules(S)) {
61+
SymbolID Target = G.lookupRule(R).Target;
62+
std::vector<llvm::StringRef> Terminals;
63+
for (unsigned Terminal = 0; Terminal < NumTerminals; ++Terminal) {
64+
SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
65+
if (canFollow(Target, TokID))
66+
Terminals.push_back(G.symbolName(TokID));
67+
}
68+
OS.indent(4) << llvm::formatv("{0}: reduce by rule {1} '{2}'\n",
69+
llvm::join(Terminals, " "), R,
70+
G.dumpRule(R));
71+
}
6372
for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size();
6473
++NontermID) {
6574
if (find(S, NontermID).empty())
6675
continue;
67-
OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n",
76+
OS.indent(4) << llvm::formatv("{0}: go to state {1}\n",
6877
G.symbolName(NontermID),
6978
getGoToState(S, NontermID));
7079
}
@@ -77,18 +86,12 @@ LRTable::getShiftState(StateID State, SymbolID Terminal) const {
7786
// FIXME: we spend a significant amount of time on misses here.
7887
// We could consider storing a std::bitset for a cheaper test?
7988
assert(pseudo::isToken(Terminal) && "expected terminal symbol!");
80-
for (const auto &Result : getActions(State, Terminal))
89+
for (const auto &Result : find(State, Terminal))
8190
if (Result.kind() == Action::Shift)
8291
return Result.getShiftState(); // unique: no shift/shift conflicts.
8392
return llvm::None;
8493
}
8594

86-
llvm::ArrayRef<LRTable::Action> LRTable::getActions(StateID State,
87-
SymbolID Terminal) const {
88-
assert(pseudo::isToken(Terminal) && "expect terminal symbol!");
89-
return find(State, Terminal);
90-
}
91-
9295
LRTable::StateID LRTable::getGoToState(StateID State,
9396
SymbolID Nonterminal) const {
9497
assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!");

0 commit comments

Comments
 (0)