Skip to content

Commit f1984b1

Browse files
committed
[pseudo] Implement LRGraph
LRGraph is the key component of the clang pseudo parser; it is a deterministic handle-finding finite-state machine, which is used to generate the LR parsing table. Separate from https://reviews.llvm.org/D118196. Differential Revision: https://reviews.llvm.org/D119172
1 parent f30ec8f commit f1984b1

File tree

5 files changed

+494
-0
lines changed

5 files changed

+494
-0
lines changed
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// LR parsers are bottom-up parsers -- they scan the input from left to right,
10+
// and collect the right-hand side of a production rule (called handle) on top
11+
// of the stack, then replace (reduce) the handle with the nonterminal defined
12+
// by the production rule.
13+
//
14+
// This file defines LRGraph, a deterministic handle-finding finite-state
15+
// automaton, which is a key component in LR parsers to recognize any of
16+
// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
17+
// Table) based on the LRGraph.
18+
//
19+
// LRGraph can be constructed for any context-free grammar.
20+
// Even for an LR-ambiguous grammar, we can construct a deterministic FSA, but
21+
// interpretation of the FSA is nondeterministic -- we might be in a state where
22+
// we can continue searching for a handle and identify a handle (called
23+
// shift/reduce conflicts), or identify more than one handle (called
24+
// reduce/reduce conflicts).
25+
//
26+
// LRGraph is a common model for all variants of LR automatons, from the most
27+
// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
28+
// in making decisions.
29+
//===----------------------------------------------------------------------===//
30+
31+
#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
32+
#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
33+
34+
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
35+
#include "llvm/ADT/Hashing.h"
36+
#include <vector>
37+
38+
namespace clang {
39+
namespace syntax {
40+
namespace pseudo {
41+
42+
// An LR item -- a grammar rule with a dot at some position of the body.
43+
// e.g. a production rule A := X Y yields 3 items:
44+
// A := . X Y
45+
// A := X . Y
46+
// A := X Y .
47+
// An item indicates how much of a production rule has been recognized at a
48+
// position (described by dot), for example, A := X . Y indicates that we have
49+
// recognized the X part from the input, and we hope next to see the input
50+
// derivable from Y.
51+
class Item {
52+
public:
53+
// Creates the item for rule ID with the dot at position 0 (nothing recognized
// yet). The rule body length is cached from G so that hasNext() does not need
// the grammar.
static Item start(RuleID ID, const Grammar &G) {
54+
Item I;
55+
I.RID = ID;
56+
I.RuleLength = G.lookupRule(ID).Size;
57+
return I;
58+
}
59+
// Creates a placeholder item that carries only a rule ID; DotPos and
// RuleLength keep their default of 0, so hasNext() is false for it.
// Used for the DenseMap empty/tombstone keys defined later in this header.
static Item sentinel(RuleID ID) {
60+
Item I;
61+
I.RID = ID;
62+
return I;
63+
}
64+
65+
// The rule this item refers to.
RuleID rule() const { return RID; }
66+
// The dot position: the number of body symbols before the dot.
uint8_t dot() const { return DotPos; }
67+
68+
// True if there is at least one body symbol after the dot.
bool hasNext() const { return DotPos < RuleLength; }
69+
// Returns the body symbol immediately after the dot.
// Requires hasNext().
SymbolID next(const Grammar &G) const {
70+
assert(hasNext());
71+
return G.lookupRule(RID).Sequence[DotPos];
72+
}
73+
74+
// Returns a copy of this item with the dot moved one symbol to the right;
// this item itself is not modified. Requires hasNext().
Item advance() const {
75+
assert(hasNext());
76+
Item I = *this;
77+
++I.DotPos;
78+
return I;
79+
}
80+
81+
// Returns a textual form of the item; defined out of line.
std::string dump(const Grammar &G) const;
82+
83+
// Identity is (rule, dot position). RuleLength is deliberately not part of
// the comparison, ordering or hash below.
bool operator==(const Item &I) const {
84+
return DotPos == I.DotPos && RID == I.RID;
85+
}
86+
// Orders items by rule ID first, then by dot position.
bool operator<(const Item &I) const {
87+
return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
88+
}
89+
// Hashes the same fields that operator== compares.
friend llvm::hash_code hash_value(const Item &I) {
90+
return llvm::hash_combine(I.RID, I.DotPos);
91+
}
92+
93+
private:
94+
RuleID RID = 0;
95+
// NOTE(review): the 8-bit fields below assume rule bodies have fewer than
// 256 symbols -- confirm against the limits in Grammar.h.
uint8_t DotPos = 0;
96+
uint8_t RuleLength = 0; // the length of rule body.
97+
};
98+
99+
// A state represents a node in the LR automaton graph. It is an item set, which
100+
// contains all possible rules that the LR parser may be parsing in that state.
101+
//
102+
// Conceptually, if we knew in advance what we're parsing, at any point we're
103+
// partway through parsing a production, sitting on a stack of partially parsed
104+
// productions. But because we don't know, there could be *several* productions
105+
// we're partway through. The set of possibilities is the parser state, and we
106+
// precompute all the transitions between these states.
107+
struct State {
108+
// A full set of items (including non-kernel items) representing the state,
109+
// in a canonical order (see SortByNextSymbol in the cpp file).
110+
std::vector<Item> Items;
111+
112+
// Returns a textual form of the state; defined out of line.
// NOTE(review): Indent presumably controls the leading indentation of each
// emitted line -- confirm in the cpp file.
std::string dump(const Grammar &G, unsigned Indent = 0) const;
113+
};
114+
115+
// LRGraph is a deterministic finite state automaton for LR parsing.
116+
//
117+
// Intuitively, an LR automaton is a transition graph. The graph has a
118+
// collection of nodes, called States. Each state corresponds to a particular
119+
// item set, which represents a condition that could occur during the process of
120+
// parsing a production. Edges are directed from one state to another. Each edge
121+
// is labeled by a grammar symbol (terminal or nonterminal).
122+
//
123+
// LRGraph is used to construct the LR parsing table which is a core
124+
// data-structure driving the LR parser.
125+
class LRGraph {
126+
public:
127+
// StateID is the index in States table.
128+
using StateID = uint16_t;
129+
130+
// Constructs an LR(0) automaton.
// This is the only public way to create an LRGraph; the constructor is
// private.
131+
static LRGraph buildLR0(const Grammar &);
132+
133+
// An edge in the LR graph, it represents a transition in the LR automaton.
134+
// If the parser is at state Src, with a lookahead Label, then it
135+
// transitions to state Dst.
136+
struct Edge {
137+
StateID Src, Dst;
138+
SymbolID Label;
139+
};
140+
141+
// Non-owning views of the graph's states and edges. A StateID indexes into
// states().
llvm::ArrayRef<State> states() const { return States; }
142+
llvm::ArrayRef<Edge> edges() const { return Edges; }
143+
144+
// Returns a textual form of the graph, intended for use in tests; defined
// out of line.
std::string dumpForTests(const Grammar &) const;
145+
146+
private:
147+
// Takes ownership of the already-built state and edge tables.
LRGraph(std::vector<State> States, std::vector<Edge> Edges)
148+
: States(std::move(States)), Edges(std::move(Edges)) {}
149+
150+
std::vector<State> States;
151+
std::vector<Edge> Edges;
152+
};
153+
154+
} // namespace pseudo
155+
} // namespace syntax
156+
} // namespace clang
157+
158+
namespace llvm {
159+
// Support clang::syntax::pseudo::Item as DenseMap keys.
160+
template <> struct DenseMapInfo<clang::syntax::pseudo::Item> {
161+
// The empty and tombstone keys are sentinel items built from the rule IDs
// -1 and -2 (converted to RuleID).
// NOTE(review): this assumes no real rule ever has those IDs -- confirm
// against the RuleID range defined in Grammar.h.
static inline clang::syntax::pseudo::Item getEmptyKey() {
162+
return clang::syntax::pseudo::Item::sentinel(-1);
163+
}
164+
static inline clang::syntax::pseudo::Item getTombstoneKey() {
165+
return clang::syntax::pseudo::Item::sentinel(-2);
166+
}
167+
// Delegates to Item's hash_value(), which hashes the rule ID and dot
// position.
static unsigned getHashValue(const clang::syntax::pseudo::Item &I) {
168+
return hash_value(I);
169+
}
170+
// Delegates to Item::operator==, which compares the rule ID and dot position.
static bool isEqual(const clang::syntax::pseudo::Item &LHS,
171+
const clang::syntax::pseudo::Item &RHS) {
172+
return LHS == RHS;
173+
}
174+
};
175+
} // namespace llvm
176+
177+
#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H

clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS Support)
33
add_clang_library(clangToolingSyntaxPseudo
44
Grammar.cpp
55
GrammarBNF.cpp
6+
LRGraph.cpp
67

78
LINK_LIBS
89
clangBasic

0 commit comments

Comments
 (0)