|
| 1 | +//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// |
| 9 | +// LR parsers are bottom-up parsers -- they scan the input from left to right, |
| 10 | +// and collect the right-hand side of a production rule (called handle) on top |
| 11 | +// of the stack, then replace (reduce) the handle with the nonterminal defined |
| 12 | +// by the production rule. |
| 13 | +// |
| 14 | +// This file defines LRGraph, a deterministic handle-finding finite-state |
| 15 | +// automaton, which is a key component in LR parsers to recognize any of the |
| 16 | +// handles in the grammar efficiently. We build the LR table (ACTION and GOTO |
| 17 | +// Table) based on the LRGraph. |
| 18 | +// |
| 19 | +// LRGraph can be constructed for any context-free grammar. |
| 20 | +// Even for an LR-ambiguous grammar, we can construct a deterministic FSA, but |
| 21 | +// interpretation of the FSA is nondeterministic -- we might be in a state |
| 22 | +// where we can either continue searching for a handle or identify a handle |
| 23 | +// (called shift/reduce conflicts), or identify more than one handle (called |
| 24 | +// reduce/reduce conflicts). |
| 25 | +// |
| 26 | +// LRGraph is a common model for all variants of LR automatons, from the most |
| 27 | +// basic one, LR(0), to the more powerful SLR(1) and LR(1), which use a |
| 28 | +// one-token lookahead in making decisions. |
| 29 | +//===----------------------------------------------------------------------===// |
| 30 | + |
| 31 | +#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H |
| 32 | +#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H |
| 33 | + |
| 34 | +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" |
| 35 | +#include "llvm/ADT/Hashing.h" |
| 36 | +#include <vector> |
| 37 | + |
| 38 | +namespace clang { |
| 39 | +namespace syntax { |
| 40 | +namespace pseudo { |
| 41 | + |
| 42 | +// An LR item -- a grammar rule with a dot at some position of the body. |
| 43 | +// e.g. a production rule A := X Y yields 3 items: |
| 44 | +// A := . X Y |
| 45 | +// A := X . Y |
| 46 | +// A := X Y . |
| 47 | +// An item indicates how much of a production rule has been recognized at a |
| 48 | +// position (described by dot), for example, A := X . Y indicates that we have |
| 49 | +// recognized the X part from the input, and we hope next to see the input |
| 50 | +// derivable from Y. |
| 51 | +class Item { |
| 52 | +public: |
| 53 | + static Item start(RuleID ID, const Grammar &G) { |
| 54 | + Item I; |
| 55 | + I.RID = ID; |
| 56 | + I.RuleLength = G.lookupRule(ID).Size; |
| 57 | + return I; |
| 58 | + } |
| 59 | + static Item sentinel(RuleID ID) { |
| 60 | + Item I; |
| 61 | + I.RID = ID; |
| 62 | + return I; |
| 63 | + } |
| 64 | + |
| 65 | + RuleID rule() const { return RID; } |
| 66 | + uint8_t dot() const { return DotPos; } |
| 67 | + |
| 68 | + bool hasNext() const { return DotPos < RuleLength; } |
| 69 | + SymbolID next(const Grammar &G) const { |
| 70 | + assert(hasNext()); |
| 71 | + return G.lookupRule(RID).Sequence[DotPos]; |
| 72 | + } |
| 73 | + |
| 74 | + Item advance() const { |
| 75 | + assert(hasNext()); |
| 76 | + Item I = *this; |
| 77 | + ++I.DotPos; |
| 78 | + return I; |
| 79 | + } |
| 80 | + |
| 81 | + std::string dump(const Grammar &G) const; |
| 82 | + |
| 83 | + bool operator==(const Item &I) const { |
| 84 | + return DotPos == I.DotPos && RID == I.RID; |
| 85 | + } |
| 86 | + bool operator<(const Item &I) const { |
| 87 | + return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos); |
| 88 | + } |
| 89 | + friend llvm::hash_code hash_value(const Item &I) { |
| 90 | + return llvm::hash_combine(I.RID, I.DotPos); |
| 91 | + } |
| 92 | + |
| 93 | +private: |
| 94 | + RuleID RID = 0; |
| 95 | + uint8_t DotPos = 0; |
| 96 | + uint8_t RuleLength = 0; // the length of rule body. |
| 97 | +}; |
| 98 | + |
| 99 | +// A state represents a node in the LR automaton graph. It is an item set, which |
| 100 | +// contains all possible rules that the LR parser may be parsing in that state. |
| 101 | +// |
| 102 | +// Conceptually, If we knew in advance what we're parsing, at any point we're |
| 103 | +// partway through parsing a production, sitting on a stack of partially parsed |
| 104 | +// productions. But because we don't know, there could be *several* productions |
| 105 | +// we're partway through. The set of possibilities is the parser state, and we |
| 106 | +// precompute all the transitions between these states. |
| 107 | +struct State { |
| 108 | + // A full set of items (including non-kernel items) representing the state, |
| 109 | + // in a canonical order (see SortByNextSymbol in the cpp file). |
| 110 | + std::vector<Item> Items; |
| 111 | + |
| 112 | + std::string dump(const Grammar &G, unsigned Indent = 0) const; |
| 113 | +}; |
| 114 | + |
| 115 | +// LRGraph is a deterministic finite state automaton for LR parsing. |
| 116 | +// |
| 117 | +// Intuitively, an LR automaton is a transition graph. The graph has a |
| 118 | +// collection of nodes, called States. Each state corresponds to a particular |
| 119 | +// item set, which represents a condition that could occur duing the process of |
| 120 | +// parsing a production. Edges are directed from one state to another. Each edge |
| 121 | +// is labeled by a grammar symbol (terminal or nonterminal). |
| 122 | +// |
| 123 | +// LRGraph is used to construct the LR parsing table which is a core |
| 124 | +// data-structure driving the LR parser. |
| 125 | +class LRGraph { |
| 126 | +public: |
| 127 | + // StateID is the index in States table. |
| 128 | + using StateID = uint16_t; |
| 129 | + |
| 130 | + // Constructs an LR(0) automaton. |
| 131 | + static LRGraph buildLR0(const Grammar &); |
| 132 | + |
| 133 | + // An edge in the LR graph, it represents a transition in the LR automaton. |
| 134 | + // If the parser is at state Src, with a lookahead Label, then it |
| 135 | + // transits to state Dst. |
| 136 | + struct Edge { |
| 137 | + StateID Src, Dst; |
| 138 | + SymbolID Label; |
| 139 | + }; |
| 140 | + |
| 141 | + llvm::ArrayRef<State> states() const { return States; } |
| 142 | + llvm::ArrayRef<Edge> edges() const { return Edges; } |
| 143 | + |
| 144 | + std::string dumpForTests(const Grammar &) const; |
| 145 | + |
| 146 | +private: |
| 147 | + LRGraph(std::vector<State> States, std::vector<Edge> Edges) |
| 148 | + : States(std::move(States)), Edges(std::move(Edges)) {} |
| 149 | + |
| 150 | + std::vector<State> States; |
| 151 | + std::vector<Edge> Edges; |
| 152 | +}; |
| 153 | + |
| 154 | +} // namespace pseudo |
| 155 | +} // namespace syntax |
| 156 | +} // namespace clang |
| 157 | + |
| 158 | +namespace llvm { |
| 159 | +// Support clang::syntax::pseudo::Item as DenseMap keys. |
| 160 | +template <> struct DenseMapInfo<clang::syntax::pseudo::Item> { |
| 161 | + static inline clang::syntax::pseudo::Item getEmptyKey() { |
| 162 | + return clang::syntax::pseudo::Item::sentinel(-1); |
| 163 | + } |
| 164 | + static inline clang::syntax::pseudo::Item getTombstoneKey() { |
| 165 | + return clang::syntax::pseudo::Item::sentinel(-2); |
| 166 | + } |
| 167 | + static unsigned getHashValue(const clang::syntax::pseudo::Item &I) { |
| 168 | + return hash_value(I); |
| 169 | + } |
| 170 | + static bool isEqual(const clang::syntax::pseudo::Item &LHS, |
| 171 | + const clang::syntax::pseudo::Item &RHS) { |
| 172 | + return LHS == RHS; |
| 173 | + } |
| 174 | +}; |
| 175 | +} // namespace llvm |
| 176 | + |
| 177 | +#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H |
0 commit comments