@@ -73,9 +73,9 @@ def get_states(self) -> set[LR0State]:
7373 states , _ = self ._create_states_and_shift_entries ()
7474 return states
7575
76- def _first (self , symbol : Token ) -> set [Token ]:
76+ def _first (self , * symbols ) -> set [Token ]:
7777 """
78- Computes and returns the FIRST set for the given symbol .
78+ Computes and returns the FIRST set for the given symbols .
7979
8080 The FIRST set of a symbol 'G' is the set of terminal symbols that are
8181 first in the right-hand side of a rule where 'G' is the left-hand side.
@@ -88,7 +88,19 @@ def _first(self, symbol: Token) -> set[Token]:
8888 where M is either terminal or non-terminal and T is non-terminal.
8989 'a' would be included in the FIRST set because if rule 4 is substituted in
9090 rule 3, 'a' (which is a terminal) could be derived from 'G'.
91+
92+ The computation of the FIRST set looks very simple: if symbols = X Y Z, it seems
93+ as if Y and Z can be ignored and FIRST(X) is the only thing that matters.
94+ But consider a grammar where X -> Y and Y -> ε. Because Y can produce the empty
95+ string - and therefore X can produce the empty string - we find that FIRST(XYZ)
96+ must include FIRST(Z). Therefore, in computing FIRST sets we must keep track of
97+ which symbols can produce the empty string.
9198 """
99+ if len (symbols ) == 0 :
100+ return set ()
101+
102+ symbol = symbols [0 ]
103+
92104 if symbol .is_terminal :
93105 return {symbol }
94106
@@ -98,10 +110,14 @@ def _first(self, symbol: Token) -> set[Token]:
98110 if rule .lhs != symbol :
99111 continue
100112
113+ if rule .has_null_rhs () and len (symbols ) > 1 :
114+ _set |= self ._first (symbols [1 :])
115+ continue
116+
101117 if rule .rhs [0 ].is_terminal :
102118 _set .add (rule .rhs [0 ])
103119 elif rule .rhs_len == 1 :
104- _set |= self ._first (rule .rhs [ 0 ] )
120+ _set |= self ._first (* rule .rhs )
105121
106122 return _set
107123
0 commit comments