Skip to content

Commit 007cf75

Browse files
authored
Merge pull request nltk#3371 from josecols/feature/cnf-mixed-rules
Add support for mixed rules conversion into Chomsky Normal Form
2 parents 30ee1ad + bcb9fea commit 007cf75

File tree

3 files changed

+66
-11
lines changed

3 files changed

+66
-11
lines changed

AUTHORS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@
307307
- Samer Masterson <https://github.com/samertm>
308308
- William LaCroix <https://github.com/WilliamPLaCroix>
309309
- Peter de Blanc <https://github.com/pdeblanc>
310+
- Jose Cols <https://github.com/josecols>
310311

311312
## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:
312313

nltk/grammar.py

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -750,20 +750,13 @@ def chomsky_normal_form(self, new_token_padding="@$@", flexible=False):
750750
"Grammar has Empty rules. " "Cannot deal with them at the moment"
751751
)
752752

753-
# check for mixed rules
754-
for rule in self.productions():
755-
if rule.is_lexical() and len(rule.rhs()) > 1:
756-
raise ValueError(
757-
f"Cannot handled mixed rule {rule.lhs()} => {rule.rhs()}"
758-
)
759-
760753
step1 = CFG.eliminate_start(self)
761754
step2 = CFG.binarize(step1, new_token_padding)
755+
step3 = CFG.remove_mixed_rules(step2, new_token_padding)
762756
if flexible:
763-
return step2
764-
step3 = CFG.remove_unitary_rules(step2)
765-
step4 = CFG(step3.start(), list(set(step3.productions())))
766-
return step4
757+
return step3
758+
step4 = CFG.remove_unitary_rules(step3)
759+
return CFG(step4.start(), list(set(step4.productions())))
767760

768761
@classmethod
769762
def remove_unitary_rules(cls, grammar):
@@ -845,6 +838,48 @@ def eliminate_start(cls, grammar):
845838
return n_grammar
846839
return grammar
847840

841+
@classmethod
def remove_mixed_rules(cls, grammar, padding="@$@"):
    """
    Replace every terminal that appears in a mixed rule (a rule whose
    right-hand side has more than one symbol and contains at least one
    terminal) by a dummy non-terminal that rewrites to that terminal.

    Example::

        Original:
            A => term B
        After Conversion:
            A => TERM@$@TERM B
            TERM@$@TERM => term

    Each distinct terminal gets exactly one dummy non-terminal and exactly
    one ``dummy => terminal`` production, however many mixed rules use it.
    If the generated name is already taken -- by a non-terminal of the
    input grammar or by the dummy of another terminal (e.g. ``'a'`` and
    ``'A'``, or two punctuation terminals whose sanitized name is empty)
    -- a numeric suffix is appended so distinct symbols are never merged.

    :param grammar: the grammar whose mixed rules should be rewritten
    :param padding: separator used when building dummy non-terminal names
    :return: a new ``CFG`` with the same start symbol and no mixed rules
    """
    productions = grammar.productions()

    # Non-terminals already present in the grammar; a dummy must not reuse
    # one of these, otherwise unrelated rules would be merged.
    taken = set()
    for rule in productions:
        taken.add(rule.lhs())
        taken.update(sym for sym in rule.rhs() if is_nonterminal(sym))

    result = []
    dummy_nonterms = {}
    for rule in productions:
        # Purely non-lexical rules and single-symbol rules are not mixed.
        if not rule.is_lexical() or len(rule.rhs()) <= 1:
            result.append(rule)
            continue

        new_rhs = []
        for item in rule.rhs():
            if is_nonterminal(item):
                new_rhs.append(item)
                continue

            if item not in dummy_nonterms:
                # str() keeps string terminals unchanged and avoids an
                # AttributeError for terminals that are not strings.
                sanitized_term = "".join(
                    _STANDARD_NONTERM_RE.findall(str(item).upper())
                )
                base_symbol = f"{sanitized_term}{padding}{sanitized_term}"
                dummy = Nonterminal(base_symbol)
                suffix = 1
                while dummy in taken:
                    dummy = Nonterminal(f"{base_symbol}{padding}{suffix}")
                    suffix += 1
                taken.add(dummy)
                dummy_nonterms[item] = dummy
                # Emit the lexical production once, on first sight of the
                # terminal, so no duplicate productions are created.
                result.append(Production(dummy, rhs=[item]))

            new_rhs.append(dummy_nonterms[item])

        result.append(Production(rule.lhs(), new_rhs))

    return CFG(grammar.start(), result)
882+
848883
def __repr__(self):
    """Return a short summary stating how many productions the grammar has."""
    production_count = len(self._productions)
    return "<Grammar with %d productions>" % production_count
850885

nltk/test/grammar.doctest

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,22 @@ Grammars can contain both empty strings and empty productions:
6767
... """)
6868
>>> list(generate(grammar))
6969
[['a', 'b'], ['a']]
70+
71+
Grammars with mixed rules can be converted into Chomsky Normal Form:
72+
73+
>>> from nltk import CFG
74+
>>> grammar = CFG.fromstring("""
75+
... S -> NP VP
76+
... PP -> P NP
77+
... NP -> NP PP P
78+
... NP -> 'the' Nom | 'a' Nom
79+
... VP -> V NP | VP PP
80+
... Det -> 'a' | 'the'
81+
... Nom -> 'dog' | 'cat'
82+
... V -> 'chased' | 'sat'
83+
... P -> 'on' | 'in'
84+
... """)
85+
>>> grammar
86+
<Grammar with 15 productions>
87+
>>> grammar.chomsky_normal_form()
88+
<Grammar with 18 productions>

0 commit comments

Comments
 (0)