Add support for mixed rules conversion into Chomsky Normal Form

josecols · josecols · commit 5c9f2bfbc850 · 2025-02-24T15:44:57.000-08:00
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -307,6 +307,7 @@
 - Samer Masterson <https://github.com/samertm>
 - William LaCroix <https://github.com/WilliamPLaCroix>
 - Peter de Blanc <https://github.com/pdeblanc>
+- Jose Cols <https://github.com/josecols>
 
 ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:
 
diff --git a/nltk/grammar.py b/nltk/grammar.py
@@ -750,20 +750,13 @@ def chomsky_normal_form(self, new_token_padding="@$@", flexible=False):
                 "Grammar has Empty rules. " "Cannot deal with them at the moment"
             )
 
-        # check for mixed rules
-        for rule in self.productions():
-            if rule.is_lexical() and len(rule.rhs()) > 1:
-                raise ValueError(
-                    f"Cannot handled mixed rule {rule.lhs()} => {rule.rhs()}"
-                )
-
         step1 = CFG.eliminate_start(self)
         step2 = CFG.binarize(step1, new_token_padding)
+        step3 = CFG.remove_mixed_rules(step2, new_token_padding)
         if flexible:
-            return step2
-        step3 = CFG.remove_unitary_rules(step2)
-        step4 = CFG(step3.start(), list(set(step3.productions())))
-        return step4
+            return step3
+        step4 = CFG.remove_unitary_rules(step3)
+        return CFG(step4.start(), list(set(step4.productions())))
 
     @classmethod
     def remove_unitary_rules(cls, grammar):
@@ -845,6 +838,48 @@ def eliminate_start(cls, grammar):
             return n_grammar
         return grammar
 
+    @classmethod
+    def remove_mixed_rules(cls, grammar, padding="@$@"):
+        """
+        Replace every terminal in a mixed rule (a lexical rule whose
+        right-hand side has more than one symbol) with a dummy non-terminal
+        that rewrites to it; each dummy production is emitted only once.
+        Returns a new CFG with the same start symbol. Example::
+
+            Original:
+                A => term B
+            After Conversion:
+                A => TERM@$@TERM B
+                TERM@$@TERM => term
+        """
+        result = []
+        dummy_nonterms = {}  # terminal -> dummy non-terminal, shared across rules
+        for rule in grammar.productions():
+            if not rule.is_lexical() or len(rule.rhs()) <= 1:
+                result.append(rule)  # nothing to rewrite
+                continue
+
+            new_rhs = []
+            for item in rule.rhs():
+                if is_nonterminal(item):
+                    new_rhs.append(item)
+                else:
+                    if item not in dummy_nonterms:
+                        sanitized_term = "".join(
+                            _STANDARD_NONTERM_RE.findall(item.upper())
+                        )
+                        dummy_nonterm_symbol = (
+                            f"{sanitized_term}{padding}{sanitized_term}"
+                        )
+                        dummy_nonterms[item] = Nonterminal(dummy_nonterm_symbol)
+                        # Emit the lexical production only on first sight, so a
+                        # terminal reused in several rules adds no duplicates.
+                        result.append(Production(dummy_nonterms[item], rhs=[item]))
+                    new_rhs.append(dummy_nonterms[item])
+            result.append(Production(rule.lhs(), new_rhs))
+
+        return CFG(grammar.start(), result)
+
     def __repr__(self):
         return "<Grammar with %d productions>" % len(self._productions)
 
diff --git a/nltk/test/grammar.doctest b/nltk/test/grammar.doctest
@@ -67,3 +67,22 @@ Grammars can contain both empty strings and empty productions:
     ... """)
     >>> list(generate(grammar))
     [['a', 'b'], ['a']]
+
+Grammars with mixed rules can be converted into Chomsky Normal Form:
+
+    >>> from nltk import CFG
+    >>> grammar = CFG.fromstring("""
+    ... S -> NP VP
+    ... PP -> P NP
+    ... NP -> NP PP P
+    ... NP -> 'the' Nom | 'a' Nom
+    ... VP -> V NP | VP PP
+    ... Det -> 'a' | 'the'
+    ... Nom -> 'dog' | 'cat'
+    ... V -> 'chased' | 'sat'
+    ... P -> 'on' | 'in'
+    ... """)
+    >>> grammar
+    <Grammar with 15 productions>
+    >>> grammar.chomsky_normal_form()
+    <Grammar with 18 productions>