Merge pull request nltk#3376 from ekaf/hotfix-3373

stevenbird · web-flow · commit 30ee1ad5e23d · 2025-03-08T16:50:39.000+09:30
Properly initialize Portuguese corpus
diff --git a/nltk/corpus/reader/plaintext.py b/nltk/corpus/reader/plaintext.py
@@ -163,13 +163,21 @@ def __init__(self, *args, **kwargs):
         PlaintextCorpusReader.__init__(self, *args, **kwargs)
 
 
-# FIXME: Is there a better way? How to not hardcode this?
-#       Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
-#       override the `sent_tokenizer`.
 class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
+    """
+    This class is identical to CategorizedPlaintextCorpusReader,
+    except that it initializes a Portuguese PunktTokenizer:
+
+    >>> from nltk.corpus import machado
+    >>> print(machado._sent_tokenizer._lang)
+    portuguese
+
+    """
+
     def __init__(self, *args, **kwargs):
-        CategorizedCorpusReader.__init__(self, kwargs)
-        kwargs["sent_tokenizer"] = PunktTokenizer("portuguese")
+        CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
+        # Fixed (@ekaf 2025), new way to invoke Punkt:
+        self._sent_tokenizer = PunktTokenizer("portuguese")
 
 
 class EuroparlCorpusReader(PlaintextCorpusReader):