Skip to content

Commit 30ee1ad

Browse files
authored
Merge pull request nltk#3376 from ekaf/hotfix-3373
Properly initialize Portuguese corpus
2 parents: 80bebde + 7210c8e; commit 30ee1ad

File tree

1 file changed

+13
-5
lines changed

1 file changed

+13
-5
lines changed

nltk/corpus/reader/plaintext.py

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -163,13 +163,21 @@ def __init__(self, *args, **kwargs):
163 163
PlaintextCorpusReader.__init__(self, *args, **kwargs)
164 164

165 165

166-
# FIXME: Is there a better way? How to not hardcode this?
167-
# Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
168-
# override the `sent_tokenizer`.
169 166
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
167+
"""
168+
This class is identical with CategorizedPlaintextCorpusReader,
169+
except that it initializes a Portuguese PunktTokenizer:
170+
171+
>>> from nltk.corpus import machado
172+
>>> print(machado._sent_tokenizer._lang)
173+
portuguese
174+
175+
"""
176+
170 177
def __init__(self, *args, **kwargs):
171-
CategorizedCorpusReader.__init__(self, kwargs)
172-
kwargs["sent_tokenizer"] = PunktTokenizer("portuguese")
178+
CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
179+
# Fixed (@ekaf 2025), new way to invoke Punkt:
180+
self._sent_tokenizer = PunktTokenizer("portuguese")
173 181

174 182

175 183
class EuroparlCorpusReader(PlaintextCorpusReader):

0 commit comments

Comments
 (0)