Skip to content

Commit 4b99fe5

Browse files
committed
Ensure Portuguese _sent_tokenizer
1 parent 92ad091 commit 4b99fe5

File tree

1 file changed

+2
-4
lines changed

1 file changed

+2
-4
lines changed

nltk/corpus/reader/plaintext.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -163,13 +163,11 @@ def __init__(self, *args, **kwargs):
163163
PlaintextCorpusReader.__init__(self, *args, **kwargs)
164164

165165

166-
# FIXME: Is there a better way? How to not hardcode this?
167-
# Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
168-
# override the `sent_tokenizer`.
169166
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
170167
def __init__(self, *args, **kwargs):
171168
CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
172-
kwargs["sent_tokenizer"] = PunktTokenizer("portuguese")
169+
# Fixed (@ekaf 2025): set the tokenizer on the instance; updating kwargs after the parent __init__ call had no effect.
170+
self._sent_tokenizer = PunktTokenizer("portuguese")
173171

174172

175173
class EuroparlCorpusReader(PlaintextCorpusReader):

0 commit comments

Comments
 (0)