diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.pyi b/bindings/python/py_src/tokenizers/trainers/__init__.pyi index d6c525718..58174ab44 100644 --- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi @@ -45,6 +45,20 @@ class BpeTrainer(Trainer): highly repetitive tokens like `======` for wikipedia """ + def __init__( + self, + vocab_size=30000, + min_frequency=0, + show_progress=True, + special_tokens=[], + limit_alphabet=None, + initial_alphabet=[], + continuing_subword_prefix=None, + end_of_word_suffix=None, + max_token_length=None, + words={}, + ): + pass class UnigramTrainer(Trainer): """ @@ -85,6 +99,7 @@ class UnigramTrainer(Trainer): vocab_size=8000, show_progress=True, special_tokens=[], + initial_alphabet=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, @@ -109,6 +124,8 @@ class WordLevelTrainer(Trainer): special_tokens (:obj:`List[Union[str, AddedToken]]`): A list of special tokens the model should know of. 
""" + def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]): + pass class WordPieceTrainer(Trainer): """ diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs index ef2c31e56..30786862e 100644 --- a/bindings/python/src/trainers.rs +++ b/bindings/python/src/trainers.rs @@ -312,7 +312,10 @@ impl PyBpeTrainer { } #[new] - #[pyo3(signature = (**kwargs), text_signature = None)] + #[pyo3( + signature = (**kwargs), + text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=None, end_of_word_suffix=None, max_token_length=None, words={})" + )] pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::bpe::BpeTrainer::builder(); if let Some(kwargs) = kwargs { @@ -518,7 +521,7 @@ impl PyWordPieceTrainer { #[new] #[pyo3( signature = (** kwargs), - text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)" + text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=\"##\", end_of_word_suffix=None)" )] pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::wordpiece::WordPieceTrainer::builder(); @@ -659,7 +662,10 @@ impl PyWordLevelTrainer { } #[new] - #[pyo3(signature = (**kwargs), text_signature = None)] + #[pyo3( + signature = (**kwargs), + text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[])" + )] pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::wordlevel::WordLevelTrainer::builder(); @@ -826,7 +832,7 @@ impl PyUnigramTrainer { #[new] #[pyo3( 
signature = (**kwargs), - text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)" + text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], initial_alphabet=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)" )] pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::unigram::UnigramTrainer::builder();