@@ -65,31 +65,18 @@ defmodule Tokenizers.Tokenizer do
65
65
even if `:use_cache` is false. By default it uses `:filename.basedir/3` to get
66
66
a cache dir based on the "tokenizers_elixir" application name.
67
67
68
- * `:additional_special_tokens` - A list of special tokens to append to the tokenizer.
69
- Defaults to `[]`.
70
-
71
- * `:padding` - Override for padding configuration. Currently the only supported
72
- value is `:none` to disable padding. By default the configuration is restored
73
- from the file.
74
-
75
- * `:truncation` - Override for truncation configuration. Currently the only supported
76
- value is `:none` to disable truncation. By default the configuration is restored
77
- from the file.
78
-
79
68
"""
80
69
@ spec from_pretrained ( String . t ( ) , Keyword . t ( ) ) :: { :ok , t ( ) } | { :error , term ( ) }
81
70
def from_pretrained ( identifier , opts \\ [ ] ) do
82
71
opts =
83
72
Keyword . validate! (
84
73
opts ,
85
74
[
86
- :padding ,
87
- :truncation ,
75
+ :additional_special_tokens ,
88
76
revision: "main" ,
89
77
use_cache: true ,
90
78
cache_dir: :filename . basedir ( :user_cache , "tokenizers_elixir" ) ,
91
- http_client: { Tokenizers.HTTPClient , [ ] } ,
92
- additional_special_tokens: [ ]
79
+ http_client: { Tokenizers.HTTPClient , [ ] }
93
80
]
94
81
)
95
82
@@ -114,7 +101,7 @@ defmodule Tokenizers.Tokenizer do
114
101
Path . join ( cache_dir , entry_filename ( url , etag ) )
115
102
end
116
103
117
- load_opts = Keyword . take ( opts , [ :additional_special_tokens , :padding , :truncation ] )
104
+ load_opts = Keyword . take ( opts , [ :additional_special_tokens ] )
118
105
119
106
if opts [ :use_cache ] do
120
107
with { :ok , response } <- request ( http_client , Keyword . put ( http_opts , :method , :head ) ) do
@@ -183,43 +170,33 @@ defmodule Tokenizers.Tokenizer do
183
170
Base . encode32 ( etag , case: :lower , padding: false )
184
171
end
185
172
186
- @ typedoc """
187
- Options to set on the loaded tokenizer.
188
-
189
- * `:additional_special_tokens - a list of special tokens to append to the tokenizer.
190
- Defaults to `[]`.
191
-
192
- * `:padding` - Override for padding configuration. Currently the only supported
193
- value is `:none` to disable padding. By default the configuration is restored
194
- from the file.
195
-
196
- * `:truncation` - Override for truncation configuration. Currently the only supported
197
- value is `:none` to disable truncation. By default the configuration is restored
198
- from the file.
199
-
200
- """
201
- @ type load_options ::
202
- [
203
- additional_special_tokens: [ String . t ( ) | Tokenizers.AddedToken . t ( ) ] ,
204
- padding: :none ,
205
- truncation: :none
206
- ]
207
-
208
173
@doc """
Instantiate a new tokenizer from the file at the given path.

`options` is handed over unchanged to
`Tokenizers.Native.tokenizer_from_file/2`.

If `options` contains the `:additional_special_tokens` key, a deprecation
warning is emitted through `IO.warn/1` before the tokenizer is loaded; the
option itself is still forwarded together with the rest of `options`.
"""
@spec from_file(path :: String.t(), keyword()) :: {:ok, t()} | {:error, term()}
def from_file(path, options \\ []) do
  # Only the presence of the key matters here, not its value.
  uses_deprecated_option? = Keyword.has_key?(options, :additional_special_tokens)

  if uses_deprecated_option? do
    IO.warn(
      "passing :additional_special_tokens as an option is deprecated. Use add_special_tokens/2 instead"
    )
  end

  Tokenizers.Native.tokenizer_from_file(path, options)
end
215
186
216
187
@doc """
Instantiate a new tokenizer from the buffer.

`options` is handed over unchanged to
`Tokenizers.Native.tokenizer_from_buffer/2`.

If `options` contains the `:additional_special_tokens` key, a deprecation
warning is emitted through `IO.warn/1` before the tokenizer is loaded; the
option itself is still forwarded together with the rest of `options`.
"""
@spec from_buffer(data :: String.t(), keyword()) :: {:ok, t()} | {:error, term()}
def from_buffer(data, options \\ []) do
  # Only the presence of the key matters here, not its value.
  uses_deprecated_option? = Keyword.has_key?(options, :additional_special_tokens)

  if uses_deprecated_option? do
    IO.warn(
      "passing :additional_special_tokens as an option is deprecated. Use add_special_tokens/2 instead"
    )
  end

  Tokenizers.Native.tokenizer_from_buffer(data, options)
end
223
200
224
201
@ doc """
225
202
Save the tokenizer to the provided path. Options:
0 commit comments