@@ -67,16 +67,30 @@ defmodule Tokenizers.Tokenizer do
67
67
68
68
* `:additional_special_tokens` - A list of special tokens to append to the tokenizer.
69
69
Defaults to `[]`.
70
+
71
+ * `:padding` - Override for padding configuration. Currently the only supported
72
+ value is `:none` to disable padding. By default the configuration is restored
73
+ from the file.
74
+
75
+ * `:truncation` - Override for truncation configuration. Currently the only supported
76
+ value is `:none` to disable truncation. By default the configuration is restored
77
+ from the file.
78
+
70
79
"""
71
80
@spec from_pretrained(String.t(), Keyword.t()) :: {:ok, t()} | {:error, term()}
72
81
def from_pretrained ( identifier , opts \\ [ ] ) do
73
82
opts =
74
- Keyword . validate! ( opts ,
75
- revision: "main" ,
76
- use_cache: true ,
77
- cache_dir: :filename . basedir ( :user_cache , "tokenizers_elixir" ) ,
78
- http_client: { Tokenizers.HTTPClient , [ ] } ,
79
- additional_special_tokens: [ ]
83
+ Keyword . validate! (
84
+ opts ,
85
+ [
86
+ :padding ,
87
+ :truncation ,
88
+ revision: "main" ,
89
+ use_cache: true ,
90
+ cache_dir: :filename . basedir ( :user_cache , "tokenizers_elixir" ) ,
91
+ http_client: { Tokenizers.HTTPClient , [ ] } ,
92
+ additional_special_tokens: [ ]
93
+ ]
80
94
)
81
95
82
96
{ http_client , http_opts } = opts [ :http_client ]
@@ -100,19 +114,21 @@ defmodule Tokenizers.Tokenizer do
100
114
Path . join ( cache_dir , entry_filename ( url , etag ) )
101
115
end
102
116
117
+ load_opts = Keyword . take ( opts , [ :additional_special_tokens , :padding , :truncation ] )
118
+
103
119
if opts [ :use_cache ] do
104
120
with { :ok , response } <- request ( http_client , Keyword . put ( http_opts , :method , :head ) ) do
105
121
etag = fetch_etag ( response . headers )
106
122
file_path = file_path_fun . ( etag )
107
123
108
124
if File . exists? ( file_path ) do
109
- from_file ( file_path , Keyword . take ( opts , [ :additional_special_tokens ] ) )
125
+ from_file ( file_path , load_opts )
110
126
else
111
127
with { :ok , response } <- request ( http_client , http_opts ) do
112
128
File . mkdir_p! ( cache_dir )
113
129
File . write! ( file_path , response . body )
114
130
115
- from_file ( file_path , Keyword . take ( opts , [ :additional_special_tokens ] ) )
131
+ from_file ( file_path , load_opts )
116
132
end
117
133
end
118
134
end
@@ -124,7 +140,7 @@ defmodule Tokenizers.Tokenizer do
124
140
File . mkdir_p! ( cache_dir )
125
141
File . write! ( file_path , response . body )
126
142
127
- from_file ( file_path , Keyword . take ( opts , [ :additional_special_tokens ] ) )
143
+ from_file ( file_path , load_opts )
128
144
end
129
145
end
130
146
end
@@ -167,28 +183,40 @@ defmodule Tokenizers.Tokenizer do
167
183
Base . encode32 ( etag , case: :lower , padding: false )
168
184
end
169
185
186
+ @ typedoc """
187
+ Options to set on the loaded tokenizer.
188
+
189
+ * `:additional_special_tokens` - A list of special tokens to append to the tokenizer.
190
+ Defaults to `[]`.
191
+
192
+ * `:padding` - Override for padding configuration. Currently the only supported
193
+ value is `:none` to disable padding. By default the configuration is restored
194
+ from the file.
195
+
196
+ * `:truncation` - Override for truncation configuration. Currently the only supported
197
+ value is `:none` to disable truncation. By default the configuration is restored
198
+ from the file.
199
+
200
+ """
201
+ @type load_options ::
202
+ [
203
+ additional_special_tokens: [ String . t ( ) | Tokenizers.AddedToken . t ( ) ] ,
204
+ padding: :none ,
205
+ truncation: :none
206
+ ]
207
+
170
208
@ doc """
171
209
Instantiate a new tokenizer from the file at the given path.
172
- You can specify a list of special tokens to append to the tokenizer.
173
210
"""
174
- @ spec from_file (
175
- path :: String . t ( ) ,
176
- options :: [ additional_special_tokens :: [ String . t ( ) | Tokenizers.AddedToken . t ( ) ] ]
177
- ) ::
178
- { :ok , t ( ) } | { :error , term ( ) }
211
+ @spec from_file(path :: String.t(), load_options()) :: {:ok, t()} | {:error, term()}
179
212
defdelegate from_file ( path , options \\ [ ] ) ,
180
213
to: Tokenizers.Native ,
181
214
as: :tokenizer_from_file
182
215
183
216
@ doc """
184
217
Instantiate a new tokenizer from the buffer.
185
- You can specify a list of special tokens to append to the tokenizer.
186
218
"""
187
- @ spec from_buffer (
188
- data :: String . t ( ) ,
189
- options :: [ additional_special_tokens :: [ String . t ( ) | Tokenizers.AddedToken . t ( ) ] ]
190
- ) ::
191
- { :ok , t ( ) } | { :error , term ( ) }
219
+ @spec from_buffer(data :: String.t(), load_options()) :: {:ok, t()} | {:error, term()}
192
220
defdelegate from_buffer ( data , options \\ [ ] ) ,
193
221
to: Tokenizers.Native ,
194
222
as: :tokenizer_from_buffer
0 commit comments