Commit 7cadd5f

Make tokenizer functions immutable (#47)
1 parent 90dd590 commit 7cadd5f

10 files changed: 213 additions & 287 deletions

lib/tokenizers/native.ex

Lines changed: 0 additions & 2 deletions

@@ -126,8 +126,6 @@ defmodule Tokenizers.Native do
   # Trainers
   def trainers_info(_trainer), do: err()
   #
-  def trainers_train(_trainer, _model), do: err()
-  #
   def trainers_bpe_trainer(_options), do: err()
   def trainers_wordpiece_trainer(_options), do: err()
   def trainers_wordlevel_trainer(_options), do: err()

lib/tokenizers/tokenizer.ex

Lines changed: 23 additions & 46 deletions

@@ -65,31 +65,18 @@ defmodule Tokenizers.Tokenizer do
       even if `:use_cache` is false. By default it uses `:filename.basedir/3` to get
       a cache dir based in the "tokenizers_elixir" application name.
 
-    * `:additional_special_tokens` - A list of special tokens to append to the tokenizer.
-      Defaults to `[]`.
-
-    * `:padding` - Override for padding configuration. Currently the only supported
-      value is `:none` to disable padding. By default the configuration is restored
-      from the file.
-
-    * `:truncation` - Override for truncation configuration. Currently the only supported
-      value is `:none` to disable truncation. By default the configuration is restored
-      from the file.
-
   """
   @spec from_pretrained(String.t(), Keyword.t()) :: {:ok, t()} | {:error, term()}
   def from_pretrained(identifier, opts \\ []) do
     opts =
       Keyword.validate!(
         opts,
         [
-          :padding,
-          :truncation,
+          :additional_special_tokens,
           revision: "main",
           use_cache: true,
           cache_dir: :filename.basedir(:user_cache, "tokenizers_elixir"),
-          http_client: {Tokenizers.HTTPClient, []},
-          additional_special_tokens: []
+          http_client: {Tokenizers.HTTPClient, []}
         ]
       )
 
@@ -114,7 +101,7 @@ defmodule Tokenizers.Tokenizer do
         Path.join(cache_dir, entry_filename(url, etag))
       end
 
-    load_opts = Keyword.take(opts, [:additional_special_tokens, :padding, :truncation])
+    load_opts = Keyword.take(opts, [:additional_special_tokens])
 
     if opts[:use_cache] do
       with {:ok, response} <- request(http_client, Keyword.put(http_opts, :method, :head)) do

@@ -183,43 +170,33 @@ defmodule Tokenizers.Tokenizer do
     Base.encode32(etag, case: :lower, padding: false)
   end
 
-  @typedoc """
-  Options to set on the loaded tokenizer.
-
-    * `:additional_special_tokens - a list of special tokens to append to the tokenizer.
-      Defaults to `[]`.
-
-    * `:padding` - Override for padding configuration. Currently the only supported
-      value is `:none` to disable padding. By default the configuration is restored
-      from the file.
-
-    * `:truncation` - Override for truncation configuration. Currently the only supported
-      value is `:none` to disable truncation. By default the configuration is restored
-      from the file.
-
-  """
-  @type load_options ::
-          [
-            additional_special_tokens: [String.t() | Tokenizers.AddedToken.t()],
-            padding: :none,
-            truncation: :none
-          ]
-
   @doc """
   Instantiate a new tokenizer from the file at the given path.
   """
-  @spec from_file(path :: String.t(), load_options()) :: {:ok, t()} | {:error, term()}
-  defdelegate from_file(path, options \\ []),
-    to: Tokenizers.Native,
-    as: :tokenizer_from_file
+  @spec from_file(path :: String.t(), keyword()) :: {:ok, t()} | {:error, term()}
+  def from_file(path, options \\ []) do
+    if Keyword.has_key?(options, :additional_special_tokens) do
+      IO.warn(
+        "passing :additional_special_tokens as an option is deprecated. Use add_special_tokens/2 instead"
+      )
+    end
+
+    Tokenizers.Native.tokenizer_from_file(path, options)
+  end
 
   @doc """
   Instantiate a new tokenizer from the buffer.
   """
-  @spec from_buffer(data :: String.t(), load_options()) :: {:ok, t()} | {:error, term()}
-  defdelegate from_buffer(data, options \\ []),
-    to: Tokenizers.Native,
-    as: :tokenizer_from_buffer
+  @spec from_buffer(data :: String.t(), keyword()) :: {:ok, t()} | {:error, term()}
+  def from_buffer(data, options \\ []) do
+    if Keyword.has_key?(options, :additional_special_tokens) do
+      IO.warn(
+        "passing :additional_special_tokens as an option is deprecated. Use add_special_tokens/2 instead"
+      )
+    end
+
+    Tokenizers.Native.tokenizer_from_buffer(data, options)
+  end
 
   @doc """
   Save the tokenizer to the provided path. Options:
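
For callers, the practical effect is that `:additional_special_tokens` is no longer a load option for `from_file/2` and `from_buffer/2`; special tokens are added after loading instead. A minimal usage sketch, assuming `Tokenizers.Tokenizer.add_special_tokens/2` (the replacement named in the deprecation warning) returns the updated tokenizer, with a placeholder path and placeholder token values:

# Before (now deprecated):
# {:ok, tokenizer} = Tokenizers.Tokenizer.from_file("tokenizer.json", additional_special_tokens: ["<pad>"])

# After: load first, then add the special tokens explicitly.
{:ok, tokenizer} = Tokenizers.Tokenizer.from_file("tokenizer.json")

# Return shape assumed here; adjust if add_special_tokens/2 returns {:ok, tokenizer}.
tokenizer = Tokenizers.Tokenizer.add_special_tokens(tokenizer, ["<pad>", "<mask>"])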

lib/tokenizers/trainer.ex

Lines changed: 0 additions & 7 deletions

@@ -12,13 +12,6 @@ defmodule Tokenizers.Trainer do
   @spec info(t()) :: map()
   defdelegate info(trainer), to: Tokenizers.Native, as: :trainers_info
 
-  @doc """
-  The actual training method.
-  This will mutate a Model as well as return a list of special_tokens to be added directly to the tokenizer along with the model.
-  """
-  @spec train(t(), Tokenizers.Model.t()) :: {:ok, [String.t()]} | {:error, any()}
-  defdelegate train(trainer, model), to: Tokenizers.Native, as: :trainers_train
-
   @typedoc """
   Options for BPE trainer initialisation. All options can be ommited.
   """

native/ex_tokenizers/src/lib.rs

Lines changed: 0 additions & 2 deletions

@@ -148,8 +148,6 @@ rustler::init!(
     // Trainers
     trainers_info,
     //
-    trainers_train,
-    //
     trainers_bpe_trainer,
     trainers_wordpiece_trainer,
     trainers_wordlevel_trainer,
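
Note that the NIF is removed on both sides of the boundary: the Elixir stub in `lib/tokenizers/native.ex` and the matching entry in the `rustler::init!` export list above must stay in sync. A simplified sketch of the stub pattern, with assumed details (the `use Rustler` options and the `err/0` helper body are not shown in this diff):

# Simplified sketch of the Tokenizers.Native stub module; details assumed.
defmodule Tokenizers.Native do
  use Rustler, otp_app: :tokenizers, crate: "ex_tokenizers"

  # Each stub is overridden by the Rust implementation when the NIF loads.
  # If the NIF is not loaded, calling the stub raises :nif_not_loaded.
  def trainers_info(_trainer), do: err()

  defp err(), do: :erlang.nif_error(:nif_not_loaded)
end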
