Commit c2477cb

Approach for models, trainer and added_token (#41)
1 parent 41b0f77 commit c2477cb

29 files changed: +1716 −135 lines

lib/tokenizers/added_token.ex

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+defmodule Tokenizers.AddedToken do
+  @moduledoc """
+  This struct represents an AddedToken.
+  """
+
+  @type t() :: %__MODULE__{resource: reference()}
+  defstruct [:resource]
+
+  @typedoc """
+  Options for added token initialisation. All options can be omitted.
+  """
+  @type opts() :: [
+          special: boolean(),
+          single_word: boolean(),
+          lstrip: boolean(),
+          rstrip: boolean(),
+          normalized: boolean()
+        ]
+
+  @doc """
+  Create a new AddedToken.
+
+  * `:special` (default `false`) - defines whether this token is a special token.
+
+  * `:single_word` (default `false`) - defines whether this token should only match single
+    words. If `true`, this token will never match inside a word. For example, the token `ing`
+    would match on `tokenizing` if this option is `false`, but not if it is `true`.
+    The notion of "inside a word" is defined by the word boundaries pattern in regular
+    expressions (i.e. the token should start and end at word boundaries).
+
+  * `:lstrip` (default `false`) - defines whether this token should strip all potential
+    whitespace on its left side. If `true`, this token will greedily match any whitespace
+    on its left. For example, if we try to match the token `[MASK]` with `lstrip=true` in
+    the text `"I saw a [MASK]"`, we would match on `" [MASK]"` (note the space on the left).
+
+  * `:rstrip` (default `false`) - defines whether this token should strip all potential
+    whitespace on its right side. If `true`, this token will greedily match any whitespace
+    on its right. It works just like `lstrip`, but on the right.
+
+  * `:normalized` (default `true` for non-special tokens, `false` for special tokens) -
+    defines whether this token should match against the normalized version of the input
+    text. For example, with the added token `"yesterday"` and a normalizer in charge of
+    lowercasing the text, the token could be extracted from the input
+    `"I saw a lion Yesterday"`. If `true`, the token will be extracted from the normalized
+    input `"i saw a lion yesterday"`. If `false`, the token will be extracted from the
+    original input `"I saw a lion Yesterday"`.
+  """
+  @spec new(token :: String.t(), opts :: opts()) :: t()
+  defdelegate new(token, opts \\ []), to: Tokenizers.Native, as: :added_token_new
+
+  @doc """
+  Retrieves information about the added token.
+  """
+  @spec info(added_token :: __MODULE__.t()) :: map()
+  defdelegate info(added_token), to: Tokenizers.Native, as: :added_token_info
+end
+
+defimpl Inspect, for: Tokenizers.AddedToken do
+  import Inspect.Algebra
+
+  @spec inspect(Tokenizers.AddedToken.t(), Inspect.Opts.t()) :: Inspect.Algebra.t()
+  def inspect(added_token, opts) do
+    attrs =
+      added_token
+      |> Tokenizers.Native.added_token_info()
+      |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)
+
+    concat(["#Tokenizers.AddedToken<", to_doc(attrs, opts), ">"])
+  end
+end
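
A minimal usage sketch of the new module, assuming the NIFs are loaded; the token and options are illustrative:

    # Create a special [MASK] token that also swallows the whitespace on its left.
    mask = Tokenizers.AddedToken.new("[MASK]", special: true, lstrip: true)

    # Retrieve the options the token was created with (string keys, per the Inspect impl).
    Tokenizers.AddedToken.info(mask)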

lib/tokenizers/model.ex

Lines changed: 24 additions & 8 deletions
@@ -3,30 +3,46 @@ defmodule Tokenizers.Model do
   The struct and associated functions for the tokenizer model.
   """
 
-  @type t :: %__MODULE__{resource: binary(), reference: reference()}
-  defstruct resource: nil, reference: nil
-
-  alias Tokenizers.Native
-  alias Tokenizers.Shared
+  @typedoc """
+  Represents the different kinds of models that can be used across the library.
+  """
+  @type t() :: %__MODULE__{resource: reference()}
+  defstruct [:resource]
 
   @doc """
   Retrieves information about the model.
 
   Information retrieved differs per model but all include `model_type`.
   """
-  @spec get_model_details(model :: __MODULE__.t()) :: map()
-  def get_model_details(model), do: model |> Native.get_model_details() |> Shared.unwrap()
+  @spec info(model :: __MODULE__.t()) :: map()
+  defdelegate info(model), to: Tokenizers.Native, as: :models_info
+
+  @typedoc """
+  Options to save the model. All options can be omitted.
+
+  * `:prefix` (default `""`) - the prefix to use for all the files that will get created.
+  """
+  @type save_opts() :: [prefix: String.t()]
+
+  @doc """
+  Saves the current model in the given folder, using the given prefix for the various
+  files that will get created. Any file with the same name that already exists in this
+  folder will be overwritten.
+  """
+  @spec save(model :: t(), folder :: String.t(), opts :: save_opts()) ::
+          {:ok, file_paths :: [String.t()]} | {:error, any()}
+  defdelegate save(model, folder, opts \\ []), to: Tokenizers.Native, as: :models_save
 end
 
 defimpl Inspect, for: Tokenizers.Model do
   import Inspect.Algebra
 
   alias Tokenizers.Model
 
+  @spec inspect(Tokenizers.Model.t(), Inspect.Opts.t()) :: Inspect.Algebra.t()
   def inspect(model, opts) do
     attrs =
       model
-      |> Model.get_model_details()
+      |> Model.info()
       |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)
 
     concat(["#Tokenizers.Model<", to_doc(attrs, opts), ">"])
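
A hedged sketch of the renamed and new functions, assuming any model resource (here one from `Tokenizers.Model.BPE.empty/0`); the folder and prefix are illustrative:

    {:ok, model} = Tokenizers.Model.BPE.empty()

    # Per-model metadata; keys differ per model but all include "model_type".
    info = Tokenizers.Model.info(model)

    # Write the model files into the folder, prefixing each created file name.
    {:ok, paths} = Tokenizers.Model.save(model, "/tmp/tokenizer", prefix: "my_model")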

lib/tokenizers/model/bpe.ex

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+defmodule Tokenizers.Model.BPE do
+  @typedoc """
+  Options for model initialisation. All options can be omitted.
+
+  * `:cache_capacity` (default `10_000`) - the number of words that the BPE cache can
+    contain. The cache speeds up the process by keeping the result of the merge
+    operations for a number of words.
+  * `:dropout` - the BPE dropout to use. Must be a float between 0 and 1.
+  * `:unk_token` - the unknown token to be used by the model.
+  * `:continuing_subword_prefix` - the prefix to attach to subword units that don't
+    represent the beginning of a word.
+  * `:end_of_word_suffix` - the suffix to attach to subword units that represent
+    the end of a word.
+  * `:fuse_unk` - whether to fuse consecutive unknown tokens into a single one.
+  * `:byte_fallback` - whether to use the byte-fallback trick, decomposing unknown
+    tokens into byte tokens.
+  """
+  @type options() :: [
+          cache_capacity: number(),
+          dropout: float(),
+          unk_token: String.t(),
+          continuing_subword_prefix: String.t(),
+          end_of_word_suffix: String.t(),
+          fuse_unk: boolean(),
+          byte_fallback: boolean()
+        ]
+
+  @doc """
+  Instantiate a BPE model from the given vocab and merges.
+  """
+  @spec init(
+          vocab :: %{String.t() => integer()},
+          merges :: [{String.t(), String.t()}],
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate init(vocab, merges, options \\ []), to: Tokenizers.Native, as: :models_bpe_init
+
+  @doc """
+  Instantiate an empty BPE model.
+  """
+  @spec empty() :: {:ok, Tokenizers.Model.t()}
+  defdelegate empty(), to: Tokenizers.Native, as: :models_bpe_empty
+
+  @doc """
+  Instantiate a BPE model from the given vocab and merges files.
+  """
+  @spec from_file(
+          vocab :: String.t(),
+          merges :: String.t(),
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate from_file(vocab, merges, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_bpe_from_file
+end
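
To illustrate the expected shapes, a toy sketch (the vocab, merges, and file names are made up):

    # vocab maps tokens to ids; merges lists the learned token pairs, here "a" + "b" -> "ab".
    vocab = %{"a" => 0, "b" => 1, "ab" => 2}
    merges = [{"a", "b"}]

    {:ok, model} = Tokenizers.Model.BPE.init(vocab, merges, dropout: 0.1)

    # Or load HuggingFace-style files from disk.
    {:ok, model} = Tokenizers.Model.BPE.from_file("vocab.json", "merges.txt")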

lib/tokenizers/model/unigram.ex

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+defmodule Tokenizers.Model.Unigram do
+  @typedoc """
+  Options for model initialisation. All options can be omitted.
+
+  * `:unk_id` - the id of the unknown token to be used by the model.
+  """
+  @type options() :: [
+          unk_id: non_neg_integer()
+        ]
+
+  @doc """
+  Instantiate a Unigram model from the given vocab.
+  """
+  @spec init(
+          vocab :: [{String.t(), number()}],
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate init(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_unigram_init
+
+  @doc """
+  Instantiate an empty Unigram model.
+  """
+  @spec empty() :: {:ok, Tokenizers.Model.t()}
+  defdelegate empty(), to: Tokenizers.Native, as: :models_unigram_empty
+end
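
A toy sketch of the vocab shape the spec implies (the scores are made-up log probabilities):

    # Each entry is {token, score}; :unk_id is the index of the unknown token in the vocab.
    vocab = [{"<unk>", 0.0}, {"hello", -1.51}, {"world", -2.02}]

    {:ok, model} = Tokenizers.Model.Unigram.init(vocab, unk_id: 0)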

lib/tokenizers/model/wordlevel.ex

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+defmodule Tokenizers.Model.WordLevel do
+  @typedoc """
+  Options for model initialisation. All options can be omitted.
+
+  * `:unk_token` (default `"[UNK]"`) - the unknown token to be used by the model.
+  """
+  @type options() :: [
+          unk_token: String.t()
+        ]
+
+  @doc """
+  Instantiate a WordLevel model from the given vocab.
+  """
+  @spec init(
+          vocab :: %{String.t() => integer()},
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate init(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_wordlevel_init
+
+  @doc """
+  Instantiate an empty WordLevel model.
+  """
+  @spec empty() :: {:ok, Tokenizers.Model.t()}
+  defdelegate empty(), to: Tokenizers.Native, as: :models_wordlevel_empty
+
+  @doc """
+  Instantiate a WordLevel model from the given vocab file.
+  """
+  @spec from_file(
+          vocab :: String.t(),
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate from_file(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_wordlevel_from_file
+end
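
For illustration, a minimal sketch with a made-up vocab:

    # WordLevel is a plain token-to-id lookup; unseen tokens map to :unk_token.
    vocab = %{"[UNK]" => 0, "hello" => 1, "world" => 2}

    {:ok, model} = Tokenizers.Model.WordLevel.init(vocab, unk_token: "[UNK]")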

lib/tokenizers/model/wordpiece.ex

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+defmodule Tokenizers.Model.WordPiece do
+  @typedoc """
+  Options for model initialisation. All options can be omitted.
+
+  * `:unk_token` (default `"[UNK]"`) - the unknown token to be used by the model.
+  * `:max_input_chars_per_word` (default `100`) - the maximum number of characters
+    to allow in a single word.
+  * `:continuing_subword_prefix` (default `"##"`) - the prefix to attach to subword
+    units that don't represent the beginning of a word.
+  """
+  @type options() :: [
+          unk_token: String.t(),
+          max_input_chars_per_word: number(),
+          continuing_subword_prefix: String.t()
+        ]
+
+  @doc """
+  Instantiate a WordPiece model from the given vocab.
+  """
+  @spec init(
+          vocab :: %{String.t() => integer()},
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate init(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_wordpiece_init
+
+  @doc """
+  Instantiate an empty WordPiece model.
+  """
+  @spec empty() :: {:ok, Tokenizers.Model.t()}
+  defdelegate empty(), to: Tokenizers.Native, as: :models_wordpiece_empty
+
+  @doc """
+  Instantiate a WordPiece model from the given vocab file.
+  """
+  @spec from_file(
+          vocab :: String.t(),
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate from_file(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_wordpiece_from_file
+end
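
For illustration, a minimal sketch with a made-up vocab:

    # The "##" prefix marks continuation pieces, so "playing" can split into ["play", "##ing"].
    vocab = %{"[UNK]" => 0, "play" => 1, "##ing" => 2}

    {:ok, model} = Tokenizers.Model.WordPiece.init(vocab, unk_token: "[UNK]")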

lib/tokenizers/native.ex

Lines changed: 35 additions & 0 deletions
@@ -10,6 +10,41 @@ defmodule Tokenizers.Native do
     base_url: "#{github_url}/releases/download/v#{version}",
     force_build: System.get_env("TOKENIZERS_BUILD") in ["1", "true"]
 
+  # Added tokens
+  def added_token_new(_token, _opts), do: err()
+  #
+  def added_token_info(_added_token), do: err()
+
+  # Models
+  def models_save(_model, _folder, _opts), do: err()
+  #
+  def models_info(_model), do: err()
+  #
+  def models_bpe_init(_vocab, _merges, _options), do: err()
+  def models_bpe_empty(), do: err()
+  def models_bpe_from_file(_vocab, _merges, _options), do: err()
+  #
+  def models_wordpiece_init(_vocab, _options), do: err()
+  def models_wordpiece_empty(), do: err()
+  def models_wordpiece_from_file(_vocab, _options), do: err()
+  #
+  def models_wordlevel_init(_vocab, _options), do: err()
+  def models_wordlevel_empty(), do: err()
+  def models_wordlevel_from_file(_vocab, _options), do: err()
+  #
+  def models_unigram_init(_vocab, _options), do: err()
+  def models_unigram_empty(), do: err()
+
+  # Trainers
+  def trainers_info(_trainer), do: err()
+  #
+  def trainers_train(_trainer, _model), do: err()
+  #
+  def trainers_bpe_trainer(_options), do: err()
+  def trainers_wordpiece_trainer(_options), do: err()
+  def trainers_wordlevel_trainer(_options), do: err()
+  def trainers_unigram_trainer(_options), do: err()
+
   def decode(_tokenizer, _ids, _skip_special_tokens), do: err()
   def decode_batch(_tokenizer, _ids, _skip_special_tokens), do: err()
   def encode(_tokenizer, _input, _add_special_tokens), do: err()
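
These stubs follow the usual Rustler pattern: each body is replaced by the native implementation when the NIF library loads, and `err/0` is presumably the module's existing helper along the lines of:

    # Raised only if the NIF library failed to load.
    defp err(), do: :erlang.nif_error(:nif_not_loaded)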

lib/tokenizers/tokenizer.ex

Lines changed: 1 addition & 1 deletion
@@ -268,7 +268,7 @@ defimpl Inspect, for: Tokenizers.Tokenizer do
     model_details =
       tokenizer
       |> Tokenizer.get_model()
-      |> Model.get_model_details()
+      |> Model.info()
       |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)
 
     attrs =
