Commit c2477cb

Approach for models, trainer and added_token (#41)
1 parent 41b0f77 commit c2477cb

29 files changed: +1716 −135 lines

lib/tokenizers/added_token.ex

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+defmodule Tokenizers.AddedToken do
+  @moduledoc """
+  This struct represents an AddedToken.
+  """
+
+  @type t() :: %__MODULE__{resource: reference()}
+  defstruct [:resource]
+
+  @typedoc """
+  Options for added token initialisation. All options can be omitted.
+  """
+  @type opts() :: [
+          special: boolean(),
+          single_word: boolean(),
+          lstrip: boolean(),
+          rstrip: boolean(),
+          normalized: boolean()
+        ]
+
+  @doc """
+  Create a new AddedToken.
+
+  * `:special` (default `false`) - defines whether this token is a special token.
+
+  * `:single_word` (default `false`) - defines whether this token should only match single
+    words. If `true`, this token will never match inside a word. For example, the token `ing`
+    would match on `tokenizing` if this option is `false`, but not if it is `true`.
+    The notion of "inside a word" is defined by the word boundaries pattern in regular
+    expressions (i.e. the token should start and end at word boundaries).
+
+  * `:lstrip` (default `false`) - defines whether this token should strip all potential
+    whitespace on its left side. If `true`, this token will greedily match any whitespace
+    on its left. For example, if we try to match the token `[MASK]` with `lstrip=true` in
+    the text `"I saw a [MASK]"`, we would match on `" [MASK]"` (note the space on the left).
+
+  * `:rstrip` (default `false`) - defines whether this token should strip all potential
+    whitespace on its right side. If `true`, this token will greedily match any whitespace
+    on its right. It works just like `lstrip`, but on the right.
+
+  * `:normalized` (default `true` for non-special tokens, `false` for special tokens) -
+    defines whether this token should match against the normalized version of the input
+    text. For example, with the added token `"yesterday"` and a normalizer in charge of
+    lowercasing the text, the token could be extracted from the input
+    `"I saw a lion Yesterday"`. If `true`, the token will be extracted from the normalized
+    input `"i saw a lion yesterday"`. If `false`, the token will be extracted from the
+    original input `"I saw a lion Yesterday"`.
+  """
+  @spec new(token :: String.t(), opts :: opts()) :: t()
+  defdelegate new(token, opts \\ []), to: Tokenizers.Native, as: :added_token_new
+
+  @doc """
+  Retrieves information about the added token.
+  """
+  @spec info(added_token :: __MODULE__.t()) :: map()
+  defdelegate info(added_token), to: Tokenizers.Native, as: :added_token_info
+end
+
+defimpl Inspect, for: Tokenizers.AddedToken do
+  import Inspect.Algebra
+
+  @spec inspect(Tokenizers.AddedToken.t(), Inspect.Opts.t()) :: Inspect.Algebra.t()
+  def inspect(added_token, opts) do
+    attrs =
+      added_token
+      |> Tokenizers.Native.added_token_info()
+      |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)
+
+    concat(["#Tokenizers.AddedToken<", to_doc(attrs, opts), ">"])
+  end
+end
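
A minimal usage sketch of the new module, assuming the NIFs are loaded; the token and options are illustrative:

    # Create a special [MASK] token that also swallows the whitespace on its left.
    mask = Tokenizers.AddedToken.new("[MASK]", special: true, lstrip: true)

    # Retrieve the options the token was created with (string keys, per the Inspect impl).
    Tokenizers.AddedToken.info(mask)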

lib/tokenizers/model.ex

Lines changed: 24 additions & 8 deletions
@@ -3,30 +3,46 @@ defmodule Tokenizers.Model do
   The struct and associated functions for the tokenizer model.
   """
 
-  @type t :: %__MODULE__{resource: binary(), reference: reference()}
-  defstruct resource: nil, reference: nil
-
-  alias Tokenizers.Native
-  alias Tokenizers.Shared
+  @typedoc """
+  Represents the different kinds of models that can be used across the library.
+  """
+  @type t() :: %__MODULE__{resource: reference()}
+  defstruct [:resource]
 
   @doc """
   Retrieves information about the model.
 
   Information retrieved differs per model but all include `model_type`.
   """
-  @spec get_model_details(model :: __MODULE__.t()) :: map()
-  def get_model_details(model), do: model |> Native.get_model_details() |> Shared.unwrap()
+  @spec info(model :: __MODULE__.t()) :: map()
+  defdelegate info(model), to: Tokenizers.Native, as: :models_info
+
+  @typedoc """
+  Options to save the model. All options can be omitted.
+
+  * `:prefix` (default `""`) - the prefix to use for all the files that will get created.
+  """
+  @type save_opts() :: [prefix: String.t()]
+
+  @doc """
+  Saves the current model in the given folder, using the given prefix for the various
+  files that will get created. Any file with the same name that already exists in this
+  folder will be overwritten.
+  """
+  @spec save(model :: t(), folder :: String.t(), opts :: save_opts()) ::
+          {:ok, file_paths :: [String.t()]} | {:error, any()}
+  defdelegate save(model, folder, opts \\ []), to: Tokenizers.Native, as: :models_save
 end
 
 defimpl Inspect, for: Tokenizers.Model do
   import Inspect.Algebra
 
   alias Tokenizers.Model
 
+  @spec inspect(Tokenizers.Model.t(), Inspect.Opts.t()) :: Inspect.Algebra.t()
   def inspect(model, opts) do
     attrs =
       model
-      |> Model.get_model_details()
+      |> Model.info()
       |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)
 
     concat(["#Tokenizers.Model<", to_doc(attrs, opts), ">"])
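
A hedged sketch of the renamed and new functions, assuming any model resource (here one from `Tokenizers.Model.BPE.empty/0`); the folder and prefix are illustrative:

    {:ok, model} = Tokenizers.Model.BPE.empty()

    # Per-model metadata; keys differ per model but all include "model_type".
    info = Tokenizers.Model.info(model)

    # Write the model files into the folder, prefixing each created file name.
    {:ok, paths} = Tokenizers.Model.save(model, "/tmp/tokenizer", prefix: "my_model")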

lib/tokenizers/model/bpe.ex

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+defmodule Tokenizers.Model.BPE do
+  @typedoc """
+  Options for model initialisation. All options can be omitted.
+
+  * `:cache_capacity` (default `10_000`) - the number of words that the BPE cache can
+    contain. The cache speeds up the process by keeping the result of the merge
+    operations for a number of words.
+  * `:dropout` - the BPE dropout to use. Must be a float between 0 and 1.
+  * `:unk_token` - the unknown token to be used by the model.
+  * `:continuing_subword_prefix` - the prefix to attach to subword units that don't
+    represent the beginning of a word.
+  * `:end_of_word_suffix` - the suffix to attach to subword units that represent
+    the end of a word.
+  * `:fuse_unk` - whether to fuse consecutive unknown tokens into a single one.
+  * `:byte_fallback` - whether to use the byte-fallback trick, decomposing unknown
+    tokens into byte tokens.
+  """
+  @type options() :: [
+          cache_capacity: number(),
+          dropout: float(),
+          unk_token: String.t(),
+          continuing_subword_prefix: String.t(),
+          end_of_word_suffix: String.t(),
+          fuse_unk: boolean(),
+          byte_fallback: boolean()
+        ]
+
+  @doc """
+  Instantiate a BPE model from the given vocab and merges.
+  """
+  @spec init(
+          vocab :: %{String.t() => integer()},
+          merges :: [{String.t(), String.t()}],
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate init(vocab, merges, options \\ []), to: Tokenizers.Native, as: :models_bpe_init
+
+  @doc """
+  Instantiate an empty BPE model.
+  """
+  @spec empty() :: {:ok, Tokenizers.Model.t()}
+  defdelegate empty(), to: Tokenizers.Native, as: :models_bpe_empty
+
+  @doc """
+  Instantiate a BPE model from the given vocab and merges files.
+  """
+  @spec from_file(
+          vocab :: String.t(),
+          merges :: String.t(),
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate from_file(vocab, merges, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_bpe_from_file
+end
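
To illustrate the expected shapes, a toy sketch (the vocab, merges, and file names are made up):

    # vocab maps tokens to ids; merges lists the learned token pairs, here "a" + "b" -> "ab".
    vocab = %{"a" => 0, "b" => 1, "ab" => 2}
    merges = [{"a", "b"}]

    {:ok, model} = Tokenizers.Model.BPE.init(vocab, merges, dropout: 0.1)

    # Or load HuggingFace-style files from disk.
    {:ok, model} = Tokenizers.Model.BPE.from_file("vocab.json", "merges.txt")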

lib/tokenizers/model/unigram.ex

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+defmodule Tokenizers.Model.Unigram do
+  @typedoc """
+  Options for model initialisation. All options can be omitted.
+
+  * `:unk_id` - the id of the unknown token to be used by the model.
+  """
+  @type options() :: [
+          unk_id: non_neg_integer()
+        ]
+
+  @doc """
+  Instantiate a Unigram model from the given vocab.
+  """
+  @spec init(
+          vocab :: [{String.t(), number()}],
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate init(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_unigram_init
+
+  @doc """
+  Instantiate an empty Unigram model.
+  """
+  @spec empty() :: {:ok, Tokenizers.Model.t()}
+  defdelegate empty(), to: Tokenizers.Native, as: :models_unigram_empty
+end
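
A toy sketch of the vocab shape the spec implies (the scores are made-up log probabilities):

    # Each entry is {token, score}; :unk_id is the index of the unknown token in the vocab.
    vocab = [{"<unk>", 0.0}, {"hello", -1.51}, {"world", -2.02}]

    {:ok, model} = Tokenizers.Model.Unigram.init(vocab, unk_id: 0)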

lib/tokenizers/model/wordlevel.ex

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+defmodule Tokenizers.Model.WordLevel do
+  @typedoc """
+  Options for model initialisation. All options can be omitted.
+
+  * `:unk_token` (default `"[UNK]"`) - the unknown token to be used by the model.
+  """
+  @type options() :: [
+          unk_token: String.t()
+        ]
+
+  @doc """
+  Instantiate a WordLevel model from the given vocab.
+  """
+  @spec init(
+          vocab :: %{String.t() => integer()},
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate init(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_wordlevel_init
+
+  @doc """
+  Instantiate an empty WordLevel model.
+  """
+  @spec empty() :: {:ok, Tokenizers.Model.t()}
+  defdelegate empty(), to: Tokenizers.Native, as: :models_wordlevel_empty
+
+  @doc """
+  Instantiate a WordLevel model from the given vocab file.
+  """
+  @spec from_file(
+          vocab :: String.t(),
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate from_file(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_wordlevel_from_file
+end
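
For illustration, a minimal sketch with a made-up vocab:

    # WordLevel is a plain token-to-id lookup; unseen tokens map to :unk_token.
    vocab = %{"[UNK]" => 0, "hello" => 1, "world" => 2}

    {:ok, model} = Tokenizers.Model.WordLevel.init(vocab, unk_token: "[UNK]")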

lib/tokenizers/model/wordpiece.ex

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+defmodule Tokenizers.Model.WordPiece do
+  @typedoc """
+  Options for model initialisation. All options can be omitted.
+
+  * `:unk_token` (default `"[UNK]"`) - the unknown token to be used by the model.
+  * `:max_input_chars_per_word` (default `100`) - the maximum number of characters
+    to allow in a single word.
+  * `:continuing_subword_prefix` (default `"##"`) - the prefix to attach to subword
+    units that don't represent the beginning of a word.
+  """
+  @type options() :: [
+          unk_token: String.t(),
+          max_input_chars_per_word: number(),
+          continuing_subword_prefix: String.t()
+        ]
+
+  @doc """
+  Instantiate a WordPiece model from the given vocab.
+  """
+  @spec init(
+          vocab :: %{String.t() => integer()},
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate init(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_wordpiece_init
+
+  @doc """
+  Instantiate an empty WordPiece model.
+  """
+  @spec empty() :: {:ok, Tokenizers.Model.t()}
+  defdelegate empty(), to: Tokenizers.Native, as: :models_wordpiece_empty
+
+  @doc """
+  Instantiate a WordPiece model from the given vocab file.
+  """
+  @spec from_file(
+          vocab :: String.t(),
+          options :: options()
+        ) :: {:ok, Tokenizers.Model.t()}
+  defdelegate from_file(vocab, options \\ []),
+    to: Tokenizers.Native,
+    as: :models_wordpiece_from_file
+end
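
For illustration, a minimal sketch with a made-up vocab:

    # The "##" prefix marks continuation pieces, so "playing" can split into ["play", "##ing"].
    vocab = %{"[UNK]" => 0, "play" => 1, "##ing" => 2}

    {:ok, model} = Tokenizers.Model.WordPiece.init(vocab, unk_token: "[UNK]")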

lib/tokenizers/native.ex

Lines changed: 35 additions & 0 deletions
@@ -10,6 +10,41 @@ defmodule Tokenizers.Native do
     base_url: "#{github_url}/releases/download/v#{version}",
     force_build: System.get_env("TOKENIZERS_BUILD") in ["1", "true"]
 
+  # Added tokens
+  def added_token_new(_token, _opts), do: err()
+  #
+  def added_token_info(_added_token), do: err()
+
+  # Models
+  def models_save(_model, _folder, _opts), do: err()
+  #
+  def models_info(_model), do: err()
+  #
+  def models_bpe_init(_vocab, _merges, _options), do: err()
+  def models_bpe_empty(), do: err()
+  def models_bpe_from_file(_vocab, _merges, _options), do: err()
+  #
+  def models_wordpiece_init(_vocab, _options), do: err()
+  def models_wordpiece_empty(), do: err()
+  def models_wordpiece_from_file(_vocab, _options), do: err()
+  #
+  def models_wordlevel_init(_vocab, _options), do: err()
+  def models_wordlevel_empty(), do: err()
+  def models_wordlevel_from_file(_vocab, _options), do: err()
+  #
+  def models_unigram_init(_vocab, _options), do: err()
+  def models_unigram_empty(), do: err()
+
+  # Trainers
+  def trainers_info(_trainer), do: err()
+  #
+  def trainers_train(_trainer, _model), do: err()
+  #
+  def trainers_bpe_trainer(_options), do: err()
+  def trainers_wordpiece_trainer(_options), do: err()
+  def trainers_wordlevel_trainer(_options), do: err()
+  def trainers_unigram_trainer(_options), do: err()
+
   def decode(_tokenizer, _ids, _skip_special_tokens), do: err()
   def decode_batch(_tokenizer, _ids, _skip_special_tokens), do: err()
   def encode(_tokenizer, _input, _add_special_tokens), do: err()
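
These stubs follow the usual Rustler pattern: each body is replaced by the native implementation when the NIF library loads, and `err/0` is presumably the module's existing helper along the lines of:

    # Raised only if the NIF library failed to load.
    defp err(), do: :erlang.nif_error(:nif_not_loaded)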

lib/tokenizers/tokenizer.ex

Lines changed: 1 addition & 1 deletion
@@ -268,7 +268,7 @@ defimpl Inspect, for: Tokenizers.Tokenizer do
     model_details =
       tokenizer
       |> Tokenizer.get_model()
-      |> Model.get_model_details()
+      |> Model.info()
       |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)
 
     attrs =
