Commit 26d864b

Add option to disable padding and truncation when loading tokenizer (#46)
Co-authored-by: José Valim <[email protected]>
1 parent 20295cf commit 26d864b

File tree

  lib/tokenizers/encoding.ex
  lib/tokenizers/tokenizer.ex
  native/ex_tokenizers/src/pre_tokenizers.rs
  native/ex_tokenizers/src/tokenizer.rs

4 files changed: +84 -46 lines

lib/tokenizers/encoding.ex

Lines changed: 1 addition & 1 deletion
@@ -179,7 +179,7 @@ defmodule Tokenizers.Encoding do
     * `direction` (default `:right`) - The padding direction.
     * `pad_id` (default `0`) - The id corresponding to the padding token.
     * `pad_type_id` (default `0`) - The type ID corresponding to the padding token.
-    * `pad_token` (default `[PDA]`) - The padding token to use.
+    * `pad_token` (default `[PAD]`) - The padding token to use.
 
   """
   @type padding_opts :: [
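
The corrected default is easiest to see when padding an encoding by hand. A minimal sketch, assuming the `Tokenizers.Encoding.pad/3` signature these docs describe (the model name, target length, and printed output are illustrative only):

    # Pad an encoding to a fixed length; pad_token now defaults to the
    # conventional "[PAD]" rather than the misspelled "[PDA]".
    {:ok, tokenizer} = Tokenizers.Tokenizer.from_pretrained("bert-base-cased")
    {:ok, encoding} = Tokenizers.Tokenizer.encode(tokenizer, "Hello world")

    encoding
    |> Tokenizers.Encoding.pad(16, pad_id: 0, pad_token: "[PAD]")
    |> Tokenizers.Encoding.get_tokens()
    #=> ["[CLS]", "Hello", "world", "[SEP]", "[PAD]", "[PAD]", ...]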

lib/tokenizers/tokenizer.ex

Lines changed: 49 additions & 21 deletions
@@ -67,16 +67,30 @@ defmodule Tokenizers.Tokenizer do
 
     * `:additional_special_tokens` - A list of special tokens to append to the tokenizer.
       Defaults to `[]`.
+
+    * `:padding` - Override for padding configuration. Currently the only supported
+      value is `:none` to disable padding. By default the configuration is restored
+      from the file.
+
+    * `:truncation` - Override for truncation configuration. Currently the only supported
+      value is `:none` to disable truncation. By default the configuration is restored
+      from the file.
+
   """
   @spec from_pretrained(String.t(), Keyword.t()) :: {:ok, t()} | {:error, term()}
   def from_pretrained(identifier, opts \\ []) do
     opts =
-      Keyword.validate!(opts,
-        revision: "main",
-        use_cache: true,
-        cache_dir: :filename.basedir(:user_cache, "tokenizers_elixir"),
-        http_client: {Tokenizers.HTTPClient, []},
-        additional_special_tokens: []
+      Keyword.validate!(
+        opts,
+        [
+          :padding,
+          :truncation,
+          revision: "main",
+          use_cache: true,
+          cache_dir: :filename.basedir(:user_cache, "tokenizers_elixir"),
+          http_client: {Tokenizers.HTTPClient, []},
+          additional_special_tokens: []
+        ]
       )
 
     {http_client, http_opts} = opts[:http_client]
@@ -100,19 +114,21 @@ defmodule Tokenizers.Tokenizer do
         Path.join(cache_dir, entry_filename(url, etag))
       end
 
+    load_opts = Keyword.take(opts, [:additional_special_tokens, :padding, :truncation])
+
     if opts[:use_cache] do
       with {:ok, response} <- request(http_client, Keyword.put(http_opts, :method, :head)) do
         etag = fetch_etag(response.headers)
         file_path = file_path_fun.(etag)
 
         if File.exists?(file_path) do
-          from_file(file_path, Keyword.take(opts, [:additional_special_tokens]))
+          from_file(file_path, load_opts)
         else
           with {:ok, response} <- request(http_client, http_opts) do
             File.mkdir_p!(cache_dir)
             File.write!(file_path, response.body)
 
-            from_file(file_path, Keyword.take(opts, [:additional_special_tokens]))
+            from_file(file_path, load_opts)
           end
         end
       end
@@ -124,7 +140,7 @@ defmodule Tokenizers.Tokenizer do
         File.mkdir_p!(cache_dir)
         File.write!(file_path, response.body)
 
-        from_file(file_path, Keyword.take(opts, [:additional_special_tokens]))
+        from_file(file_path, load_opts)
       end
     end
   end
@@ -167,28 +183,40 @@ defmodule Tokenizers.Tokenizer do
     Base.encode32(etag, case: :lower, padding: false)
   end
 
+  @typedoc """
+  Options to set on the loaded tokenizer.
+
+    * `:additional_special_tokens` - a list of special tokens to append to the tokenizer.
+      Defaults to `[]`.
+
+    * `:padding` - Override for padding configuration. Currently the only supported
+      value is `:none` to disable padding. By default the configuration is restored
+      from the file.
+
+    * `:truncation` - Override for truncation configuration. Currently the only supported
+      value is `:none` to disable truncation. By default the configuration is restored
+      from the file.
+
+  """
+  @type load_options ::
+          [
+            additional_special_tokens: [String.t() | Tokenizers.AddedToken.t()],
+            padding: :none,
+            truncation: :none
+          ]
+
   @doc """
   Instantiate a new tokenizer from the file at the given path.
-  You can specify a list of special tokens to append to the tokenizer.
   """
-  @spec from_file(
-          path :: String.t(),
-          options :: [additional_special_tokens :: [String.t() | Tokenizers.AddedToken.t()]]
-        ) ::
-          {:ok, t()} | {:error, term()}
+  @spec from_file(path :: String.t(), load_options()) :: {:ok, t()} | {:error, term()}
   defdelegate from_file(path, options \\ []),
     to: Tokenizers.Native,
     as: :tokenizer_from_file
 
   @doc """
   Instantiate a new tokenizer from the buffer.
-  You can specify a list of special tokens to append to the tokenizer.
   """
-  @spec from_buffer(
-          data :: String.t(),
-          options :: [additional_special_tokens :: [String.t() | Tokenizers.AddedToken.t()]]
-        ) ::
-          {:ok, t()} | {:error, term()}
+  @spec from_buffer(data :: String.t(), load_options()) :: {:ok, t()} | {:error, term()}
   defdelegate from_buffer(data, options \\ []),
     to: Tokenizers.Native,
     as: :tokenizer_from_buffer
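
Taken together, the new options can be passed to any of the load functions. A minimal usage sketch (the model name and file path are illustrative; per the docs above, `:padding` and `:truncation` currently accept only `:none`):

    # Load from the Hugging Face Hub, discarding any padding/truncation
    # configuration stored in the tokenizer file.
    {:ok, tokenizer} =
      Tokenizers.Tokenizer.from_pretrained("bert-base-cased",
        padding: :none,
        truncation: :none
      )

    # The same options are accepted by from_file/2 and from_buffer/2.
    {:ok, tokenizer} = Tokenizers.Tokenizer.from_file("tokenizer.json", truncation: :none)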

native/ex_tokenizers/src/pre_tokenizers.rs

Lines changed: 1 addition & 1 deletion
@@ -164,7 +164,7 @@ pub fn pre_tokenizers_byte_level_alphabet() -> Vec<u32> {
 
 #[rustler::nif]
 pub fn pre_tokenizers_whitespace() -> ExTokenizersPreTokenizer {
-    ExTokenizersPreTokenizer::new(tokenizers::pre_tokenizers::whitespace::Whitespace::default())
+    ExTokenizersPreTokenizer::new(tokenizers::pre_tokenizers::whitespace::Whitespace)
 }
 
 #[rustler::nif]

native/ex_tokenizers/src/tokenizer.rs

Lines changed: 33 additions & 23 deletions
@@ -60,35 +60,18 @@ pub fn tokenizer_init(
 #[derive(NifTaggedEnum)]
 pub enum LoadOption {
     AdditionalSpecialTokens(Vec<AddedSpecialTokenInput>),
+    // Currently only :none is supported
+    Padding(rustler::Atom),
+    Truncation(rustler::Atom),
 }
 
 #[rustler::nif(schedule = "DirtyIo")]
 pub fn tokenizer_from_file(
     path: &str,
     options: Vec<LoadOption>,
 ) -> Result<ExTokenizersTokenizer, ExTokenizersError> {
-    struct Opts {
-        additional_special_tokens: Vec<AddedSpecialTokenInput>,
-    }
-    let mut opts = Opts {
-        additional_special_tokens: vec![],
-    };
-    for opt in options {
-        match opt {
-            LoadOption::AdditionalSpecialTokens(tokens) => {
-                opts.additional_special_tokens = tokens;
-            }
-        }
-    }
-
     let mut tokenizer = TokenizerImpl::from_file(path)?;
-    tokenizer.add_special_tokens(
-        opts.additional_special_tokens
-            .iter()
-            .map(|t| t.into())
-            .collect::<Vec<_>>()
-            .as_ref(),
-    );
+    tokenizer = apply_load_options(tokenizer, options);
     Ok(tokenizer.into())
 }
 
@@ -97,28 +80,55 @@ pub fn tokenizer_from_buffer(
     data: String,
     options: Vec<LoadOption>,
 ) -> Result<ExTokenizersTokenizer, ExTokenizersError> {
+    let mut tokenizer: ExTokenizerImpl = data.parse()?;
+    tokenizer = apply_load_options(tokenizer, options);
+    Ok(tokenizer.into())
+}
+
+fn apply_load_options(mut tokenizer: ExTokenizerImpl, options: Vec<LoadOption>) -> ExTokenizerImpl {
     struct Opts {
         additional_special_tokens: Vec<AddedSpecialTokenInput>,
+        disable_padding: bool,
+        disable_truncation: bool,
     }
+
     let mut opts = Opts {
         additional_special_tokens: vec![],
+        disable_padding: false,
+        disable_truncation: false,
     };
+
     for opt in options {
         match opt {
             LoadOption::AdditionalSpecialTokens(tokens) => {
                 opts.additional_special_tokens = tokens;
             }
+            LoadOption::Padding(_) => {
+                opts.disable_padding = true;
+            }
+            LoadOption::Truncation(_) => {
+                opts.disable_truncation = true;
+            }
         }
     }
-    let mut tokenizer: ExTokenizerImpl = data.parse()?;
+
     tokenizer.add_special_tokens(
         opts.additional_special_tokens
             .iter()
             .map(|t| t.into())
             .collect::<Vec<_>>()
             .as_ref(),
     );
-    Ok(tokenizer.into())
+
+    if opts.disable_padding {
+        tokenizer.with_padding(None);
+    }
+
+    if opts.disable_truncation {
+        tokenizer.with_truncation(None);
+    }
+
+    tokenizer
 }
 
 #[derive(NifTaggedEnum)]
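
One way to sanity-check the native implementation from Elixir is to encode the same input with and without the override and compare lengths. A hedged sketch (model and input are illustrative, and the effect depends on the truncation settings stored in the tokenizer file):

    input = String.duplicate("long input ", 500)

    # Configuration restored from the file (may cap the sequence length).
    {:ok, t1} = Tokenizers.Tokenizer.from_pretrained("bert-base-cased")
    {:ok, e1} = Tokenizers.Tokenizer.encode(t1, input)

    # Truncation disabled: the full sequence is encoded.
    {:ok, t2} = Tokenizers.Tokenizer.from_pretrained("bert-base-cased", truncation: :none)
    {:ok, e2} = Tokenizers.Tokenizer.encode(t2, input)

    length(Tokenizers.Encoding.get_ids(e2)) >= length(Tokenizers.Encoding.get_ids(e1))
    #=> true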
