Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ clap = "4.5.52"
clap_derive = "4.5.49"
rand = "0.9.2"
schemars = "1.1.0"
serde_json = "1.0.145"
serde_json = { version = "1.0.145", features = ["arbitrary_precision"] }
ndarray = "0.16.1"
serde = { version = "1.0.228", features = ["serde_derive"] }
tracing = "0.1.41"
Expand Down
40 changes: 40 additions & 0 deletions encoderfile/src/build_cli/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,46 @@ impl EncoderfileConfig {
/// User-supplied tokenizer overrides (from the encoderfile build config).
///
/// Every field is `Option`: `None` means "keep whatever the tokenizer's own
/// serialized config specifies"; `Some` values are written over the loaded
/// padding/truncation params during the build step.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
pub struct TokenizerBuildConfig {
    /// Override for the padding strategy (batch-longest or fixed width).
    pub pad_strategy: Option<TokenizerPadStrategy>,
    /// Override for `truncation.direction` (left/right).
    pub truncation_side: Option<TokenizerTruncationSide>,
    /// Override for `truncation.strategy` (longest_first / only_first / only_second).
    pub truncation_strategy: Option<TokenizerTruncationStrategy>,
    /// Override for `truncation.max_length`.
    pub max_length: Option<usize>,
    /// Override for `truncation.stride`.
    pub stride: Option<usize>,
}

/// Which end of a too-long sequence gets truncated.
///
/// Serialized in snake_case (`left` / `right`); converted into the
/// `tokenizers` crate's `TruncationDirection` via the `From` impl below.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum TokenizerTruncationSide {
    /// Drop tokens from the start of the sequence.
    Left,
    /// Drop tokens from the end of the sequence.
    Right,
}

impl From<TokenizerTruncationSide> for tokenizers::TruncationDirection {
fn from(value: TokenizerTruncationSide) -> Self {
match value {
TokenizerTruncationSide::Left => tokenizers::TruncationDirection::Left,
TokenizerTruncationSide::Right => tokenizers::TruncationDirection::Right,
}
}
}

/// How truncation is distributed when encoding (possibly paired) sequences.
///
/// Serialized in snake_case; converted into the `tokenizers` crate's
/// `TruncationStrategy` via the `From` impl below.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum TokenizerTruncationStrategy {
    /// Trim whichever sequence is currently longest, token by token.
    LongestFirst,
    /// Only truncate the first sequence of a pair.
    OnlyFirst,
    /// Only truncate the second sequence of a pair.
    OnlySecond,
}

impl From<TokenizerTruncationStrategy> for tokenizers::TruncationStrategy {
fn from(value: TokenizerTruncationStrategy) -> Self {
match value {
TokenizerTruncationStrategy::LongestFirst => {
tokenizers::TruncationStrategy::LongestFirst
}
TokenizerTruncationStrategy::OnlyFirst => tokenizers::TruncationStrategy::OnlyFirst,
TokenizerTruncationStrategy::OnlySecond => tokenizers::TruncationStrategy::OnlySecond,
}
}
}

#[derive(Debug, Serialize, Deserialize, JsonSchema)]
Expand Down
117 changes: 113 additions & 4 deletions encoderfile/src/build_cli/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ use crate::{
};
use anyhow::Result;
use std::str::FromStr;
use tokenizers::{PaddingParams, PaddingStrategy, Tokenizer};
use tokenizers::{PaddingParams, PaddingStrategy, Tokenizer, TruncationParams};

use super::config::{EncoderfileConfig, TokenizerPadStrategy};
use super::config::{
EncoderfileConfig, TokenizerPadStrategy, TokenizerTruncationSide, TokenizerTruncationStrategy,
};

pub fn validate_tokenizer<'a>(config: &'a EncoderfileConfig) -> Result<PlannedAsset<'a>> {
let tokenizer =
Expand Down Expand Up @@ -73,19 +75,36 @@ impl EncoderfileConfig {
}
};

// Apply any overrides from the `tokenizer` section of encoderfile.yml below.
let tokenizer_build_config = match &self.tokenizer {
Some(t) => t,
None => return Ok(config),
};

// padding
if let Some(s) = &tokenizer_build_config.pad_strategy {
config.padding.strategy = match s {
TokenizerPadStrategy::BatchLongest => PaddingStrategy::BatchLongest,
TokenizerPadStrategy::Fixed { fixed } => PaddingStrategy::Fixed(*fixed),
}
};

// truncation
if let Some(s) = &tokenizer_build_config.truncation_side {
config.truncation.direction = s.clone().into();
}

if let Some(s) = &tokenizer_build_config.truncation_strategy {
config.truncation.strategy = s.clone().into();
}

if let Some(max_len) = &tokenizer_build_config.max_length {
config.truncation.max_length = *max_len;
}

if let Some(stride) = &tokenizer_build_config.stride {
config.truncation.stride = *stride;
}

Ok(config)
}
}
Expand All @@ -105,7 +124,24 @@ fn from_tokenizer(tokenizer: &Tokenizer) -> Result<TokenizerConfig> {
}
};

Ok(TokenizerConfig { padding })
// Prefer the truncation params declared in `tokenizer.json`; fall back to
// the crate defaults (with a warning, since silent defaults can surprise).
let truncation = match tokenizer.get_truncation() {
    Some(p) => p.clone(),
    None => {
        let truncation_params = TruncationParams::default();

        // Fixed: this message previously said "No padding params found" —
        // a copy-paste from the padding branch; this branch is truncation.
        eprintln!(
            "WARNING: No truncation params found in `tokenizer.json`. Using defaults: {:?}",
            &truncation_params,
        );

        truncation_params
    }
};

Ok(TokenizerConfig {
padding,
truncation,
})
}

fn tokenizer_config_from_json_value(
Expand Down Expand Up @@ -178,6 +214,55 @@ fn tokenizer_config_from_json_value(
|config| config.padding.pad_type_id,
)?;

builder.field(
"truncation_strategy",
|config, v| {
let strategy: TokenizerTruncationStrategy = serde_json::from_value(v.clone())?;

config.truncation.strategy = strategy.into();

Ok(())
},
|config| config.truncation.strategy,
)?;

builder.field(
"truncation_side",
|config, v| {
let side: TokenizerTruncationSide = serde_json::from_value(v.clone())?;

config.truncation.direction = side.into();

Ok(())
},
|config| config.truncation.direction,
)?;

builder.any_field(
&["model_max_length", "max_length"],
|config, v| {
config.truncation.max_length = v
.as_number()
.ok_or(anyhow::anyhow!("model_max_length must be an int"))?
.as_u128()
.ok_or(anyhow::anyhow!("Failed to cast number to u128"))?
as usize;

Ok(())
},
|config| config.truncation.max_length,
)?;

builder.field(
"stride",
|config, v| {
config.truncation.stride = serde_json::from_value(v.clone())?;

Ok(())
},
|config| config.truncation.stride,
)?;

// now we fetch pad_token_id manually because it doesn't get serialized into tokenizer_config.json!
builder.set_pad_token_id(tokenizer)?;

Expand Down Expand Up @@ -212,6 +297,26 @@ impl<'a> TokenizerConfigBuilder<'a> {
Ok(())
}

/// Applies `field` to the first key in `fields` that is present in the
/// builder's JSON value; errors if none of the candidate keys exist.
///
/// Used for settings that appear under alternative names in
/// `tokenizer_config.json` (e.g. `model_max_length` vs `max_length`).
fn any_field<P, D, V>(
    &mut self,
    fields: &[&str],
    process_value_fn: P,
    default_value_fn: D,
) -> Result<()>
where
    P: FnOnce(&mut TokenizerConfig, &serde_json::Value) -> Result<()>,
    D: FnOnce(&TokenizerConfig) -> V,
    V: std::fmt::Debug,
{
    // Pick the first candidate key actually present in the JSON document.
    let present = fields.iter().find(|name| self.val.get(**name).is_some());

    match present {
        Some(name) => self.field(name, process_value_fn, default_value_fn),
        None => anyhow::bail!("One of these fields is required: {:?}", fields),
    }
}

fn field<P, D, V>(
&mut self,
field: &str,
Expand Down Expand Up @@ -303,6 +408,10 @@ mod tests {
transform: None,
tokenizer: Some(TokenizerBuildConfig {
pad_strategy: Some(TokenizerPadStrategy::Fixed { fixed: 512 }),
truncation_side: None,
truncation_strategy: None,
max_length: None,
stride: None,
}),
validate_transform: false,
base_binary_path: None,
Expand Down
3 changes: 2 additions & 1 deletion encoderfile/src/common/config.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::model_type::ModelType;
use serde::{Deserialize, Serialize};
use tokenizers::PaddingParams;
use tokenizers::{PaddingParams, TruncationParams};

#[derive(Debug, Serialize, Deserialize)]
pub struct Config {
Expand All @@ -13,4 +13,5 @@ pub struct Config {
/// Tokenizer settings embedded alongside the model and re-applied at runtime.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TokenizerConfig {
    // Applied via `Tokenizer::with_padding` when the tokenizer service starts.
    pub padding: PaddingParams,
    // Applied via `Tokenizer::with_truncation` when the tokenizer service starts.
    pub truncation: TruncationParams,
}
4 changes: 3 additions & 1 deletion encoderfile/src/runtime/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ impl TokenizerService {

pub fn init(mut self) -> Result<Self> {
self.tokenizer
.with_padding(Some(self.config.padding.clone()));
.with_padding(Some(self.config.padding.clone()))
.with_truncation(Some(self.config.truncation.clone()))
.map_err(|e| anyhow::anyhow!("Failed to apply truncation settings: {:?}", e))?;

Ok(self)
}
Expand Down