Skip to content

Commit 1e90b98

Browse files
authored
feat: add sentence transformers support (#45)
- Add support for sentence transformers - Add `mean_pool` tensor operation
1 parent 3b4a68d commit 1e90b98

File tree

29 files changed: +578 −31 lines changed

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ licenses:
1919
@echo "Generating licenses..."
2020
@cargo bundle-licenses --format yaml --output THIRDPARTY.yml
2121

22+
.PHONY:
23+
clippy:
24+
cargo clippy --fix --all-features --allow-dirty
25+
2226
.PHONY: pre-commit
2327
pre-commit:
2428
@uv run pre-commit run --all-files

encoderfile/build.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
3232
"proto/embedding.proto",
3333
"proto/sequence_classification.proto",
3434
"proto/token_classification.proto",
35+
"proto/sentence_embedding.proto",
3536
],
3637
&[
3738
"proto/embedding",
3839
"proto/sequence_classification",
3940
"proto/token_classification",
41+
"proto/sentence_embedding",
4042
],
4143
)?;
4244

encoderfile/proto/metadata.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ enum ModelType {
1515
EMBEDDING = 1;
1616
SEQUENCE_CLASSIFICATION = 2;
1717
TOKEN_CLASSIFICATION = 3;
18+
SENTENCE_EMBEDDING = 4;
1819
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
syntax = "proto3";
2+
3+
package encoderfile.sentence_embedding;
4+
5+
import "proto/token.proto";
6+
import "proto/metadata.proto";
7+
8+
service SentenceEmbeddingInference {
9+
rpc Predict(SentenceEmbeddingRequest) returns (SentenceEmbeddingResponse);
10+
rpc GetModelMetadata(encoderfile.metadata.GetModelMetadataRequest) returns (encoderfile.metadata.GetModelMetadataResponse);
11+
}
12+
13+
message SentenceEmbeddingRequest {
14+
repeated string inputs = 1;
15+
map<string, string> metadata = 3;
16+
}
17+
18+
message SentenceEmbeddingResponse {
19+
// len(embeddings) == len(inputs)
20+
repeated SentenceEmbedding results = 1;
21+
string model_id = 2;
22+
map<string, string> metadata = 3;
23+
}
24+
25+
message SentenceEmbedding {
26+
repeated float embedding = 1;
27+
}

encoderfile/src/cli.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
use crate::{
22
common::{
3-
EmbeddingRequest, ModelType, SequenceClassificationRequest, TokenClassificationRequest,
3+
EmbeddingRequest, ModelType, SentenceEmbeddingRequest, SequenceClassificationRequest,
4+
TokenClassificationRequest,
45
},
56
runtime::AppState,
67
server::{run_grpc, run_http, run_mcp},
7-
services::{embedding, sequence_classification, token_classification},
8+
services::{embedding, sentence_embedding, sequence_classification, token_classification},
89
};
910
use anyhow::Result;
1011
use clap_derive::{Parser, Subcommand, ValueEnum};
@@ -145,6 +146,11 @@ impl Commands {
145146

146147
generate_cli_route!(request, token_classification, format, out_dir, state)
147148
}
149+
ModelType::SentenceEmbedding => {
150+
let request = SentenceEmbeddingRequest { inputs, metadata };
151+
152+
generate_cli_route!(request, sentence_embedding, format, out_dir, state)
153+
}
148154
}
149155
}
150156
Commands::Mcp { hostname, port } => {

encoderfile/src/common/mod.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1-
pub mod embedding;
2-
pub mod model_metadata;
3-
pub mod model_type;
4-
pub mod sequence_classification;
5-
pub mod token;
6-
pub mod token_classification;
1+
mod embedding;
2+
mod model_metadata;
3+
mod model_type;
4+
mod sentence_embedding;
5+
mod sequence_classification;
6+
mod token;
7+
mod token_classification;
78

89
pub use embedding::*;
910
pub use model_metadata::*;
1011
pub use model_type::*;
12+
pub use sentence_embedding::*;
1113
pub use sequence_classification::*;
1214
pub use token::*;
1315
pub use token_classification::*;

encoderfile/src/common/model_type.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ pub enum ModelType {
44
Embedding,
55
SequenceClassification,
66
TokenClassification,
7+
SentenceEmbedding,
78
}
89

910
impl From<ModelType> for crate::generated::metadata::ModelType {
@@ -12,6 +13,7 @@ impl From<ModelType> for crate::generated::metadata::ModelType {
1213
ModelType::Embedding => Self::Embedding,
1314
ModelType::SequenceClassification => Self::SequenceClassification,
1415
ModelType::TokenClassification => Self::TokenClassification,
16+
ModelType::SentenceEmbedding => Self::SentenceEmbedding,
1517
}
1618
}
1719
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
use schemars::JsonSchema;
2+
use serde::{Deserialize, Serialize};
3+
use std::collections::HashMap;
4+
use utoipa::ToSchema;
5+
6+
#[derive(Debug, Serialize, Deserialize, ToSchema, JsonSchema)]
7+
pub struct SentenceEmbeddingRequest {
8+
pub inputs: Vec<String>,
9+
#[serde(default)]
10+
pub metadata: Option<HashMap<String, String>>,
11+
}
12+
13+
impl From<crate::generated::sentence_embedding::SentenceEmbeddingRequest>
14+
for SentenceEmbeddingRequest
15+
{
16+
fn from(val: crate::generated::sentence_embedding::SentenceEmbeddingRequest) -> Self {
17+
Self {
18+
inputs: val.inputs,
19+
metadata: Some(val.metadata),
20+
}
21+
}
22+
}
23+
24+
#[derive(Debug, Serialize, ToSchema, JsonSchema, utoipa::ToResponse)]
25+
pub struct SentenceEmbeddingResponse {
26+
pub results: Vec<SentenceEmbedding>,
27+
pub model_id: String,
28+
#[serde(skip_serializing_if = "Option::is_none")]
29+
pub metadata: Option<HashMap<String, String>>,
30+
}
31+
32+
impl From<SentenceEmbeddingResponse>
33+
for crate::generated::sentence_embedding::SentenceEmbeddingResponse
34+
{
35+
fn from(val: SentenceEmbeddingResponse) -> Self {
36+
Self {
37+
results: val.results.into_iter().map(|i| i.into()).collect(),
38+
model_id: val.model_id,
39+
metadata: val.metadata.unwrap_or_default(),
40+
}
41+
}
42+
}
43+
44+
#[derive(Debug, Serialize, Deserialize, ToSchema, JsonSchema)]
45+
pub struct SentenceEmbedding {
46+
pub embedding: Vec<f32>,
47+
}
48+
49+
impl From<SentenceEmbedding> for crate::generated::sentence_embedding::SentenceEmbedding {
50+
fn from(val: SentenceEmbedding) -> Self {
51+
Self {
52+
embedding: val.embedding,
53+
}
54+
}
55+
}

encoderfile/src/generated/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ pub mod token_classification {
1010
tonic::include_proto!("encoderfile.token_classification");
1111
}
1212

13+
pub mod sentence_embedding {
14+
tonic::include_proto!("encoderfile.sentence_embedding");
15+
}
16+
1317
pub mod token {
1418
tonic::include_proto!("encoderfile.token");
1519
}

encoderfile/src/inference/embedding.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@ pub fn embedding<'a>(
2424
.expect("Model does not return tensor of shape [n_batch, n_tokens, hidden_dim]")
2525
.into_owned();
2626

27-
if let Some(transform) = state.transform() {
28-
outputs = transform.postprocess(outputs)?;
29-
}
27+
outputs = state.transform().postprocess(outputs)?;
3028

3129
let embeddings = postprocess(outputs, encodings);
3230

0 commit comments

Comments
 (0)