feat(text-embed): add support for Voyage embedding model (#648)

badmonster0 · web-flow · commit 65cbf27a18e5 · 2025-06-22T10:26:46.000-07:00
diff --git a/examples/code_embedding/main.py b/examples/code_embedding/main.py
@@ -1,7 +1,11 @@
 from dotenv import load_dotenv
 from psycopg_pool import ConnectionPool
+from pgvector.psycopg import register_vector
+from typing import Any
 import cocoindex
 import os
+from numpy.typing import NDArray
+import numpy as np
 
 
 @cocoindex.op.function()
@@ -13,10 +17,17 @@ def extract_extension(filename: str) -> str:
 @cocoindex.transform_flow()
 def code_to_embedding(
     text: cocoindex.DataSlice[str],
-) -> cocoindex.DataSlice[list[float]]:
+) -> cocoindex.DataSlice[NDArray[np.float32]]:
     """
     Embed the text using a SentenceTransformer model.
     """
+    # You can also switch to a Voyage embedding model:
+    #    return text.transform(
+    #        cocoindex.functions.EmbedText(
+    #            api_type=cocoindex.llm.LlmApiType.VOYAGE,
+    #            model="voyage-code-3",
+    #        )
+    #    )
     return text.transform(
         cocoindex.functions.SentenceTransformerEmbed(
             model="sentence-transformers/all-MiniLM-L6-v2"
@@ -71,7 +82,7 @@ def code_embedding_flow(
     )
 
 
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
     # Get the table name, for the export target in the code_embedding_flow above.
     table_name = cocoindex.utils.get_target_default_name(
         code_embedding_flow, "code_embeddings"
@@ -80,10 +91,11 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
     query_vector = code_to_embedding.eval(query)
     # Run the query and get the results.
     with pool.connection() as conn:
+        register_vector(conn)
         with conn.cursor() as cur:
             cur.execute(
                 f"""
-                SELECT filename, code, embedding <=> %s::vector AS distance
+                SELECT filename, code, embedding <=> %s AS distance
                 FROM {table_name} ORDER BY distance LIMIT %s
             """,
                 (query_vector, top_k),
@@ -94,7 +106,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
             ]
 
 
-def _main():
+def _main() -> None:
     # Make sure the flow is built and up-to-date.
     stats = code_embedding_flow.update()
     print("Updated index: ", stats)
diff --git a/python/cocoindex/flow.py b/python/cocoindex/flow.py
@@ -92,6 +92,7 @@ def _spec_kind(spec: Any) -> str:
 
 
 T = TypeVar("T")
+S = TypeVar("S")
 
 
 class _DataSliceState:
@@ -216,7 +217,7 @@ def transform(
             ),
         )
 
-    def call(self, func: Callable[[DataSlice[T]], T], *args: Any, **kwargs: Any) -> T:
+    def call(self, func: Callable[..., S], *args: Any, **kwargs: Any) -> S:
         """
         Call a function with the data slice.
         """
diff --git a/python/cocoindex/llm.py b/python/cocoindex/llm.py
@@ -11,6 +11,7 @@ class LlmApiType(Enum):
     ANTHROPIC = "Anthropic"
     LITE_LLM = "LiteLlm"
     OPEN_ROUTER = "OpenRouter"
+    VOYAGE = "Voyage"
 
 
 @dataclass
diff --git a/src/llm/gemini.rs b/src/llm/gemini.rs
@@ -8,6 +8,12 @@ use phf::phf_map;
 use serde_json::Value;
 use urlencoding::encode;
 
+static DEFAULT_EMBEDDING_DIMENSIONS: phf::Map<&str, u32> = phf_map! {
+    "gemini-embedding-exp-03-07" => 3072,
+    "text-embedding-004" => 768,
+    "embedding-001" => 768,
+};
+
 pub struct Client {
     api_key: String,
     client: reqwest::Client,
@@ -127,12 +133,6 @@ impl LlmGenerationClient for Client {
     }
 }
 
-static DEFAULT_EMBEDDING_DIMENSIONS: phf::Map<&str, u32> = phf_map! {
-    "gemini-embedding-exp-03-07" => 3072,
-    "text-embedding-004" => 768,
-    "embedding-001" => 768,
-};
-
 #[derive(Deserialize)]
 struct ContentEmbedding {
     values: Vec<f32>,
diff --git a/src/llm/mod.rs b/src/llm/mod.rs
@@ -12,6 +12,7 @@ pub enum LlmApiType {
     Anthropic,
     LiteLlm,
     OpenRouter,
+    Voyage,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -80,6 +81,7 @@ mod litellm;
 mod ollama;
 mod openai;
 mod openrouter;
+mod voyage;
 
 pub async fn new_llm_generation_client(
     api_type: LlmApiType,
@@ -103,6 +105,9 @@ pub async fn new_llm_generation_client(
         }
         LlmApiType::OpenRouter => Box::new(openrouter::Client::new_openrouter(address).await?)
             as Box<dyn LlmGenerationClient>,
+        LlmApiType::Voyage => {
+            api_bail!("Voyage is not supported for generation")
+        }
     };
     Ok(client)
 }
@@ -118,7 +123,15 @@ pub fn new_llm_embedding_client(
         LlmApiType::OpenAi => {
             Box::new(openai::Client::new(address)?) as Box<dyn LlmEmbeddingClient>
         }
-        _ => api_bail!("Embedding is not supported for API type {:?}", api_type),
+        LlmApiType::Voyage => {
+            Box::new(voyage::Client::new(address)?) as Box<dyn LlmEmbeddingClient>
+        }
+        LlmApiType::Ollama
+        | LlmApiType::OpenRouter
+        | LlmApiType::LiteLlm
+        | LlmApiType::Anthropic => {
+            api_bail!("Embedding is not supported for API type {:?}", api_type)
+        }
     };
     Ok(client)
 }
diff --git a/src/llm/openai.rs b/src/llm/openai.rs
@@ -15,6 +15,12 @@ use async_openai::{
 use async_trait::async_trait;
 use phf::phf_map;
 
+static DEFAULT_EMBEDDING_DIMENSIONS: phf::Map<&str, u32> = phf_map! {
+    "text-embedding-3-small" => 1536,
+    "text-embedding-3-large" => 3072,
+    "text-embedding-ada-002" => 1536,
+};
+
 pub struct Client {
     client: async_openai::Client<OpenAIConfig>,
 }
@@ -111,12 +117,6 @@ impl LlmGenerationClient for Client {
     }
 }
 
-static DEFAULT_EMBEDDING_DIMENSIONS: phf::Map<&str, u32> = phf_map! {
-    "text-embedding-3-small" => 1536,
-    "text-embedding-3-large" => 3072,
-    "text-embedding-ada-002" => 1536,
-};
-
 #[async_trait]
 impl LlmEmbeddingClient for Client {
     async fn embed_text<'req>(
diff --git a/src/llm/voyage.rs b/src/llm/voyage.rs
@@ -0,0 +1,138 @@
+//! Voyage AI embedding client.
+//!
+//! Posts text to the Voyage AI embeddings endpoint and returns the first
+//! embedding vector found in the response.
+
+use crate::prelude::*;
+
+use crate::llm::{LlmEmbeddingClient, LlmEmbeddingRequest, LlmEmbeddingResponse};
+use phf::phf_map;
+
+/// Default embedding dimension for each known Voyage model name.
+static DEFAULT_EMBEDDING_DIMENSIONS: phf::Map<&str, u32> = phf_map! {
+    // Current models
+    "voyage-3-large" => 1024,
+    "voyage-3.5" => 1024,
+    "voyage-3.5-lite" => 1024,
+    "voyage-code-3" => 1024,
+    "voyage-finance-2" => 1024,
+    "voyage-law-2" => 1024,
+    "voyage-code-2" => 1536,
+
+    // Legacy models
+    "voyage-3" => 1024,
+    "voyage-3-lite" => 512,
+    "voyage-multilingual-2" => 1024,
+    "voyage-large-2-instruct" => 1024,
+    "voyage-large-2" => 1536,
+    "voyage-2" => 1024,
+    "voyage-lite-02-instruct" => 1024,
+    "voyage-02" => 1024,
+    "voyage-01" => 1024,
+    "voyage-lite-01" => 1024,
+    "voyage-lite-01-instruct" => 1024,
+};
+
+/// Client for the Voyage AI embeddings API.
+pub struct Client {
+    /// Read from `VOYAGE_API_KEY`; sent as a bearer token with each request.
+    api_key: String,
+    /// HTTP client used for the embedding requests.
+    client: reqwest::Client,
+}
+
+impl Client {
+    /// Creates a client that authenticates with the `VOYAGE_API_KEY` environment variable.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `address` is set (custom endpoints are rejected) or if
+    /// `VOYAGE_API_KEY` cannot be read from the environment.
+    pub fn new(address: Option<String>) -> Result<Self> {
+        if address.is_some() {
+            api_bail!("Voyage AI doesn't support custom API address");
+        }
+        let api_key = match std::env::var("VOYAGE_API_KEY") {
+            Ok(val) => val,
+            Err(_) => api_bail!("VOYAGE_API_KEY environment variable must be set"),
+        };
+        Ok(Self {
+            api_key,
+            client: reqwest::Client::new(),
+        })
+    }
+}
+
+/// One element of the `data` array in the embeddings response.
+#[derive(Deserialize)]
+struct EmbeddingData {
+    embedding: Vec<f32>,
+}
+
+/// The part of the embeddings response body that this client reads.
+#[derive(Deserialize)]
+struct EmbedResponse {
+    data: Vec<EmbeddingData>,
+}
+
+#[async_trait]
+impl LlmEmbeddingClient for Client {
+    /// Requests an embedding for `request.text` from the Voyage AI API.
+    ///
+    /// `request.task_type`, when present, is sent as the `input_type` field.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the request cannot be sent, the API replies with a
+    /// non-success status, the body does not deserialize, or the response
+    /// contains no embedding.
+    async fn embed_text<'req>(
+        &self,
+        request: LlmEmbeddingRequest<'req>,
+    ) -> Result<LlmEmbeddingResponse> {
+        let url = "https://api.voyageai.com/v1/embeddings";
+
+        let mut payload = serde_json::json!({
+            "input": request.text,
+            "model": request.model,
+        });
+
+        if let Some(task_type) = request.task_type {
+            payload["input_type"] = serde_json::Value::String(task_type.into());
+        }
+
+        let resp = self
+            .client
+            .post(url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .json(&payload)
+            .send()
+            .await
+            .context("HTTP error")?;
+
+        if !resp.status().is_success() {
+            bail!(
+                "Voyage AI API error: {:?}\n{}\n",
+                resp.status(),
+                resp.text().await?
+            );
+        }
+
+        let embedding_resp: EmbedResponse = resp.json().await.context("Invalid JSON")?;
+
+        // Take ownership of the first embedding rather than indexing and
+        // cloning it; the parsed response is not used again after this.
+        match embedding_resp.data.into_iter().next() {
+            Some(first) => Ok(LlmEmbeddingResponse {
+                embedding: first.embedding,
+            }),
+            None => bail!("No embedding data in response"),
+        }
+    }
+
+    /// Returns the default embedding dimension for `model`, or `None` when
+    /// the model is not in `DEFAULT_EMBEDDING_DIMENSIONS`.
+    fn get_default_embedding_dimension(&self, model: &str) -> Option<u32> {
+        DEFAULT_EMBEDDING_DIMENSIONS.get(model).copied()
+    }
+}