Add Together fine-tuning config to Python client (tensorzero#3105)

anndvision · web-flow · commit d23a66629363 · 2025-08-20T01:43:51.000Z
* update rust

* update python client

* update error string

* fix naming error

* throw error for credentials if json parsing fails

* add together mock inference provider

* update github workflows config
diff --git a/.github/workflows/general.yml b/.github/workflows/general.yml
@@ -533,6 +533,7 @@ jobs:
       OPENAI_API_KEY: not_used
       FIREWORKS_API_KEY: not_used
       FIREWORKS_ACCOUNT_ID: not_used
+      TOGETHER_API_KEY: not_used
       TENSORZERO_USE_MOCK_INFERENCE_PROVIDER: 1
       TENSORZERO_SKIP_LARGE_FIXTURES: 1
       R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
diff --git a/clients/python/src/lib.rs b/clients/python/src/lib.rs
@@ -40,8 +40,8 @@ use tensorzero_core::{
     optimization::{
         fireworks_sft::UninitializedFireworksSFTConfig,
         gcp_vertex_gemini_sft::UninitializedGCPVertexGeminiSFTConfig,
-        openai_sft::UninitializedOpenAISFTConfig, OptimizationJobInfoPyClass,
-        OptimizationJobStatus, UninitializedOptimizerInfo,
+        openai_sft::UninitializedOpenAISFTConfig, together_sft::UninitializedTogetherSFTConfig,
+        OptimizationJobInfoPyClass, OptimizationJobStatus, UninitializedOptimizerInfo,
     },
     variant::{
         BestOfNSamplingConfigPyClass, ChainOfThoughtConfigPyClass, ChatCompletionConfigPyClass,
@@ -96,6 +96,7 @@ fn tensorzero(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<UninitializedOpenAISFTConfig>()?;
     m.add_class::<UninitializedFireworksSFTConfig>()?;
     m.add_class::<UninitializedGCPVertexGeminiSFTConfig>()?;
+    m.add_class::<UninitializedTogetherSFTConfig>()?;
     m.add_class::<Datapoint>()?;
     m.add_class::<ResolvedInput>()?;
     m.add_class::<ResolvedInputMessage>()?;
diff --git a/clients/python/tensorzero/__init__.py b/clients/python/tensorzero/__init__.py
@@ -25,6 +25,7 @@
     ResolvedInput,
     ResolvedInputMessage,
     StoredInference,
+    TogetherSFTConfig,
     VariantsConfig,
 )
 from .tensorzero import (
@@ -89,7 +90,7 @@
 ChatDatapoint = Datapoint.Chat
 JsonDatapoint = Datapoint.Json
 
-OptimizationConfig = t.Union[OpenAISFTConfig, FireworksSFTConfig]
+OptimizationConfig = t.Union[OpenAISFTConfig, FireworksSFTConfig, TogetherSFTConfig]
 ChatInferenceOutput = t.List[ContentBlock]
 
 
@@ -166,6 +167,7 @@
     "Thought",
     "ThoughtChunk",
     "TimeFilter",
+    "TogetherSFTConfig",
     "Tool",
     "ToolChoice",
     "ToolParams",
diff --git a/clients/python/tensorzero/tensorzero.pyi b/clients/python/tensorzero/tensorzero.pyi
@@ -202,6 +202,16 @@ class GCPVertexGeminiSFTConfig:
         bucket_path_prefix: Optional[str] = None,
     ) -> None: ...
 
+@final
+class TogetherSFTConfig:  # Keyword-only configuration for a Together supervised fine-tuning job.
+    def __init__(
+        self,
+        *,
+        model: str,  # required: the model to fine-tune
+        credentials: Optional[str] = None,  # parsed as JSON by the Rust constructor; invalid JSON raises ValueError
+        api_base: Optional[str] = None,  # base URL override (primarily for testing); must parse as a URL
+    ) -> None: ...
+
 @final
 class Datapoint:
     Chat: Type["Datapoint"]
@@ -1023,6 +1033,7 @@ __all__ = [
     "OptimizationJobInfo",
     "OptimizationJobStatus",
     "RenderedSample",
+    "TogetherSFTConfig",
     "StoredInference",
     "ResolvedInput",
     "ResolvedInputMessage",
diff --git a/clients/python/tests/test_optimization.py b/clients/python/tests/test_optimization.py
@@ -9,6 +9,7 @@
     OptimizationJobStatus,
     RenderedSample,
     TensorZeroGateway,
+    TogetherSFTConfig,
 )
 
 
@@ -57,6 +58,28 @@ def test_sync_fireworks_sft(
         sleep(1)
 
 
+def test_sync_together_sft(
+    embedded_sync_client: TensorZeroGateway,
+    mixed_rendered_samples: List[RenderedSample],
+):  # Launches a Together SFT job through the sync client, then polls it until Completed.
+    optimization_config = TogetherSFTConfig(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",
+        api_base="http://localhost:3030/together/",  # presumably the mock provider's /together routes; confirm
+    )
+    optimization_job_handle = embedded_sync_client.experimental_launch_optimization(
+        train_samples=mixed_rendered_samples,
+        val_samples=None,
+        optimization_config=optimization_config,
+    )
+    while True:  # NOTE(review): no timeout or failure branch; only a Completed status ends this loop
+        job_info = embedded_sync_client.experimental_poll_optimization(
+            job_handle=optimization_job_handle
+        )
+        if job_info.status == OptimizationJobStatus.Completed:
+            break
+        sleep(1)
+
+
 @pytest.mark.asyncio
 async def test_async_openai_sft(
     embedded_async_client: AsyncTensorZeroGateway,
@@ -105,3 +128,28 @@ async def test_async_fireworks_sft(
         if job_info.status == OptimizationJobStatus.Completed:
             break
         sleep(1)
+
+
+@pytest.mark.asyncio
+async def test_async_together_sft(
+    embedded_async_client: AsyncTensorZeroGateway,
+    mixed_rendered_samples: List[RenderedSample],
+):  # Launches a Together SFT job through the async client, then polls it until Completed.
+    optimization_config = TogetherSFTConfig(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",
+        api_base="http://localhost:3030/together/",  # presumably the mock provider's /together routes; confirm
+    )
+    optimization_job_handle = (
+        await embedded_async_client.experimental_launch_optimization(
+            train_samples=mixed_rendered_samples,
+            val_samples=None,
+            optimization_config=optimization_config,
+        )
+    )
+    while True:  # NOTE(review): no timeout or failure branch; only a Completed status ends this loop
+        job_info = await embedded_async_client.experimental_poll_optimization(
+            job_handle=optimization_job_handle
+        )
+        if job_info.status == OptimizationJobStatus.Completed:
+            break
+        sleep(1)  # NOTE(review): not awaited, so this blocks inside the coroutine; consider asyncio.sleep
diff --git a/tensorzero-core/src/inference/types/pyo3_helpers.rs b/tensorzero-core/src/inference/types/pyo3_helpers.rs
@@ -12,6 +12,7 @@ use crate::endpoints::datasets::Datapoint;
 use crate::inference::types::{ContentBlockChatOutput, ResolvedInput, ResolvedInputMessageContent};
 use crate::optimization::fireworks_sft::UninitializedFireworksSFTConfig;
 use crate::optimization::openai_sft::UninitializedOpenAISFTConfig;
+use crate::optimization::together_sft::UninitializedTogetherSFTConfig;
 use crate::optimization::UninitializedOptimizerConfig;
 use crate::stored_inference::{
     RenderedSample, SimpleStoredSampleInfo, StoredInference, StoredSample,
@@ -323,9 +324,11 @@ pub fn deserialize_optimization_config(
         Ok(UninitializedOptimizerConfig::OpenAISFT(obj.extract()?))
     } else if obj.is_instance_of::<UninitializedFireworksSFTConfig>() {
         Ok(UninitializedOptimizerConfig::FireworksSFT(obj.extract()?))
+    } else if obj.is_instance_of::<UninitializedTogetherSFTConfig>() {
+        Ok(UninitializedOptimizerConfig::TogetherSFT(obj.extract()?))
     } else {
         Err(PyValueError::new_err(
-            "Invalid optimization config. Expected OpenAISFTConfig or FireworksSFTConfig",
+            "Invalid optimization config. Expected OpenAISFTConfig, FireworksSFTConfig, or TogetherSFTConfig",
         ))
     }
 }
diff --git a/tensorzero-core/src/optimization/fireworks_sft/mod.rs b/tensorzero-core/src/optimization/fireworks_sft/mod.rs
@@ -212,8 +212,11 @@ impl UninitializedFireworksSFTConfig {
         account_id: String,
         api_base: Option<String>,
     ) -> PyResult<Self> {
-        let credentials =
-            credentials.map(|s| serde_json::from_str(&s).unwrap_or(CredentialLocation::Env(s)));
+        let credentials = credentials
+            .map(|s| serde_json::from_str(&s))
+            .transpose()
+            .map_err(|e| PyErr::new::<PyValueError, _>(format!("Invalid credentials JSON: {e}")))?
+            .or_else(|| Some(default_api_key_location()));
         let api_base = api_base
             .map(|s| {
                 Url::parse(&s)
diff --git a/tensorzero-core/src/optimization/gcp_vertex_gemini_sft/mod.rs b/tensorzero-core/src/optimization/gcp_vertex_gemini_sft/mod.rs
@@ -24,9 +24,6 @@ use crate::{
     stored_inference::RenderedSample,
 };
 
-#[cfg(feature = "pyo3")]
-use crate::inference::types::pyo3_helpers::tensorzero_core_error;
-
 pub fn gcp_vertex_gemini_base_url(project_id: &str, region: &str) -> Result<Url, url::ParseError> {
     let subdomain_prefix = location_subdomain_prefix(region);
     Url::parse(&format!(
@@ -95,7 +92,7 @@ impl UninitializedGCPVertexGeminiSFTConfig {
     #[new]
     #[pyo3(signature = (*, model, bucket_name, project_id, region, learning_rate_multiplier=None, adapter_size=None, n_epochs=None, export_last_checkpoint_only=None, credentials=None, api_base=None, seed=None, service_account=None, kms_key_name=None, tuned_model_display_name=None, bucket_path_prefix=None))]
     pub fn new(
-        py: Python<'_>,
+        _py: Python<'_>,
         model: String,
         bucket_name: String,
         project_id: String,
@@ -113,13 +110,11 @@ impl UninitializedGCPVertexGeminiSFTConfig {
         bucket_path_prefix: Option<String>,
     ) -> PyResult<Self> {
         // Use Deserialize to convert the string to a CredentialLocation
-        let credentials = match credentials {
-            Some(s) => match serde_json::from_str(&s) {
-                Ok(parsed) => Some(parsed),
-                Err(e) => return Err(tensorzero_core_error(py, &e.to_string())?),
-            },
-            None => None,
-        };
+        let credentials = credentials
+            .map(|s| serde_json::from_str(&s))
+            .transpose()
+            .map_err(|e| PyErr::new::<PyValueError, _>(format!("Invalid credentials JSON: {e}")))?
+            .or_else(|| Some(default_api_key_location()));
         let api_base = api_base
             .map(|s| {
                 Url::parse(&s)
diff --git a/tensorzero-core/src/optimization/openai_sft/mod.rs b/tensorzero-core/src/optimization/openai_sft/mod.rs
@@ -90,8 +90,11 @@ impl UninitializedOpenAISFTConfig {
         suffix: Option<String>,
     ) -> PyResult<Self> {
         // Use Deserialize to convert the string to a CredentialLocation
-        let credentials =
-            credentials.map(|s| serde_json::from_str(&s).unwrap_or(CredentialLocation::Env(s)));
+        let credentials = credentials
+            .map(|s| serde_json::from_str(&s))
+            .transpose()
+            .map_err(|e| PyErr::new::<PyValueError, _>(format!("Invalid credentials JSON: {e}")))?
+            .or_else(|| Some(default_api_key_location()));
         let api_base = api_base
             .map(|s| {
                 Url::parse(&s)
diff --git a/tensorzero-core/src/optimization/together_sft/mod.rs b/tensorzero-core/src/optimization/together_sft/mod.rs
@@ -1,4 +1,6 @@
 #[cfg(feature = "pyo3")]
+use pyo3::exceptions::PyValueError;
+#[cfg(feature = "pyo3")]
 use pyo3::prelude::*;
 use std::borrow::Cow;
 use std::collections::HashMap;
@@ -66,13 +68,21 @@ impl std::fmt::Display for TogetherSFTJobHandle {
 #[cfg_attr(test, derive(ts_rs::TS))]
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 #[cfg_attr(test, ts(export))]
+#[cfg_attr(feature = "pyo3", pyclass(str, name = "TogetherSFTConfig"))]
 pub struct UninitializedTogetherSFTConfig {
     pub model: String,
     #[cfg_attr(test, ts(type = "string | null"))]
     pub credentials: Option<CredentialLocation>,
     pub api_base: Option<Url>,
 }
 
+impl std::fmt::Display for UninitializedTogetherSFTConfig {
+    /// Writes the config as pretty-printed JSON; a serialization failure maps to `fmt::Error`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(&serde_json::to_string_pretty(self).map_err(|_| std::fmt::Error)?)
+    }
+}
+
 #[derive(Debug, Serialize)]
 pub struct TogetherSupervisedRow<'a> {
     messages: Vec<OpenAIRequestMessage<'a>>,
@@ -120,6 +130,57 @@ impl<'a> TryFrom<&'a RenderedSample> for TogetherSupervisedRow<'a> {
     }
 }
 
+#[cfg(feature = "pyo3")]
+#[pymethods]
+impl UninitializedTogetherSFTConfig {
+    // Python-facing constructor: keyword-only arguments, validated before the struct is built.
+    /// NOTE: This signature currently does not work:
+    /// print(TogetherSFTConfig.__init__.__text_signature__)
+    /// prints out signature:
+    /// ($self, /, *args, **kwargs)
+    #[new]
+    #[pyo3(signature = (*, model, credentials=None, api_base=None))]
+    pub fn new(
+        model: String,
+        credentials: Option<String>,
+        api_base: Option<String>,
+    ) -> PyResult<Self> {
+        // JSON-decode into a CredentialLocation: parse error -> ValueError, None -> default location
+        let credentials = credentials
+            .map(|s| serde_json::from_str(&s))
+            .transpose()
+            .map_err(|e| PyErr::new::<PyValueError, _>(format!("Invalid credentials JSON: {e}")))?
+            .or_else(|| Some(default_api_key_location()));
+        let api_base = api_base
+            .map(|s| {
+                Url::parse(&s)
+                    .map_err(|e| PyErr::new::<PyValueError, std::string::String>(e.to_string()))
+            })
+            .transpose()?;
+        Ok(Self {
+            model,
+            credentials,
+            api_base,
+        })
+    }
+
+    /// Initialize the TogetherSFTConfig. All parameters are optional except for `model`.
+    ///
+    /// :param model: The model to use for the fine-tuning job.
+    /// :param credentials: The credentials to use for the fine-tuning job. This string is parsed as JSON (invalid JSON raises ValueError), so it presumably needs to be JSON-encoded, e.g. '"env::TOGETHER_API_KEY"'. If omitted, the default API key location is used. See docs for more details.
+    /// :param api_base: The base URL to use for the fine-tuning job. This is primarily used for testing.
+    #[expect(unused_variables)]
+    #[pyo3(signature = (*, model, credentials=None, api_base=None))]
+    fn __init__(
+        this: Py<Self>,
+        model: String,
+        credentials: Option<String>,
+        api_base: Option<String>,
+    ) -> Py<Self> {
+        this
+    }
+}
+
 impl UninitializedTogetherSFTConfig {
     pub fn load(self) -> Result<TogetherSFTConfig, Error> {
         Ok(TogetherSFTConfig {
diff --git a/tensorzero-core/tests/mock-inference-provider/src/main.rs b/tensorzero-core/tests/mock-inference-provider/src/main.rs
@@ -3,6 +3,7 @@
 
 mod error;
 mod fireworks;
+mod together;
 
 use async_stream::try_stream;
 use axum::http::StatusCode;
@@ -152,6 +153,18 @@ fn make_router() -> axum::Router {
             "/fireworks/v1/accounts/{account_id}/deployedModels",
             axum::routing::post(fireworks::create_deployed_model),
         )
+        .route(
+            "/together/files/upload",
+            axum::routing::post(together::upload_file),
+        )
+        .route(
+            "/together/fine-tunes",
+            axum::routing::post(together::create_fine_tuning_job),
+        )
+        .route(
+            "/together/fine-tunes/{job_id}",
+            axum::routing::get(together::get_fine_tuning_job),
+        )
         .route("/status", axum::routing::get(status_handler))
         .layer(TraceLayer::new_for_http())
 }
diff --git a/tensorzero-core/tests/mock-inference-provider/src/together.rs b/tensorzero-core/tests/mock-inference-provider/src/together.rs
diff --git a/tensorzero-core/tests/optimization/common/together_sft.rs b/tensorzero-core/tests/optimization/common/together_sft.rs
diff --git a/tensorzero-core/tests/optimization/mock_tests.rs b/tensorzero-core/tests/optimization/mock_tests.rs