Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/together/cli/api/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,18 @@ def fine_tuning(ctx: click.Context) -> None:
default="all-linear",
help="Trainable modules for LoRA adapters. For example, 'all-linear', 'q_proj,v_proj'",
)
@click.option(
"--training-method",
type=click.Choice(["sft", "dpo"]),
default="sft",
help="Training method to use. Options: sft (supervised fine-tuning), dpo (Direct Preference Optimization)",
)
@click.option(
"--dpo-beta",
type=float,
default=0.1,
help="Beta parameter for DPO training (only used when '--training-method' is 'dpo')",
)
@click.option(
"--suffix", type=str, default=None, help="Suffix for the fine-tuned model name"
)
Expand Down Expand Up @@ -152,6 +164,8 @@ def create(
wandb_name: str,
confirm: bool,
train_on_inputs: bool | Literal["auto"],
training_method: str,
dpo_beta: float,
) -> None:
"""Start fine-tuning"""
client: Together = ctx.obj
Expand Down Expand Up @@ -180,6 +194,8 @@ def create(
wandb_project_name=wandb_project_name,
wandb_name=wandb_name,
train_on_inputs=train_on_inputs,
training_method=training_method,
dpo_beta=dpo_beta,
)

model_limits: FinetuneTrainingLimits = client.fine_tuning.get_model_limits(
Expand Down
6 changes: 6 additions & 0 deletions src/together/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,18 @@ class DatasetFormat(enum.Enum):
GENERAL = "general"
CONVERSATION = "conversation"
INSTRUCTION = "instruction"
PREFERENCE_OPENAI = "preference_openai"


# Maps each dataset format to the JSONL column names that must be present
# in every row of an uploaded training file.
JSONL_REQUIRED_COLUMNS_MAP = {
    DatasetFormat.GENERAL: ["text"],
    DatasetFormat.CONVERSATION: ["messages"],
    DatasetFormat.INSTRUCTION: ["prompt", "completion"],
    # OpenAI-style preference data: a shared input plus one preferred and one
    # non-preferred output. NOTE(review): presumably consumed by the new DPO
    # training method added in this change — confirm against the validator.
    DatasetFormat.PREFERENCE_OPENAI: [
        "input",
        "preferred_output",
        "non_preferred_output",
    ],
}
# Keys required in every message object of a conversational ("messages") dataset.
REQUIRED_COLUMNS_MESSAGE = ["role", "content"]
# Role values accepted in conversational datasets.
POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"]
35 changes: 33 additions & 2 deletions src/together/resources/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
TrainingType,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
TrainingMethodDPO,
TrainingMethodSFT,
)
from together.types.finetune import DownloadCheckpointType
from together.utils import log_warn_once, normalize_key
Expand Down Expand Up @@ -52,7 +54,10 @@ def createFinetuneRequest(
wandb_project_name: str | None = None,
wandb_name: str | None = None,
train_on_inputs: bool | Literal["auto"] = "auto",
training_method: str = "sft",
dpo_beta: float | None = None,
) -> FinetuneRequest:

if batch_size == "max":
log_warn_once(
"Starting from together>=1.3.0, "
Expand Down Expand Up @@ -100,11 +105,24 @@ def createFinetuneRequest(
if weight_decay is not None and (weight_decay < 0):
raise ValueError("Weight decay should be non-negative")

AVAILABLE_TRAINING_METHODS = {
TrainingMethodSFT().method,
TrainingMethodDPO().method,
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is a constant, can you move it to the top of the file (outside of the function and the class definition)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

if training_method not in AVAILABLE_TRAINING_METHODS:
raise ValueError(
f"training_method must be one of {', '.join(AVAILABLE_TRAINING_METHODS)}"
)

lrScheduler = FinetuneLRScheduler(
lr_scheduler_type="linear",
lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
)

training_method_cls: TrainingMethodSFT | TrainingMethodDPO = TrainingMethodSFT()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: maybe annotate the type as training_method_cls: TrainingMethod? It's a bit clearer and more extensible

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There were some issues with pre-commit checks when I tried to do this, as I remember

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Weird, do you remember what was the error by any chance? Not blocking, but I'd love to know how to fix it in the future

if training_method == "dpo":
training_method_cls = TrainingMethodDPO(dpo_beta=dpo_beta)

finetune_request = FinetuneRequest(
model=model,
training_file=training_file,
Expand All @@ -125,6 +143,7 @@ def createFinetuneRequest(
wandb_project_name=wandb_project_name,
wandb_name=wandb_name,
train_on_inputs=train_on_inputs,
training_method=training_method_cls,
)

return finetune_request
Expand Down Expand Up @@ -162,6 +181,8 @@ def create(
verbose: bool = False,
model_limits: FinetuneTrainingLimits | None = None,
train_on_inputs: bool | Literal["auto"] = "auto",
training_method: str = "sft",
dpo_beta: float | None = None,
) -> FinetuneResponse:
"""
Method to initiate a fine-tuning job
Expand Down Expand Up @@ -207,6 +228,9 @@ def create(
For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
(Instruction format), inputs will be masked.
Defaults to "auto".
training_method (str, optional): Training method. Defaults to "sft".
Supported methods: "sft", "dpo".
dpo_beta (float, optional): DPO beta parameter. Defaults to None.

Returns:
FinetuneResponse: Object containing information about fine-tuning job.
Expand All @@ -218,7 +242,6 @@ def create(

if model_limits is None:
model_limits = self.get_model_limits(model=model)

finetune_request = createFinetuneRequest(
model_limits=model_limits,
training_file=training_file,
Expand All @@ -244,6 +267,8 @@ def create(
wandb_project_name=wandb_project_name,
wandb_name=wandb_name,
train_on_inputs=train_on_inputs,
training_method=training_method,
dpo_beta=dpo_beta,
)

if verbose:
Expand All @@ -261,7 +286,6 @@ def create(
),
stream=False,
)

assert isinstance(response, TogetherResponse)

return FinetuneResponse(**response.data)
Expand Down Expand Up @@ -503,6 +527,8 @@ async def create(
verbose: bool = False,
model_limits: FinetuneTrainingLimits | None = None,
train_on_inputs: bool | Literal["auto"] = "auto",
training_method: str = "sft",
dpo_beta: float | None = None,
) -> FinetuneResponse:
"""
Async method to initiate a fine-tuning job
Expand Down Expand Up @@ -548,6 +574,9 @@ async def create(
For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
(Instruction format), inputs will be masked.
Defaults to "auto".
training_method (str, optional): Training method. Defaults to "sft".
Supported methods: "sft", "dpo".
dpo_beta (float, optional): DPO beta parameter. Defaults to None.

Returns:
FinetuneResponse: Object containing information about fine-tuning job.
Expand Down Expand Up @@ -585,6 +614,8 @@ async def create(
wandb_project_name=wandb_project_name,
wandb_name=wandb_name,
train_on_inputs=train_on_inputs,
training_method=training_method,
dpo_beta=dpo_beta,
)

if verbose:
Expand Down
4 changes: 4 additions & 0 deletions src/together/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
FileType,
)
from together.types.finetune import (
TrainingMethodDPO,
TrainingMethodSFT,
FinetuneDownloadResult,
FinetuneLinearLRSchedulerArgs,
FinetuneList,
Expand Down Expand Up @@ -79,6 +81,8 @@
"TrainingType",
"FullTrainingType",
"LoRATrainingType",
"TrainingMethodDPO",
"TrainingMethodSFT",
"RerankRequest",
"RerankResponse",
"FinetuneTrainingLimits",
Expand Down
29 changes: 29 additions & 0 deletions src/together/types/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,31 @@ class LoRATrainingType(TrainingType):
type: str = "Lora"


class TrainingMethod(BaseModel):
    """
    Base type for a fine-tuning training method.

    Subclasses pin ``method`` to a literal identifier (e.g. ``"sft"``,
    ``"dpo"``) carried in the fine-tune request payload.
    """

    # Wire identifier of the training method; narrowed to a Literal in subclasses.
    method: str


class TrainingMethodSFT(TrainingMethod):
    """
    Training method for supervised fine-tuning (SFT).
    """

    # Fixed discriminator value identifying this method.
    method: Literal["sft"] = "sft"


class TrainingMethodDPO(TrainingMethod):
    """
    Training method for Direct Preference Optimization (DPO).
    """

    # Fixed discriminator value identifying this method.
    method: Literal["dpo"] = "dpo"
    # Beta coefficient of the DPO loss. NOTE(review): None appears to defer
    # to a server-side default — confirm with the API documentation.
    dpo_beta: float | None = None


class FinetuneRequest(BaseModel):
"""
Fine-tune request type
Expand Down Expand Up @@ -178,6 +203,10 @@ class FinetuneRequest(BaseModel):
training_type: FullTrainingType | LoRATrainingType | None = None
# train on inputs
train_on_inputs: StrictBool | Literal["auto"] = "auto"
# training method
training_method: TrainingMethodSFT | TrainingMethodDPO = Field(
default_factory=TrainingMethodSFT
)


class FinetuneResponse(BaseModel):
Expand Down
Loading
Loading