diff --git a/pyproject.toml b/pyproject.toml
index c8afe9b2..0f22984d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.5.13"
+version = "1.5.14"
 authors = ["Together AI <support@together.ai>"]
 description = "Python client for Together's Cloud Platform!"
 readme = "README.md"
diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index 9aa581a8..f95235cf 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -142,6 +142,30 @@ def fine_tuning(ctx: click.Context) -> None:
     default=0.1,
     help="Beta parameter for DPO training (only used when '--training-method' is 'dpo')",
 )
+@click.option(
+    "--dpo-normalize-logratios-by-length",
+    type=bool,
+    default=False,
+    help=(
+        "Whether to normalize logratios by sample length "
+        "(only used when '--training-method' is 'dpo')"
+    ),
+)
+@click.option(
+    "--rpo-alpha",
+    type=float,
+    default=0.0,
+    help=(
+        "RPO alpha parameter of DPO training to include NLL in the loss "
+        "(only used when '--training-method' is 'dpo')"
+    ),
+)
+@click.option(
+    "--simpo-gamma",
+    type=float,
+    default=0.1,
+    help="SimPO gamma parameter (only used when '--training-method' is 'dpo')",
+)
 @click.option(
     "--suffix",
     "-s",
@@ -206,6 +230,9 @@ def create(
     train_on_inputs: bool | Literal["auto"],
     training_method: str,
     dpo_beta: float,
+    dpo_normalize_logratios_by_length: bool,
+    rpo_alpha: float,
+    simpo_gamma: float,
     from_checkpoint: str,
 ) -> None:
     """Start fine-tuning"""
@@ -239,6 +266,9 @@ def create(
         train_on_inputs=train_on_inputs,
         training_method=training_method,
         dpo_beta=dpo_beta,
+        dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+        rpo_alpha=rpo_alpha,
+        simpo_gamma=simpo_gamma,
         from_checkpoint=from_checkpoint,
     )
 
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 1c55412a..27baf2d2 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -72,6 +72,9 @@ def create_finetune_request(
     train_on_inputs: bool | Literal["auto"] | None = None,
     training_method: str = "sft",
     dpo_beta: float | None = None,
+    dpo_normalize_logratios_by_length: bool = False,
+    rpo_alpha: float | None = None,
+    simpo_gamma: float | None = None,
     from_checkpoint: str | None = None,
 ) -> FinetuneRequest:
     if model is not None and from_checkpoint is not None:
@@ -182,6 +185,21 @@ def create_finetune_request(
     if dpo_beta is not None and training_method != "dpo":
         raise ValueError("dpo_beta is only supported for DPO training")
+    if dpo_normalize_logratios_by_length and training_method != "dpo":
+        raise ValueError(
+            "dpo_normalize_logratios_by_length=True is only supported for DPO training"
+        )
+
+    if rpo_alpha is not None:
+        if training_method != "dpo":
+            raise ValueError("rpo_alpha is only supported for DPO training")
+        if not rpo_alpha >= 0.0:
+            raise ValueError(f"rpo_alpha should be non-negative (got {rpo_alpha})")
+
+    if simpo_gamma is not None:
+        if training_method != "dpo":
+            raise ValueError("simpo_gamma is only supported for DPO training")
+        if not simpo_gamma >= 0.0:
+            raise ValueError(f"simpo_gamma should be non-negative (got {simpo_gamma})")
 
     lr_scheduler: FinetuneLRScheduler
     if lr_scheduler_type == "cosine":
@@ -204,7 +222,24 @@ def create_finetune_request(
     if training_method == "sft":
         training_method_cls = TrainingMethodSFT(train_on_inputs=train_on_inputs)
     elif training_method == "dpo":
-        training_method_cls = TrainingMethodDPO(dpo_beta=dpo_beta)
+        if simpo_gamma is not None and simpo_gamma > 0:
+            dpo_reference_free = True
+            dpo_normalize_logratios_by_length = True
+            rprint(
+                f"Parameter simpo_gamma was set to {simpo_gamma}. "
+                "SimPO training detected. Reference logits will not be used "
+                "and length normalization of log-probabilities will be enabled."
+            )
+        else:
+            dpo_reference_free = False
+
+        training_method_cls = TrainingMethodDPO(
+            dpo_beta=dpo_beta,
+            dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+            dpo_reference_free=dpo_reference_free,
+            rpo_alpha=rpo_alpha,
+            simpo_gamma=simpo_gamma,
+        )
 
     finetune_request = FinetuneRequest(
         model=model,
@@ -302,6 +337,9 @@ def create(
         train_on_inputs: bool | Literal["auto"] | None = None,
         training_method: str = "sft",
         dpo_beta: float | None = None,
+        dpo_normalize_logratios_by_length: bool = False,
+        rpo_alpha: float | None = None,
+        simpo_gamma: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -353,6 +391,9 @@ def create(
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
             dpo_beta (float, optional): DPO beta parameter. Defaults to None.
+            dpo_normalize_logratios_by_length (bool): Whether to normalize logratios by sample length. Defaults to False.
+            rpo_alpha (float, optional): RPO alpha parameter of DPO training to include NLL in the loss. Defaults to None.
+            simpo_gamma (float, optional): SimPO gamma parameter. Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -405,6 +446,9 @@ def create(
             train_on_inputs=train_on_inputs,
             training_method=training_method,
             dpo_beta=dpo_beta,
+            dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+            rpo_alpha=rpo_alpha,
+            simpo_gamma=simpo_gamma,
             from_checkpoint=from_checkpoint,
         )
 
@@ -714,6 +758,9 @@ async def create(
         train_on_inputs: bool | Literal["auto"] | None = None,
         training_method: str = "sft",
         dpo_beta: float | None = None,
+        dpo_normalize_logratios_by_length: bool = False,
+        rpo_alpha: float | None = None,
+        simpo_gamma: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -765,6 +812,9 @@ async def create(
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
             dpo_beta (float, optional): DPO beta parameter. Defaults to None.
+            dpo_normalize_logratios_by_length (bool): Whether to normalize logratios by sample length. Defaults to False.
+            rpo_alpha (float, optional): RPO alpha parameter of DPO training to include NLL in the loss. Defaults to None.
+            simpo_gamma (float, optional): SimPO gamma parameter. Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -817,6 +867,9 @@ async def create(
             train_on_inputs=train_on_inputs,
             training_method=training_method,
             dpo_beta=dpo_beta,
+            dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+            rpo_alpha=rpo_alpha,
+            simpo_gamma=simpo_gamma,
             from_checkpoint=from_checkpoint,
         )
 
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index 07ee65ec..e8c388f9 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -159,6 +159,10 @@ class TrainingMethodDPO(TrainingMethod):
 
     method: Literal["dpo"] = "dpo"
     dpo_beta: float | None = None
+    dpo_normalize_logratios_by_length: bool = False
+    dpo_reference_free: bool = False
+    rpo_alpha: float | None = None
+    simpo_gamma: float | None = None
 
 
 class FinetuneRequest(BaseModel):
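
Below is a minimal usage sketch of the parameters introduced in this diff, written against the updated FineTuning.create signature. The training file ID, model name, and hyperparameter values are placeholders rather than recommendations; the comments restate the behavior encoded in create_finetune_request above, where a positive simpo_gamma switches a DPO job to reference-free, length-normalized (SimPO) training.

# Usage sketch only: the file ID and model name below are placeholders.
from together import Together

client = Together()  # reads TOGETHER_API_KEY from the environment

job = client.fine_tuning.create(
    training_file="file-xxxxxxxxxxxx",  # placeholder preference dataset ID
    model="your-base-model",            # placeholder base model name
    training_method="dpo",
    dpo_beta=0.1,
    dpo_normalize_logratios_by_length=True,
    rpo_alpha=0.5,    # adds an NLL term to the DPO loss
    simpo_gamma=0.5,  # > 0 enables reference-free, length-normalized (SimPO) training
)
print(job.id, job.status)

The equivalent CLI flags added here are --dpo-normalize-logratios-by-length, --rpo-alpha, and --simpo-gamma; per the validation in create_finetune_request, all three are only accepted together with --training-method dpo.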