2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "together"
version = "1.5.13"
version = "1.5.14"
authors = ["Together AI <[email protected]>"]
description = "Python client for Together's Cloud Platform!"
readme = "README.md"
30 changes: 30 additions & 0 deletions src/together/cli/api/finetune.py
@@ -142,6 +142,30 @@ def fine_tuning(ctx: click.Context) -> None:
default=0.1,
help="Beta parameter for DPO training (only used when '--training-method' is 'dpo')",
)
@click.option(
"--dpo-normalize-logratios-by-length",
type=bool,
default=False,
help=(
"Whether to normalize logratios by sample length "
"(only used when '--training-method' is 'dpo')"
),
)
@click.option(
"--rpo-alpha",
type=float,
default=0.0,
help=(
"RPO alpha parameter of DPO training to include NLL in the loss "
"(only used when '--training-method' is 'dpo')"
),
)
@click.option(
"--simpo-gamma",
type=float,
default=0.1,
help="SimPO gamma parameter (only used when '--training-method' is 'dpo')",
)
@click.option(
"--suffix",
"-s",
@@ -206,6 +230,9 @@ def create(
train_on_inputs: bool | Literal["auto"],
training_method: str,
dpo_beta: float,
dpo_normalize_logratios_by_length: bool,
rpo_alpha: float,
simpo_gamma: float,
from_checkpoint: str,
) -> None:
"""Start fine-tuning"""
@@ -239,6 +266,9 @@ def create(
train_on_inputs=train_on_inputs,
training_method=training_method,
dpo_beta=dpo_beta,
dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
rpo_alpha=rpo_alpha,
simpo_gamma=simpo_gamma,
from_checkpoint=from_checkpoint,
)

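For a quick smoke test of the new flags, the command can be driven in-process with click's test runner. This is a hedged sketch: the file ID and model name are placeholders, and the remaining required options are assumed to match the existing `create` command.

```python
from click.testing import CliRunner

from together.cli.api.finetune import fine_tuning

runner = CliRunner()
result = runner.invoke(
    fine_tuning,
    [
        "create",
        "--training-file", "file-xxxxxxxx",     # placeholder preference dataset ID
        "--model", "meta-llama/Llama-3-8b-hf",  # placeholder base model
        "--training-method", "dpo",
        "--dpo-beta", "0.1",
        "--dpo-normalize-logratios-by-length", "true",
        "--rpo-alpha", "0.5",
        "--simpo-gamma", "0.5",
    ],
)
print(result.output)
```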
55 changes: 54 additions & 1 deletion src/together/resources/finetune.py
@@ -72,6 +72,9 @@ def create_finetune_request(
train_on_inputs: bool | Literal["auto"] | None = None,
training_method: str = "sft",
dpo_beta: float | None = None,
dpo_normalize_logratios_by_length: bool = False,
rpo_alpha: float | None = None,
simpo_gamma: float | None = None,
from_checkpoint: str | None = None,
) -> FinetuneRequest:
if model is not None and from_checkpoint is not None:
@@ -182,6 +185,21 @@

if dpo_beta is not None and training_method != "dpo":
raise ValueError("dpo_beta is only supported for DPO training")
if dpo_normalize_logratios_by_length and training_method != "dpo":
raise ValueError(
"dpo_normalize_logratios_by_length=True is only supported for DPO training"
)
if rpo_alpha is not None:
Contributor: this could simply be `if rpo_alpha`

Contributor Author: A bit below I want to notify the user that rpo_alpha == 0.0 throws an error

if training_method != "dpo":
raise ValueError("rpo_alpha is only supported for DPO training")
if not rpo_alpha >= 0.0:
Contributor: Maybe it's wise to put an upper limit too

Contributor Author: Not sure what a sensible limit would be here; let's say 10? Wdyt?

Member: I'm not sure we should be enforcing any particular limit on this value, although it might be helpful. The problem is that this limit will apply only when users submit jobs via together-python.

raise ValueError(f"rpo_alpha should be non-negative (got {rpo_alpha})")

if simpo_gamma is not None:
if training_method != "dpo":
raise ValueError("simpo_gamma is only supported for DPO training")
if not simpo_gamma >= 0.0:
raise ValueError(f"simpo_gamma should be non-negative (got {simpo_gamma})")

lr_scheduler: FinetuneLRScheduler
if lr_scheduler_type == "cosine":
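
Stepping back from the diff: the guards added above gate each new knob on DPO training and reject negative values. A compact, illustrative restatement (not the shipped code):

```python
def check_dpo_only(name: str, value: float | None, training_method: str) -> None:
    # Illustrative mirror of the guards above: each parameter is DPO-only
    # and must be non-negative when provided.
    if value is None:
        return
    if training_method != "dpo":
        raise ValueError(f"{name} is only supported for DPO training")
    if value < 0.0:
        raise ValueError(f"{name} should be non-negative (got {value})")


check_dpo_only("rpo_alpha", 0.5, "dpo")      # passes
check_dpo_only("simpo_gamma", None, "sft")   # passes: unset is always allowed
# check_dpo_only("rpo_alpha", 0.5, "sft")    # would raise ValueError
```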
@@ -204,7 +222,24 @@
if training_method == "sft":
training_method_cls = TrainingMethodSFT(train_on_inputs=train_on_inputs)
elif training_method == "dpo":
training_method_cls = TrainingMethodDPO(dpo_beta=dpo_beta)
if simpo_gamma is not None and simpo_gamma > 0:
Member: By the way, should we raise a ValueError if it's <= 0?

Contributor Author: Added + added for rpo_alpha (can't imagine a use case for negative values for these parameters)

dpo_reference_free = True
dpo_normalize_logratios_by_length = True
rprint(
f"Parameter simpo_gamma was set to {simpo_gamma}. "
"SimPO training detected. Reference logits will not be used "
"and length normalization of log-probabilities will be enabled."
)
else:
dpo_reference_free = False

training_method_cls = TrainingMethodDPO(
dpo_beta=dpo_beta,
dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
dpo_reference_free=dpo_reference_free,
rpo_alpha=rpo_alpha,
simpo_gamma=simpo_gamma,
)

finetune_request = FinetuneRequest(
model=model,
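
The branch above encodes the SimPO rule: a positive simpo_gamma flips DPO into reference-free mode and forces length normalization of the log-ratios, regardless of the flag the user passed. A minimal, self-contained sketch of that resolution (names are illustrative, not part of the diff):

```python
def resolve_dpo_flags(
    simpo_gamma: float | None,
    normalize_logratios_by_length: bool,
) -> tuple[bool, bool]:
    """Return (dpo_reference_free, dpo_normalize_logratios_by_length)."""
    if simpo_gamma is not None and simpo_gamma > 0:
        # SimPO: drop the reference model and normalize by sequence length.
        return True, True
    return False, normalize_logratios_by_length


assert resolve_dpo_flags(0.5, False) == (True, True)
assert resolve_dpo_flags(None, False) == (False, False)
assert resolve_dpo_flags(None, True) == (False, True)
```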
@@ -302,6 +337,9 @@ def create(
train_on_inputs: bool | Literal["auto"] | None = None,
training_method: str = "sft",
dpo_beta: float | None = None,
dpo_normalize_logratios_by_length: bool = False,
rpo_alpha: float | None = None,
simpo_gamma: float | None = None,
from_checkpoint: str | None = None,
) -> FinetuneResponse:
"""
@@ -353,6 +391,9 @@ def create(
training_method (str, optional): Training method. Defaults to "sft".
Supported methods: "sft", "dpo".
dpo_beta (float, optional): DPO beta parameter. Defaults to None.
dpo_normalize_logratios_by_length (bool): Whether or not to normalize log-ratios by sample length. Defaults to False.
rpo_alpha (float, optional): RPO alpha parameter of DPO training to include NLL in the loss. Defaults to None.
simpo_gamma (float, optional): SimPO gamma parameter. Defaults to None.
from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
The step value is optional, without it the final checkpoint will be used.
@@ -405,6 +446,9 @@ def create(
train_on_inputs=train_on_inputs,
training_method=training_method,
dpo_beta=dpo_beta,
dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
rpo_alpha=rpo_alpha,
simpo_gamma=simpo_gamma,
from_checkpoint=from_checkpoint,
)

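For reference, a hypothetical end-to-end call through the public client exercising the new parameters; the training-file ID and model name are placeholders, not values from this change:

```python
from together import Together

client = Together()  # reads TOGETHER_API_KEY from the environment

job = client.fine_tuning.create(
    training_file="file-xxxxxxxx",      # placeholder preference dataset ID
    model="meta-llama/Llama-3-8b-hf",   # placeholder base model
    training_method="dpo",
    dpo_beta=0.1,
    dpo_normalize_logratios_by_length=True,
    rpo_alpha=0.5,    # adds an NLL term to the DPO loss
    simpo_gamma=0.5,  # > 0 switches the job into reference-free SimPO mode
)
print(job.id, job.status)
```
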
@@ -714,6 +758,9 @@ async def create(
train_on_inputs: bool | Literal["auto"] | None = None,
training_method: str = "sft",
dpo_beta: float | None = None,
dpo_normalize_logratios_by_length: bool = False,
rpo_alpha: float | None = None,
simpo_gamma: float | None = None,
from_checkpoint: str | None = None,
) -> FinetuneResponse:
"""
@@ -765,6 +812,9 @@ async def create(
training_method (str, optional): Training method. Defaults to "sft".
Supported methods: "sft", "dpo".
dpo_beta (float, optional): DPO beta parameter. Defaults to None.
dpo_normalize_logratios_by_length (bool): Whether or not to normalize log-ratios by sample length. Defaults to False.
rpo_alpha (float, optional): RPO alpha parameter of DPO training to include NLL in the loss. Defaults to None.
simpo_gamma (float, optional): SimPO gamma parameter. Defaults to None.
from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
The step value is optional, without it the final checkpoint will be used.
@@ -817,6 +867,9 @@ async def create(
train_on_inputs=train_on_inputs,
training_method=training_method,
dpo_beta=dpo_beta,
dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
rpo_alpha=rpo_alpha,
simpo_gamma=simpo_gamma,
from_checkpoint=from_checkpoint,
)

4 changes: 4 additions & 0 deletions src/together/types/finetune.py
@@ -159,6 +159,10 @@ class TrainingMethodDPO(TrainingMethod):

method: Literal["dpo"] = "dpo"
dpo_beta: float | None = None
dpo_normalize_logratios_by_length: bool = False
dpo_reference_free: bool = False
rpo_alpha: float | None = None
simpo_gamma: float | None = None


class FinetuneRequest(BaseModel):
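Because TrainingMethodDPO extends the SDK's Pydantic base model, the new fields serialize directly into the job payload. A quick sketch, assuming Pydantic v2's model_dump is available as elsewhere in the SDK:

```python
from together.types.finetune import TrainingMethodDPO

method = TrainingMethodDPO(
    dpo_beta=0.1,
    dpo_normalize_logratios_by_length=True,
    dpo_reference_free=True,
    simpo_gamma=0.5,
)
print(method.model_dump())
# {'method': 'dpo', 'dpo_beta': 0.1, 'dpo_normalize_logratios_by_length': True,
#  'dpo_reference_free': True, 'rpo_alpha': None, 'simpo_gamma': 0.5}
```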