Merged
27 changes: 27 additions & 0 deletions src/together/cli/api/finetune.py
@@ -60,12 +60,30 @@ def fine_tuning(ctx: click.Context) -> None:
)
@click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
@click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
@click.option(
"--min-lr-ratio",
type=float,
default=0.0,
help="Final learning rate as a percentage of the initial learning rate",
)
@click.option(
"--warmup-ratio",
type=float,
default=0.0,
help="Warmup ratio for learning rate scheduler.",
)
@click.option(
"--max-grad-norm",
type=float,
default=None,
help="Max gradient norm",
)
@click.option(
"--weight-decay",
type=float,
default=None,
help="Weight decay",
)
@click.option(
"--lora/--no-lora",
type=bool,
@@ -103,7 +121,10 @@ def create(
n_checkpoints: int,
batch_size: int | Literal["max"],
learning_rate: float,
min_lr_ratio: float,
warmup_ratio: float,
max_grad_norm: float,
weight_decay: float,
lora: bool,
lora_r: int,
lora_dropout: float,
@@ -125,7 +146,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
@@ -194,7 +218,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
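The four new flags are plain click options that pass straight through to the SDK's `create` call. As a quick illustration of what `--min-lr-ratio` controls (an explanatory sketch, not code from this PR; the values are hypothetical): under the linear schedule, the learning rate decays from its initial value down to `initial_lr * min_lr_ratio`.

```python
# Illustration only: how --min-lr-ratio relates --learning-rate to the final LR
# under a linear decay schedule (values are hypothetical).
initial_lr = 1e-5      # --learning-rate
min_lr_ratio = 0.1     # --min-lr-ratio
final_lr = initial_lr * min_lr_ratio
print(f"LR decays linearly from {initial_lr} to {final_lr}")  # 1e-05 -> 1e-06
```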
46 changes: 44 additions & 2 deletions src/together/resources/finetune.py
@@ -20,6 +20,8 @@
TogetherClient,
TogetherRequest,
TrainingType,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.finetune import DownloadCheckpointType
from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
min_lr_ratio: float | None = 0.0,
warmup_ratio: float | None = 0.0,
max_grad_norm: float | None = None,
weight_decay: float | None = None,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -82,6 +87,20 @@
if warmup_ratio > 1 or warmup_ratio < 0:
raise ValueError("Warmup ratio should be between 0 and 1")

if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
raise ValueError("Ending rate should be between 0 and 1")

if max_grad_norm is not None and (max_grad_norm < 0):
raise ValueError("Max gradient norm should be non-negative")

if weight_decay is not None and (weight_decay < 0):
raise ValueError("Weight decay should be non-negative")

lrScheduler = FinetuneLRScheduler(
lr_scheduler_type="linear",
lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
)

finetune_request = FinetuneRequest(
model=model,
training_file=training_file,
@@ -91,7 +110,10 @@
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
lr_scheduler=lrScheduler,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
training_type=training_type,
suffix=suffix,
wandb_key=wandb_api_key,
@@ -115,7 +137,10 @@ def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
min_lr_ratio: float | None = 0.0,
warmup_ratio: float | None = 0.0,
max_grad_norm: float | None = None,
weight_decay: float | None = None,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -140,7 +165,11 @@ def create(
batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Ending learning rate as a percentage of the initial learning rate for
learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to None.
weight_decay (float, optional): Weight decay. Defaults to None.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -176,7 +205,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
@@ -426,7 +458,10 @@ async def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
min_lr_ratio: float | None = 0.0,
warmup_ratio: float | None = 0.0,
max_grad_norm: float | None = None,
weight_decay: float | None = None,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -449,9 +484,13 @@ async def create(
n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning.
Defaults to 1.
batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Ending learning rate as a percentage of the initial learning rate for
learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to None.
weight_decay (float, optional): Weight decay. Defaults to None.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -487,7 +526,10 @@ async def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
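At the SDK level the new arguments are exposed on both the sync and async `create` methods, with range checks (`min_lr_ratio` in [0, 1], non-negative `max_grad_norm` and `weight_decay`) raising `ValueError` before any request is sent. A minimal usage sketch, assuming the usual `Together` client wiring; the file ID and model name below are placeholders:

```python
from together import Together

client = Together()  # assumes TOGETHER_API_KEY is set in the environment

job = client.fine_tuning.create(
    training_file="file-xxxxxxxx",   # placeholder training file ID
    model="<base-model-name>",       # placeholder model name
    learning_rate=1e-5,
    min_lr_ratio=0.1,    # final LR = 10% of the initial LR (linear scheduler)
    warmup_ratio=0.05,
    max_grad_norm=1.0,
    weight_decay=0.01,
)
print(job.id)
```

Passing, e.g., `min_lr_ratio=1.5` would raise `ValueError("Ending rate should be between 0 and 1")` from `createFinetuneRequest` before the request is built.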
4 changes: 4 additions & 0 deletions src/together/types/__init__.py
@@ -30,6 +30,8 @@
LoRATrainingType,
TrainingType,
FinetuneTrainingLimits,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.images import (
ImageRequest,
@@ -57,6 +59,8 @@
"FinetuneList",
"FinetuneListEvents",
"FinetuneDownloadResult",
"FinetuneLRScheduler",
"FinetuneLinearLRSchedulerArgs",
"FileRequest",
"FileResponse",
"FileList",
17 changes: 17 additions & 0 deletions src/together/types/finetune.py
@@ -150,6 +150,8 @@ class FinetuneRequest(BaseModel):
n_epochs: int
# training learning rate
learning_rate: float
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float
# number of checkpoints to save
@@ -192,8 +194,14 @@ class FinetuneResponse(BaseModel):
batch_size: int | None = None
# training learning rate
learning_rate: float | None = None
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float | None = None
# max gradient norm
max_grad_norm: float | None = None
# weight decay
weight_decay: float | None = None
# number of steps between evals
eval_steps: int | None = None
# training type
@@ -285,3 +293,12 @@ class FinetuneTrainingLimits(BaseModel):
min_learning_rate: float
full_training: FinetuneFullTrainingLimits | None = None
lora_training: FinetuneLoraTrainingLimits | None = None


class FinetuneLRScheduler(BaseModel):
lr_scheduler_type: str
lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None


class FinetuneLinearLRSchedulerArgs(BaseModel):
min_lr_ratio: float
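
The two new models are what `createFinetuneRequest` attaches to the request as `lr_scheduler`; only the `"linear"` scheduler type is constructed by the SDK in this PR. A small sketch of building one directly, using the `together.types` re-exports added above:

```python
from together.types import FinetuneLRScheduler, FinetuneLinearLRSchedulerArgs

# Mirrors what createFinetuneRequest builds: a linear schedule whose final LR
# is min_lr_ratio times the initial learning rate.
scheduler = FinetuneLRScheduler(
    lr_scheduler_type="linear",
    lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=0.1),
)
print(scheduler.lr_scheduler_type)               # linear
print(scheduler.lr_scheduler_args.min_lr_ratio)  # 0.1
```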