diff --git a/pyproject.toml b/pyproject.toml
index 716576d6..ed948da4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.3.4"
+version = "1.3.5"
 authors = [
     "Together AI "
 ]
diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index bd509e60..36fba827 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -65,12 +65,30 @@ def fine_tuning(ctx: click.Context) -> None:
 )
 @click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
 @click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
+@click.option(
+    "--min-lr-ratio",
+    type=float,
+    default=0.0,
+    help="The ratio of the final learning rate to the peak learning rate",
+)
 @click.option(
     "--warmup-ratio",
     type=float,
     default=0.0,
     help="Warmup ratio for learning rate scheduler.",
 )
+@click.option(
+    "--max-grad-norm",
+    type=float,
+    default=1.0,
+    help="Max gradient norm to be used for gradient clipping. Set to 0 to disable.",
+)
+@click.option(
+    "--weight-decay",
+    type=float,
+    default=0.0,
+    help="Weight decay",
+)
 @click.option(
     "--lora/--no-lora",
     type=bool,
@@ -115,7 +133,10 @@ def create(
     n_checkpoints: int,
     batch_size: int | Literal["max"],
     learning_rate: float,
+    min_lr_ratio: float,
     warmup_ratio: float,
+    max_grad_norm: float,
+    weight_decay: float,
     lora: bool,
     lora_r: int,
     lora_dropout: float,
@@ -138,7 +159,10 @@ def create(
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
+        min_lr_ratio=min_lr_ratio,
         warmup_ratio=warmup_ratio,
+        max_grad_norm=max_grad_norm,
+        weight_decay=weight_decay,
         lora=lora,
         lora_r=lora_r,
         lora_dropout=lora_dropout,
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 79596dd2..ceb7bf0b 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -20,6 +20,8 @@
     TogetherClient,
     TogetherRequest,
     TrainingType,
+    FinetuneLRScheduler,
+    FinetuneLinearLRSchedulerArgs,
 )
 from together.types.finetune import DownloadCheckpointType
 from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
     n_checkpoints: int | None = 1,
     batch_size: int | Literal["max"] = "max",
     learning_rate: float | None = 0.00001,
-    warmup_ratio: float | None = 0.0,
+    min_lr_ratio: float = 0.0,
+    warmup_ratio: float = 0.0,
+    max_grad_norm: float = 1.0,
+    weight_decay: float = 0.0,
     lora: bool = False,
     lora_r: int | None = None,
     lora_dropout: float | None = 0,
@@ -83,6 +88,20 @@
     if warmup_ratio > 1 or warmup_ratio < 0:
         raise ValueError("Warmup ratio should be between 0 and 1")
 
+    if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
+        raise ValueError("Min learning rate ratio should be between 0 and 1")
+
+    if max_grad_norm < 0:
+        raise ValueError("Max gradient norm should be non-negative")
+
+    if weight_decay is not None and (weight_decay < 0):
+        raise ValueError("Weight decay should be non-negative")
+
+    lrScheduler = FinetuneLRScheduler(
+        lr_scheduler_type="linear",
+        lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
+    )
+
     finetune_request = FinetuneRequest(
         model=model,
         training_file=training_file,
@@ -92,7 +111,10 @@
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
+        lr_scheduler=lrScheduler,
         warmup_ratio=warmup_ratio,
+        max_grad_norm=max_grad_norm,
+        weight_decay=weight_decay,
         training_type=training_type,
         suffix=suffix,
         wandb_key=wandb_api_key,
@@ -117,7 +139,10 @@ def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
-        warmup_ratio: float | None = 0.0,
+        min_lr_ratio: float = 0.0,
+        warmup_ratio: float = 0.0,
+        max_grad_norm: float = 1.0,
+        weight_decay: float = 0.0,
         lora: bool = False,
         lora_r: int | None = None,
         lora_dropout: float | None = 0,
@@ -143,7 +168,11 @@ def create(
             batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
+                the learning rate scheduler. Defaults to 0.0.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
+            weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
             lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
             lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -185,7 +214,10 @@ def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            min_lr_ratio=min_lr_ratio,
             warmup_ratio=warmup_ratio,
+            max_grad_norm=max_grad_norm,
+            weight_decay=weight_decay,
             lora=lora,
             lora_r=lora_r,
             lora_dropout=lora_dropout,
@@ -436,7 +468,10 @@ async def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
-        warmup_ratio: float | None = 0.0,
+        min_lr_ratio: float = 0.0,
+        warmup_ratio: float = 0.0,
+        max_grad_norm: float = 1.0,
+        weight_decay: float = 0.0,
         lora: bool = False,
         lora_r: int | None = None,
         lora_dropout: float | None = 0,
@@ -462,7 +497,11 @@ async def create(
             batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
+                the learning rate scheduler. Defaults to 0.0.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
+            weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
             lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
             lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -504,7 +543,10 @@ async def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            min_lr_ratio=min_lr_ratio,
             warmup_ratio=warmup_ratio,
+            max_grad_norm=max_grad_norm,
+            weight_decay=weight_decay,
             lora=lora,
             lora_r=lora_r,
             lora_dropout=lora_dropout,
diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py
index 9aa8c161..f6f85083 100644
--- a/src/together/types/__init__.py
+++ b/src/together/types/__init__.py
@@ -30,6 +30,8 @@
     LoRATrainingType,
     TrainingType,
     FinetuneTrainingLimits,
+    FinetuneLRScheduler,
+    FinetuneLinearLRSchedulerArgs,
 )
 from together.types.images import (
     ImageRequest,
@@ -57,6 +59,8 @@
     "FinetuneList",
     "FinetuneListEvents",
     "FinetuneDownloadResult",
+    "FinetuneLRScheduler",
+    "FinetuneLinearLRSchedulerArgs",
     "FileRequest",
     "FileResponse",
     "FileList",
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index f1fabb04..7a638859 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -150,8 +150,14 @@ class FinetuneRequest(BaseModel):
     n_epochs: int
     # training learning rate
     learning_rate: float
+    # learning rate scheduler type and args
+    lr_scheduler: FinetuneLRScheduler | None = None
     # learning rate warmup ratio
     warmup_ratio: float
+    # max gradient norm
+    max_grad_norm: float
+    # weight decay
+    weight_decay: float
     # number of checkpoints to save
     n_checkpoints: int | None = None
     # number of evaluation loops to run
@@ -193,8 +199,14 @@ class FinetuneResponse(BaseModel):
     batch_size: int | None = None
     # training learning rate
     learning_rate: float | None = None
+    # learning rate scheduler type and args
+    lr_scheduler: FinetuneLRScheduler | None = None
     # learning rate warmup ratio
     warmup_ratio: float | None = None
+    # max gradient norm
+    max_grad_norm: float | None = None
+    # weight decay
+    weight_decay: float | None = None
     # number of steps between evals
     eval_steps: int | None = None
     # training type
@@ -287,3 +299,12 @@ class FinetuneTrainingLimits(BaseModel):
     min_learning_rate: float
     full_training: FinetuneFullTrainingLimits | None = None
     lora_training: FinetuneLoraTrainingLimits | None = None
+
+
+class FinetuneLRScheduler(BaseModel):
+    lr_scheduler_type: str
+    lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None
+
+
+class FinetuneLinearLRSchedulerArgs(BaseModel):
+    min_lr_ratio: float | None = 0.0
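
Usage note (not part of the patch): below is a minimal sketch of how the hyperparameters added in this changeset surface through the Python client once 1.3.5 is installed. The model name, training file ID, and hyperparameter values are illustrative placeholders, not values taken from the diff, and the client is assumed to read TOGETHER_API_KEY from the environment.

from together import Together

client = Together()  # picks up TOGETHER_API_KEY from the environment

job = client.fine_tuning.create(
    training_file="file-xxxxxxxx",  # placeholder file ID of an uploaded training file
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model name
    n_epochs=3,
    learning_rate=1e-5,
    min_lr_ratio=0.1,    # final LR is 10% of the peak LR (linear scheduler via FinetuneLinearLRSchedulerArgs)
    warmup_ratio=0.05,
    max_grad_norm=0.5,   # gradient clipping threshold; 0 disables clipping
    weight_decay=0.01,
)
print(job.id)

The same knobs are exposed to the CLI command defined in src/together/cli/api/finetune.py as --min-lr-ratio, --max-grad-norm, and --weight-decay.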