diff --git a/pyproject.toml b/pyproject.toml
index 82f8090c..c5683567 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.5.1"
+version = "1.5.2"
 authors = [
     "Together AI "
 ]
diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index b4943e1b..467c8296 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -79,17 +79,29 @@ def fine_tuning(ctx: click.Context) -> None:
     "--batch-size", "-b", type=INT_WITH_MAX, default="max", help="Train batch size"
 )
 @click.option("--learning-rate", "-lr", type=float, default=1e-5, help="Learning rate")
+@click.option(
+    "--lr-scheduler-type",
+    type=click.Choice(["linear", "cosine"]),
+    default="linear",
+    help="Learning rate scheduler type",
+)
 @click.option(
     "--min-lr-ratio",
     type=float,
     default=0.0,
     help="The ratio of the final learning rate to the peak learning rate",
 )
+@click.option(
+    "--scheduler-num-cycles",
+    type=float,
+    default=0.5,
+    help="Number or fraction of cycles for the cosine learning rate scheduler.",
+)
 @click.option(
     "--warmup-ratio",
     type=float,
     default=0.0,
-    help="Warmup ratio for learning rate scheduler.",
+    help="Warmup ratio for the learning rate scheduler.",
 )
 @click.option(
     "--max-grad-norm",
@@ -174,7 +186,9 @@ def create(
     n_checkpoints: int,
     batch_size: int | Literal["max"],
     learning_rate: float,
+    lr_scheduler_type: Literal["linear", "cosine"],
     min_lr_ratio: float,
+    scheduler_num_cycles: float,
     warmup_ratio: float,
     max_grad_norm: float,
     weight_decay: float,
@@ -206,7 +220,9 @@ def create(
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
+        lr_scheduler_type=lr_scheduler_type,
         min_lr_ratio=min_lr_ratio,
+        scheduler_num_cycles=scheduler_num_cycles,
         warmup_ratio=warmup_ratio,
         max_grad_norm=max_grad_norm,
         weight_decay=weight_decay,
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 08710750..6de548ea 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -22,7 +22,10 @@
     TogetherRequest,
     TrainingType,
     FinetuneLRScheduler,
+    FinetuneLinearLRScheduler,
+    FinetuneCosineLRScheduler,
     FinetuneLinearLRSchedulerArgs,
+    FinetuneCosineLRSchedulerArgs,
     TrainingMethodDPO,
     TrainingMethodSFT,
     FinetuneCheckpoint,
@@ -57,7 +60,9 @@ def createFinetuneRequest(
     n_checkpoints: int | None = 1,
     batch_size: int | Literal["max"] = "max",
     learning_rate: float | None = 0.00001,
+    lr_scheduler_type: Literal["linear", "cosine"] = "linear",
     min_lr_ratio: float = 0.0,
+    scheduler_num_cycles: float = 0.5,
     warmup_ratio: float = 0.0,
     max_grad_norm: float = 1.0,
     weight_decay: float = 0.0,
@@ -134,10 +139,22 @@ def createFinetuneRequest(
             f"training_method must be one of {', '.join(AVAILABLE_TRAINING_METHODS)}"
         )
 
-    lrScheduler = FinetuneLRScheduler(
-        lr_scheduler_type="linear",
-        lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
-    )
+    # Default to generic lr scheduler
+    lrScheduler: FinetuneLRScheduler = FinetuneLRScheduler(lr_scheduler_type="linear")
+
+    if lr_scheduler_type == "cosine":
+        if scheduler_num_cycles <= 0.0:
+            raise ValueError("Number of cycles should be greater than 0")
+
+        lrScheduler = FinetuneCosineLRScheduler(
+            lr_scheduler_args=FinetuneCosineLRSchedulerArgs(
+                min_lr_ratio=min_lr_ratio, num_cycles=scheduler_num_cycles
+            ),
+        )
+    else:
+        lrScheduler = FinetuneLinearLRScheduler(
+            lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
+        )
 
     training_method_cls: TrainingMethodSFT | TrainingMethodDPO = TrainingMethodSFT()
     if training_method == "dpo":
@@ -249,7 +266,9 @@ def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
+        lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
+        scheduler_num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -284,9 +303,11 @@ def create(
             batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            scheduler_num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
@@ -353,7 +374,9 @@ def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
+            scheduler_num_cycles=scheduler_num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,
@@ -634,7 +657,9 @@ async def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
+        lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
+        scheduler_num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -669,9 +694,11 @@ async def create(
             batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            scheduler_num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
@@ -738,7 +765,9 @@ async def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
+            scheduler_num_cycles=scheduler_num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,
diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py
index 47fed22b..53e1858e 100644
--- a/src/together/types/__init__.py
+++ b/src/together/types/__init__.py
@@ -34,11 +34,14 @@
     TrainingMethodDPO,
     TrainingMethodSFT,
     FinetuneCheckpoint,
+    FinetuneCosineLRScheduler,
+    FinetuneCosineLRSchedulerArgs,
     FinetuneDownloadResult,
+    FinetuneLinearLRScheduler,
     FinetuneLinearLRSchedulerArgs,
+    FinetuneLRScheduler,
     FinetuneList,
     FinetuneListEvents,
-    FinetuneLRScheduler,
     FinetuneRequest,
     FinetuneResponse,
     FinetuneTrainingLimits,
@@ -69,7 +72,10 @@
     "FinetuneListEvents",
     "FinetuneDownloadResult",
     "FinetuneLRScheduler",
+    "FinetuneLinearLRScheduler",
     "FinetuneLinearLRSchedulerArgs",
+    "FinetuneCosineLRScheduler",
+    "FinetuneCosineLRSchedulerArgs",
     "FileRequest",
     "FileResponse",
     "FileList",
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index 94140a92..7f085132 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import List, Literal
+from typing import List, Literal, Union
 
-from pydantic import StrictBool, Field, validator, field_validator
+from pydantic import StrictBool, Field, validator, field_validator, ValidationInfo
 
 from together.types.abstract import BaseModel
 from together.types.common import (
@@ -176,7 +176,7 @@ class FinetuneRequest(BaseModel):
     # training learning rate
     learning_rate: float
     # learning rate scheduler type and args
-    lr_scheduler: FinetuneLRScheduler | None = None
+    lr_scheduler: FinetuneLinearLRScheduler | FinetuneCosineLRScheduler | None = None
     # learning rate warmup ratio
     warmup_ratio: float
     # max gradient norm
@@ -239,7 +239,7 @@ class FinetuneResponse(BaseModel):
     # training learning rate
     learning_rate: float | None = None
     # learning rate scheduler type and args
-    lr_scheduler: FinetuneLRScheduler | None = None
+    lr_scheduler: FinetuneLinearLRScheduler | FinetuneCosineLRScheduler | None = None
     # learning rate warmup ratio
     warmup_ratio: float | None = None
     # max gradient norm
@@ -345,13 +345,27 @@ class FinetuneTrainingLimits(BaseModel):
     lora_training: FinetuneLoraTrainingLimits | None = None
 
 
+class FinetuneLinearLRSchedulerArgs(BaseModel):
+    min_lr_ratio: float | None = 0.0
+
+
+class FinetuneCosineLRSchedulerArgs(BaseModel):
+    min_lr_ratio: float | None = 0.0
+    num_cycles: float | None = 0.5
+
+
 class FinetuneLRScheduler(BaseModel):
     lr_scheduler_type: str
-    lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None
 
 
-class FinetuneLinearLRSchedulerArgs(BaseModel):
-    min_lr_ratio: float | None = 0.0
+class FinetuneLinearLRScheduler(FinetuneLRScheduler):
+    lr_scheduler_type: Literal["linear"] = "linear"
+    lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None
+
+
+class FinetuneCosineLRScheduler(FinetuneLRScheduler):
+    lr_scheduler_type: Literal["cosine"] = "cosine"
+    lr_scheduler_args: FinetuneCosineLRSchedulerArgs | None = None
 
 
 class FinetuneCheckpoint(BaseModel):
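For reference, a minimal sketch of how the options introduced by this change could be exercised through the Python client. The API key, training file ID, and model name below are placeholders, not values taken from this diff; only lr_scheduler_type, scheduler_num_cycles, and min_lr_ratio come from the change itself.

# Hedged usage sketch for the new cosine-scheduler options; assumes TOGETHER_API_KEY
# is set and that the file ID and model name are replaced with real values.
from together import Together

client = Together()

# lr_scheduler_type and scheduler_num_cycles are the two parameters added here;
# min_lr_ratio is shared by both the linear and cosine schedulers.
job = client.fine_tuning.create(
    training_file="file-xxxxxxxx",  # placeholder ID of an uploaded training file
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model name
    learning_rate=1e-5,
    lr_scheduler_type="cosine",    # "linear" (default) or "cosine"
    scheduler_num_cycles=0.5,      # must be > 0 when the cosine scheduler is used
    min_lr_ratio=0.1,              # final learning rate is 10% of the peak
)
print(job.id, job.status)

The CLI gains matching flags, so the equivalent invocation would be something like: together fine-tuning create ... --lr-scheduler-type cosine --scheduler-num-cycles 0.5 --min-lr-ratio 0.1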