2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "together"
version = "1.5.0"
version = "1.5.1"
authors = [
"Together AI <[email protected]>"
]
18 changes: 17 additions & 1 deletion src/together/cli/api/finetune.py
@@ -71,17 +71,29 @@ def fine_tuning(ctx: click.Context) -> None:
)
@click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
@click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
@click.option(
"--lr-scheduler-type",
type=click.Choice(["linear", "cosine"]),
default="linear",
help="Learning rate scheduler type",
)
@click.option(
"--min-lr-ratio",
type=float,
default=0.0,
help="The ratio of the final learning rate to the peak learning rate",
)
@click.option(
"--scheduler-num-cycles",
type=float,
default=0.5,
help="Number or fraction of cycles for the cosine learning rate scheduler.",
)
@click.option(
"--warmup-ratio",
type=float,
default=0.0,
help="Warmup ratio for learning rate scheduler.",
help="Warmup ratio for the learning rate scheduler.",
)
@click.option(
"--max-grad-norm",
@@ -162,7 +174,9 @@ def create(
n_checkpoints: int,
batch_size: int | Literal["max"],
learning_rate: float,
lr_scheduler_type: Literal["linear", "cosine"],
min_lr_ratio: float,
scheduler_num_cycles: float,
warmup_ratio: float,
max_grad_norm: float,
weight_decay: float,
@@ -194,7 +208,9 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
lr_scheduler_type=lr_scheduler_type,
min_lr_ratio=min_lr_ratio,
scheduler_num_cycles=scheduler_num_cycles,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
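As a quick smoke check, the new options can be seen on the command's help text via click's test runner. A minimal sketch (no API key is needed, since --help short-circuits before the command body runs; the import path follows the file location above):

# Sketch only: confirms the new flags are registered on `fine-tuning create`.
from click.testing import CliRunner

from together.cli.api.finetune import fine_tuning

runner = CliRunner()
result = runner.invoke(fine_tuning, ["create", "--help"])
assert "--lr-scheduler-type" in result.output
assert "--scheduler-num-cycles" in result.output
print(result.output)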
41 changes: 35 additions & 6 deletions src/together/resources/finetune.py
@@ -22,7 +22,10 @@
TogetherRequest,
TrainingType,
FinetuneLRScheduler,
FinetuneLinearLRScheduler,
FinetuneCosineLRScheduler,
FinetuneLinearLRSchedulerArgs,
FinetuneCosineLRSchedulerArgs,
TrainingMethodDPO,
TrainingMethodSFT,
FinetuneCheckpoint,
@@ -57,7 +60,9 @@ def createFinetuneRequest(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
lr_scheduler_type: Literal["linear", "cosine"] = "linear",
min_lr_ratio: float = 0.0,
scheduler_num_cycles: float = 0.5,
warmup_ratio: float = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
@@ -129,10 +134,22 @@ def createFinetuneRequest(
f"training_method must be one of {', '.join(AVAILABLE_TRAINING_METHODS)}"
)

lrScheduler = FinetuneLRScheduler(
lr_scheduler_type="linear",
lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
)
# Default to generic lr scheduler
lrScheduler: FinetuneLRScheduler = FinetuneLRScheduler(lr_scheduler_type="linear")

if lr_scheduler_type == "cosine":
if scheduler_num_cycles <= 0.0:
raise ValueError("Number of cycles should be greater than 0")

lrScheduler = FinetuneCosineLRScheduler(
lr_scheduler_args=FinetuneCosineLRSchedulerArgs(
min_lr_ratio=min_lr_ratio, num_cycles=scheduler_num_cycles
),
)
else:
lrScheduler = FinetuneLinearLRScheduler(
lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
)

training_method_cls: TrainingMethodSFT | TrainingMethodDPO = TrainingMethodSFT()
if training_method == "dpo":
@@ -244,7 +261,9 @@ def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
lr_scheduler_type: Literal["linear", "cosine"] = "linear",
min_lr_ratio: float = 0.0,
scheduler_num_cycles: float = 0.5,
warmup_ratio: float = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
@@ -279,9 +298,11 @@ def create(
batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training.
Defaults to 0.00001.
lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
min_lr_ratio (float, optional): The ratio of the final learning rate to the peak learning rate
for the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
scheduler_num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
@@ -336,7 +357,9 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
lr_scheduler_type=lr_scheduler_type,
min_lr_ratio=min_lr_ratio,
scheduler_num_cycles=scheduler_num_cycles,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
@@ -617,7 +640,9 @@ async def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
lr_scheduler_type: Literal["linear", "cosine"] = "linear",
min_lr_ratio: float = 0.0,
scheduler_num_cycles: float = 0.5,
warmup_ratio: float = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
@@ -652,9 +677,11 @@ async def create(
batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training.
Defaults to 0.00001.
lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
min_lr_ratio (float, optional): The ratio of the final learning rate to the peak learning rate
for the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
scheduler_num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
@@ -710,7 +737,9 @@ async def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
lr_scheduler_type=lr_scheduler_type,
min_lr_ratio=min_lr_ratio,
scheduler_num_cycles=scheduler_num_cycles,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
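For reference, a minimal sketch of selecting the cosine scheduler through the Python client using the `create` signature above; the training file ID and model name are placeholders, and an API key is expected in the environment:

# Sketch of selecting the cosine scheduler via the Python client.
from together import Together

client = Together()  # reads TOGETHER_API_KEY from the environment

job = client.fine_tuning.create(
    training_file="file-xxxxxxxx",       # placeholder uploaded-file ID
    model="meta-llama/Meta-Llama-3-8B",  # placeholder base model
    lr_scheduler_type="cosine",          # new in this change
    scheduler_num_cycles=0.5,            # half a cosine period over training
    min_lr_ratio=0.1,                    # final LR is 10% of the peak LR
)
print(job.id)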
8 changes: 7 additions & 1 deletion src/together/types/__init__.py
@@ -34,11 +34,14 @@
TrainingMethodDPO,
TrainingMethodSFT,
FinetuneCheckpoint,
FinetuneCosineLRScheduler,
FinetuneCosineLRSchedulerArgs,
FinetuneDownloadResult,
FinetuneLinearLRScheduler,
FinetuneLinearLRSchedulerArgs,
FinetuneLRScheduler,
FinetuneList,
FinetuneListEvents,
FinetuneLRScheduler,
FinetuneRequest,
FinetuneResponse,
FinetuneTrainingLimits,
@@ -69,7 +72,10 @@
"FinetuneListEvents",
"FinetuneDownloadResult",
"FinetuneLRScheduler",
"FinetuneLinearLRScheduler",
"FinetuneLinearLRSchedulerArgs",
"FinetuneCosineLRScheduler",
"FinetuneCosineLRSchedulerArgs",
"FileRequest",
"FileResponse",
"FileList",
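With the re-exports above, the new scheduler types resolve directly from the public types module, e.g.:

# The new names are importable alongside the existing linear scheduler types.
from together.types import (
    FinetuneCosineLRScheduler,
    FinetuneCosineLRSchedulerArgs,
    FinetuneLinearLRScheduler,
    FinetuneLinearLRSchedulerArgs,
)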
28 changes: 21 additions & 7 deletions src/together/types/finetune.py
@@ -1,9 +1,9 @@
from __future__ import annotations

from enum import Enum
from typing import List, Literal
from typing import List, Literal, Union

from pydantic import StrictBool, Field, validator, field_validator
from pydantic import StrictBool, Field, validator, field_validator, ValidationInfo

from together.types.abstract import BaseModel
from together.types.common import (
@@ -176,7 +176,7 @@ class FinetuneRequest(BaseModel):
# training learning rate
learning_rate: float
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
lr_scheduler: FinetuneLinearLRScheduler | FinetuneCosineLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float
# max gradient norm
@@ -239,7 +239,7 @@ class FinetuneResponse(BaseModel):
# training learning rate
learning_rate: float | None = None
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
lr_scheduler: FinetuneLinearLRScheduler | FinetuneCosineLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float | None = None
# max gradient norm
@@ -345,13 +345,27 @@ class FinetuneTrainingLimits(BaseModel):
lora_training: FinetuneLoraTrainingLimits | None = None


class FinetuneLinearLRSchedulerArgs(BaseModel):
min_lr_ratio: float | None = 0.0


class FinetuneCosineLRSchedulerArgs(BaseModel):
min_lr_ratio: float | None = 0.0
num_cycles: float | None = 0.5


class FinetuneLRScheduler(BaseModel):
lr_scheduler_type: str
lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None


class FinetuneLinearLRSchedulerArgs(BaseModel):
min_lr_ratio: float | None = 0.0
class FinetuneLinearLRScheduler(FinetuneLRScheduler):
lr_scheduler_type: Literal["linear"] = "linear"
lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None


class FinetuneCosineLRScheduler(FinetuneLRScheduler):
lr_scheduler_type: Literal["cosine"] = "cosine"
lr_scheduler_args: FinetuneCosineLRSchedulerArgs | None = None


class FinetuneCheckpoint(BaseModel):
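To make the new models concrete: a short sketch that builds a cosine scheduler object and inspects its serialized form (assuming the project's pydantic v2 base model, so `model_dump()` is available), plus an illustrative cosine-with-floor curve. The helper is an assumption of the usual formulation, not the Together backend's exact schedule; it only shows what `num_cycles` and `min_lr_ratio` control.

import math

from together.types import FinetuneCosineLRScheduler, FinetuneCosineLRSchedulerArgs

scheduler = FinetuneCosineLRScheduler(
    lr_scheduler_args=FinetuneCosineLRSchedulerArgs(min_lr_ratio=0.1, num_cycles=0.5)
)
# Roughly: {'lr_scheduler_type': 'cosine', 'lr_scheduler_args': {'min_lr_ratio': 0.1, 'num_cycles': 0.5}}
print(scheduler.model_dump())


def cosine_lr(step: int, total_steps: int, peak_lr: float,
              min_lr_ratio: float = 0.0, num_cycles: float = 0.5) -> float:
    """Illustrative cosine schedule with a floor at peak_lr * min_lr_ratio.

    Assumption: mirrors the common cosine-decay formulation; not taken from the
    Together backend. With num_cycles=0.5 the rate decays from peak_lr to the
    floor over training; larger values add oscillations.
    """
    floor = peak_lr * min_lr_ratio
    progress = step / max(1, total_steps)
    multiplier = 0.5 * (1.0 + math.cos(math.pi * 2.0 * num_cycles * progress))
    return floor + (peak_lr - floor) * multiplier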