54 changes: 33 additions & 21 deletions src/together/resources/finetune.py
@@ -2,45 +2,46 @@

import re
from pathlib import Path
from typing import Literal, List
from typing import List, Literal

from rich import print as rprint

from together.abstract import api_requestor
from together.filemanager import DownloadManager
from together.together_response import TogetherResponse
from together.types import (
CosineLRScheduler,
CosineLRSchedulerArgs,
FinetuneCheckpoint,
FinetuneDownloadResult,
FinetuneList,
FinetuneListEvents,
FinetuneLRScheduler,
FinetuneRequest,
FinetuneResponse,
FinetuneTrainingLimits,
FullTrainingType,
LinearLRScheduler,
LinearLRSchedulerArgs,
LoRATrainingType,
TogetherClient,
TogetherRequest,
TrainingType,
FinetuneLRScheduler,
LinearLRScheduler,
CosineLRScheduler,
LinearLRSchedulerArgs,
CosineLRSchedulerArgs,
TrainingMethodDPO,
TrainingMethodSFT,
FinetuneCheckpoint,
TrainingType,
)
from together.types.finetune import (
DownloadCheckpointType,
FinetuneEventType,
FinetuneEvent,
FinetuneEventType,
)
from together.utils import (
get_event_step,
log_warn_once,
normalize_key,
get_event_step,
)


_FT_JOB_WITH_STEP_REGEX = r"^ft-[\dabcdef-]+:\d+$"


@@ -63,7 +64,7 @@ def create_finetune_request(
lr_scheduler_type: Literal["linear", "cosine"] = "linear",
min_lr_ratio: float = 0.0,
scheduler_num_cycles: float = 0.5,
warmup_ratio: float = 0.0,
warmup_ratio: float | None = None,
Contributor Author: default changed to None to match the `if warmup_ratio is None` check.

max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
@@ -81,7 +82,6 @@
dpo_beta: float | None = None,
from_checkpoint: str | None = None,
) -> FinetuneRequest:

if model is not None and from_checkpoint is not None:
raise ValueError(
"You must specify either a model or a checkpoint to start a job from, not both"
@@ -90,6 +90,8 @@
if model is None and from_checkpoint is None:
raise ValueError("You must specify either a model or a checkpoint")

model_or_checkpoint = model or from_checkpoint

if batch_size == "max":
log_warn_once(
"Starting from together>=1.3.0, "
@@ -103,7 +105,9 @@
min_batch_size: int = 0
if lora:
if model_limits.lora_training is None:
raise ValueError("LoRA adapters are not supported for the selected model.")
raise ValueError(
f"LoRA adapters are not supported for the selected model ({model_or_checkpoint})."
)
lora_r = lora_r if lora_r is not None else model_limits.lora_training.max_rank
lora_alpha = lora_alpha if lora_alpha is not None else lora_r * 2
training_type = LoRATrainingType(
@@ -118,7 +122,9 @@

else:
if model_limits.full_training is None:
raise ValueError("Full training is not supported for the selected model.")
raise ValueError(
f"Full training is not supported for the selected model ({model_or_checkpoint})."
)

max_batch_size = model_limits.full_training.max_batch_size
min_batch_size = model_limits.full_training.min_batch_size
@@ -127,25 +133,29 @@

if batch_size > max_batch_size:
raise ValueError(
"Requested batch size is higher that the maximum allowed value."
f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size}."
)

if batch_size < min_batch_size:
raise ValueError(
"Requested batch size is lower that the minimum allowed value."
f"Requested batch size of {batch_size} is lower that the minimum allowed value of {min_batch_size}."
)

if warmup_ratio > 1 or warmup_ratio < 0:
raise ValueError("Warmup ratio should be between 0 and 1")
raise ValueError(f"Warmup ratio should be between 0 and 1 (got {warmup_ratio})")

if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
raise ValueError("Min learning rate ratio should be between 0 and 1")
raise ValueError(
f"Min learning rate ratio should be between 0 and 1 (got {min_lr_ratio})"
)

if max_grad_norm < 0:
raise ValueError("Max gradient norm should be non-negative")
raise ValueError(
f"Max gradient norm should be non-negative (got {max_grad_norm})"
)

if weight_decay is not None and (weight_decay < 0):
raise ValueError("Weight decay should be non-negative")
raise ValueError(f"Weight decay should be non-negative (got {weight_decay})")

if training_method not in AVAILABLE_TRAINING_METHODS:
raise ValueError(
@@ -155,7 +165,9 @@
lr_scheduler: FinetuneLRScheduler
if lr_scheduler_type == "cosine":
if scheduler_num_cycles <= 0.0:
raise ValueError("Number of cycles should be greater than 0")
raise ValueError(
f"Number of cycles should be greater than 0 (got {scheduler_num_cycles})"
)

lr_scheduler = CosineLRScheduler(
lr_scheduler_args=CosineLRSchedulerArgs(
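
The substantive change to create_finetune_request above is twofold: warmup_ratio now defaults to None (resolved by an `if warmup_ratio is None` check, per the review comment) and every validation error now reports both the offending value and the allowed limit. A minimal standalone sketch of those two patterns follows; the function name validate_args, the 0.0 fallback, and the example limits are illustrative, not taken from the SDK.

def validate_args(
    batch_size: int,
    max_batch_size: int,
    min_batch_size: int,
    warmup_ratio: float | None = None,
) -> float:
    # None means "not set by the caller"; a concrete default (0.0, chosen for
    # this sketch only) is filled in before the range checks run.
    if warmup_ratio is None:
        warmup_ratio = 0.0

    if batch_size > max_batch_size:
        raise ValueError(
            f"Requested batch size of {batch_size} is higher than "
            f"the maximum allowed value of {max_batch_size}."
        )
    if batch_size < min_batch_size:
        raise ValueError(
            f"Requested batch size of {batch_size} is lower than "
            f"the minimum allowed value of {min_batch_size}."
        )
    if warmup_ratio > 1 or warmup_ratio < 0:
        raise ValueError(f"Warmup ratio should be between 0 and 1 (got {warmup_ratio})")
    return warmup_ratio


# Example: an out-of-range request now yields an actionable message.
try:
    validate_args(batch_size=256, max_batch_size=128, min_batch_size=8)
except ValueError as exc:
    print(exc)  # Requested batch size of 256 is higher than the maximum allowed value of 128.

The real function applies the same value-bearing message pattern to the LoRA/full-training support checks, min_lr_ratio, max_grad_norm, weight_decay, and the cosine scheduler cycle count.
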
76 changes: 31 additions & 45 deletions tests/unit/test_finetune_resources.py
@@ -2,9 +2,9 @@

from together.resources.finetune import create_finetune_request
from together.types.finetune import (
FinetuneTrainingLimits,
FinetuneFullTrainingLimits,
FinetuneLoraTrainingLimits,
FinetuneTrainingLimits,
)


@@ -117,50 +117,36 @@ def test_no_from_checkpoint_no_model_name():
)


def test_batch_size_limit():
with pytest.raises(
ValueError,
match="Requested batch size is higher that the maximum allowed value",
):
_ = create_finetune_request(
model_limits=_MODEL_LIMITS,
model=_MODEL_NAME,
training_file=_TRAINING_FILE,
batch_size=128,
)

with pytest.raises(
ValueError, match="Requested batch size is lower that the minimum allowed value"
):
_ = create_finetune_request(
model_limits=_MODEL_LIMITS,
model=_MODEL_NAME,
training_file=_TRAINING_FILE,
batch_size=1,
)

with pytest.raises(
ValueError,
match="Requested batch size is higher that the maximum allowed value",
):
_ = create_finetune_request(
model_limits=_MODEL_LIMITS,
model=_MODEL_NAME,
training_file=_TRAINING_FILE,
batch_size=256,
lora=True,
)

with pytest.raises(
ValueError, match="Requested batch size is lower that the minimum allowed value"
):
_ = create_finetune_request(
model_limits=_MODEL_LIMITS,
model=_MODEL_NAME,
training_file=_TRAINING_FILE,
batch_size=1,
lora=True,
)
@pytest.mark.parametrize("batch_size", [256, 1])
@pytest.mark.parametrize("use_lora", [False, True])
def test_batch_size_limit(batch_size, use_lora):
model_limits = (
_MODEL_LIMITS.full_training if not use_lora else _MODEL_LIMITS.lora_training
)
max_batch_size = model_limits.max_batch_size
min_batch_size = model_limits.min_batch_size

if batch_size > max_batch_size:
error_message = f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size}"
with pytest.raises(ValueError, match=error_message):
_ = create_finetune_request(
model_limits=_MODEL_LIMITS,
model=_MODEL_NAME,
training_file=_TRAINING_FILE,
batch_size=batch_size,
lora=use_lora,
)

if batch_size < min_batch_size:
error_message = f"Requested batch size of {batch_size} is lower that the minimum allowed value of {min_batch_size}"
with pytest.raises(ValueError, match=error_message):
_ = create_finetune_request(
model_limits=_MODEL_LIMITS,
model=_MODEL_NAME,
training_file=_TRAINING_FILE,
batch_size=batch_size,
lora=use_lora,
)


def test_non_lora_model():
Expand Down