diff --git a/pyproject.toml b/pyproject.toml
index 55a69e7a..a2004874 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.5.21"
+version = "1.5.22"
 authors = ["Together AI "]
 description = "Python client for Together's Cloud Platform!"
 readme = "README.md"
diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index eaff63b7..c3978501 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -304,13 +304,8 @@ def create(
             raise click.BadParameter(
                 f"LoRA fine-tuning is not supported for the model `{model}`"
             )
-        if training_method == "dpo":
-            default_batch_size = model_limits.lora_training.max_batch_size_dpo
-        else:
-            default_batch_size = model_limits.lora_training.max_batch_size
         default_values = {
             "lora_r": model_limits.lora_training.max_rank,
-            "batch_size": default_batch_size,
             "learning_rate": 1e-3,
         }
 
@@ -335,15 +330,6 @@ def create(
                    f"Please change the job type with --lora or remove `{param}` from the arguments"
                )
 
-    batch_size_source = ctx.get_parameter_source("batch_size")  # type: ignore[attr-defined]
-    if batch_size_source == ParameterSource.DEFAULT:
-        if training_method == "dpo":
-            training_args["batch_size"] = (
-                model_limits.full_training.max_batch_size_dpo
-            )
-        else:
-            training_args["batch_size"] = model_limits.full_training.max_batch_size
-
     if n_evals <= 0 and validation_file:
         log_warn(
             "Warning: You have specified a validation file but the number of evaluation loops is set to 0. No evaluations will be performed."
diff --git a/src/together/legacy/finetune.py b/src/together/legacy/finetune.py
index a8a973bb..478d7c7b 100644
--- a/src/together/legacy/finetune.py
+++ b/src/together/legacy/finetune.py
@@ -16,7 +16,7 @@ def create(
         model: str,
         n_epochs: int = 1,
         n_checkpoints: int | None = 1,
-        batch_size: int | None = 32,
+        batch_size: int | Literal["max"] = "max",
         learning_rate: float = 0.00001,
         suffix: (
             str | None
@@ -43,7 +43,7 @@ def create(
             model=model,
             n_epochs=n_epochs,
             n_checkpoints=n_checkpoints,
-            batch_size=batch_size if isinstance(batch_size, int) else "max",
+            batch_size=batch_size,
             learning_rate=learning_rate,
             suffix=suffix,
             wandb_api_key=wandb_api_key,
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index b69c2a3f..8c4d0eb1 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -89,18 +89,10 @@ def create_finetune_request(
 
     model_or_checkpoint = model or from_checkpoint
 
-    if batch_size == "max":
-        log_warn_once(
-            "Starting from together>=1.3.0, "
-            "the default batch size is set to the maximum allowed value for each model."
-        )
     if warmup_ratio is None:
         warmup_ratio = 0.0
 
     training_type: TrainingType = FullTrainingType()
-    max_batch_size: int = 0
-    max_batch_size_dpo: int = 0
-    min_batch_size: int = 0
     if lora:
         if model_limits.lora_training is None:
             raise ValueError(
@@ -133,28 +125,23 @@ def create_finetune_request(
         min_batch_size = model_limits.full_training.min_batch_size
         max_batch_size_dpo = model_limits.full_training.max_batch_size_dpo
 
-    if batch_size == "max":
-        if training_method == "dpo":
-            batch_size = max_batch_size_dpo
-        else:
-            batch_size = max_batch_size
+    if batch_size != "max":
+        if training_method == "sft":
+            if batch_size > max_batch_size:
+                raise ValueError(
+                    f"Requested batch size of {batch_size} is higher than the maximum allowed value of {max_batch_size}."
+                )
+        elif training_method == "dpo":
+            if batch_size > max_batch_size_dpo:
+                raise ValueError(
+                    f"Requested batch size of {batch_size} is higher than the maximum allowed value of {max_batch_size_dpo}."
+                )
 
-    if training_method == "sft":
-        if batch_size > max_batch_size:
-            raise ValueError(
-                f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size}."
-            )
-    elif training_method == "dpo":
-        if batch_size > max_batch_size_dpo:
+        if batch_size < min_batch_size:
             raise ValueError(
-                f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size_dpo}."
+                f"Requested batch size of {batch_size} is lower than the minimum allowed value of {min_batch_size}."
             )
 
-    if batch_size < min_batch_size:
-        raise ValueError(
-            f"Requested batch size of {batch_size} is lower that the minimum allowed value of {min_batch_size}."
-        )
-
     if warmup_ratio > 1 or warmup_ratio < 0:
         raise ValueError(f"Warmup ratio should be between 0 and 1 (got {warmup_ratio})")
 
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index 72a35e8d..789c19f9 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -195,7 +195,7 @@ class FinetuneRequest(BaseModel):
     # number of evaluation loops to run
     n_evals: int | None = None
     # training batch size
-    batch_size: int | None = None
+    batch_size: int | Literal["max"] | None = None
     # up to 40 character suffix for output model name
     suffix: str | None = None
     # weights & biases api key
diff --git a/tests/unit/test_finetune_resources.py b/tests/unit/test_finetune_resources.py
index 46e2f2f8..b72e5b18 100644
--- a/tests/unit/test_finetune_resources.py
+++ b/tests/unit/test_finetune_resources.py
@@ -44,7 +44,7 @@ def test_simple_request():
     assert request.n_epochs > 0
     assert request.warmup_ratio == 0.0
     assert request.training_type.type == "Full"
-    assert request.batch_size == _MODEL_LIMITS.full_training.max_batch_size
+    assert request.batch_size == "max"
 
 
 def test_validation_file():
@@ -82,7 +82,7 @@ def test_lora_request():
     assert request.training_type.lora_alpha == _MODEL_LIMITS.lora_training.max_rank * 2
     assert request.training_type.lora_dropout == 0.0
     assert request.training_type.lora_trainable_modules == "all-linear"
-    assert request.batch_size == _MODEL_LIMITS.lora_training.max_batch_size
+    assert request.batch_size == "max"
 
 
 @pytest.mark.parametrize("lora_dropout", [-1, 0, 0.5, 1.0, 10.0])
@@ -124,7 +124,7 @@ def test_dpo_request_lora():
     assert request.training_type.lora_alpha == _MODEL_LIMITS.lora_training.max_rank * 2
     assert request.training_type.lora_dropout == 0.0
     assert request.training_type.lora_trainable_modules == "all-linear"
-    assert request.batch_size == _MODEL_LIMITS.lora_training.max_batch_size_dpo
+    assert request.batch_size == "max"
 
 
 def test_dpo_request():
@@ -131,8 +131,8 @@ def test_dpo_request():
     request = create_finetune_request(
     )
 
     assert request.training_type.type == "Full"
-    assert request.batch_size == _MODEL_LIMITS.full_training.max_batch_size_dpo
+    assert request.batch_size == "max"
 
 
 def test_from_checkpoint_request():
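
Below is a brief usage sketch (not part of the patch) of the behavior this change produces, assuming the public `Together` client API otherwise stays as-is; the model name and training-file ID are placeholders:

from together import Together

client = Together()  # assumes TOGETHER_API_KEY is set in the environment

# batch_size defaults to "max" and is now forwarded to the API unchanged;
# an explicit integer is still validated client-side against the model's
# minimum/maximum batch-size limits before the job is submitted.
job = client.fine_tuning.create(
    training_file="file-xxxxxxxxxxxx",  # placeholder: ID of an uploaded training file
    model="example-org/example-model",  # placeholder model name
    batch_size="max",
)
print(job.id)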