Add Cosine LR Scheduler for Fine-Tuning #273
Changes from 3 commits
pyproject.toml

@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.4.6"
+version = "1.4.7"
 authors = [
     "Together AI <[email protected]>"
 ]
@@ -23,6 +23,7 @@
     TrainingType,
     FinetuneLRScheduler,
     FinetuneLinearLRSchedulerArgs,
+    FinetuneCosineLRSchedulerArgs,
     TrainingMethodDPO,
     TrainingMethodSFT,
     FinetuneCheckpoint,
@@ -57,7 +58,9 @@ def createFinetuneRequest(
     n_checkpoints: int | None = 1,
     batch_size: int | Literal["max"] = "max",
     learning_rate: float | None = 0.00001,
+    lr_scheduler_type: Literal["linear", "cosine"] = "linear",
     min_lr_ratio: float = 0.0,
+    num_cycles: float = 0.5,
     warmup_ratio: float = 0.0,
     max_grad_norm: float = 1.0,
     weight_decay: float = 0.0,
@@ -129,10 +132,21 @@ def createFinetuneRequest(
             f"training_method must be one of {', '.join(AVAILABLE_TRAINING_METHODS)}"
         )
 
-    lrScheduler = FinetuneLRScheduler(
-        lr_scheduler_type="linear",
-        lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
-    )
+    if lr_scheduler_type == "cosine":
+        if num_cycles <= 0.0:
+            raise ValueError("Number of cycles should be greater than 0")
+
+        lrScheduler = FinetuneLRScheduler(
+            lr_scheduler_type="cosine",
+            lr_scheduler_args=FinetuneCosineLRSchedulerArgs(
+                min_lr_ratio=min_lr_ratio, num_cycles=num_cycles
+            ),
+        )
+    else:
+        lrScheduler = FinetuneLRScheduler(
+            lr_scheduler_type="linear",
+            lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
+        )
 
     training_method_cls: TrainingMethodSFT | TrainingMethodDPO = TrainingMethodSFT()
    if training_method == "dpo":
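To read the new branch outside the diff, here is a minimal stand-alone sketch of the same selection and validation behavior. The function name and the returned dict shape are illustrative only (they are not the SDK's request schema); the branching and the error message mirror the diff above.

from typing import Literal


def pick_lr_scheduler(
    lr_scheduler_type: Literal["linear", "cosine"] = "linear",
    min_lr_ratio: float = 0.0,
    num_cycles: float = 0.5,
) -> dict:
    # Mirrors the diff: "cosine" must come with a positive num_cycles ...
    if lr_scheduler_type == "cosine":
        if num_cycles <= 0.0:
            raise ValueError("Number of cycles should be greater than 0")
        return {
            "lr_scheduler_type": "cosine",
            "lr_scheduler_args": {"min_lr_ratio": min_lr_ratio, "num_cycles": num_cycles},
        }
    # ... and anything else (including the default "linear") falls back to the
    # linear scheduler, which only takes min_lr_ratio.
    return {
        "lr_scheduler_type": "linear",
        "lr_scheduler_args": {"min_lr_ratio": min_lr_ratio},
    }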
@@ -244,7 +258,9 @@ def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
+        lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
+        num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -279,8 +295,10 @@ def create(
             batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
+            num_cycles (float, optional): Number of cycles for cosine learning rate scheduler. Defaults to 0.5.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
@@ -336,7 +354,9 @@ def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
+            num_cycles=num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,
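For context, a usage sketch of the new parameters from the caller's side. The client construction and the fine_tuning.create entry point follow my reading of the SDK's public surface; the file ID and model name are placeholders, not values from this PR. The three scheduler-related keyword arguments are the ones wired through in this diff.

from together import Together

client = Together()  # assumes TOGETHER_API_KEY is set in the environment

# Placeholders: training_file and model are illustrative only.
job = client.fine_tuning.create(
    training_file="file-abc123",
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",
    # New in this PR: pick the cosine schedule and shape it.
    lr_scheduler_type="cosine",
    min_lr_ratio=0.1,   # decay down to 10% of the initial learning rate
    num_cycles=0.5,     # default: a single half-cosine decay
)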
@@ -617,7 +637,9 @@ async def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
+        lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
+        num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -652,8 +674,10 @@ async def create(
             batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
+            num_cycles (float, optional): Number of cycles for cosine learning rate scheduler. Defaults to 0.5.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
@@ -710,7 +734,9 @@ async def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
+            num_cycles=num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,
Review thread on the num_cycles docstring:
nit: Maybe also add what fractional values mean.
Reply: Updated to "Number or fraction of cycles".
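On the reviewer's question about fractional values: in the conventional cosine schedule (the formulation popularized by Hugging Face transformers' cosine schedules), num_cycles counts full cosine periods over the run, so the default 0.5 is a single monotonic decay from the peak learning rate down to min_lr_ratio times the peak. Whether Together's training backend uses exactly this formulation is an assumption; the sketch below is for intuition only.

import math

def cosine_lr(step: int, total_steps: int, peak_lr: float,
              min_lr_ratio: float = 0.0, num_cycles: float = 0.5) -> float:
    """Illustrative cosine schedule: num_cycles = 0.5 decays peak_lr -> min_lr
    once; num_cycles = 1.0 decays and comes back up; larger values oscillate."""
    progress = step / max(1, total_steps)
    cosine = 0.5 * (1.0 + math.cos(2.0 * math.pi * num_cycles * progress))
    min_lr = min_lr_ratio * peak_lr
    return min_lr + (peak_lr - min_lr) * cosine

# With num_cycles=0.5 the multiplier goes 1.0 -> 0.0 over the run,
# i.e. the learning rate ends at min_lr_ratio * peak_lr.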