diff --git a/docs/source/en/optimizers.md b/docs/source/en/optimizers.md
index 873b09349feb..e95d6e9c8d66 100644
--- a/docs/source/en/optimizers.md
+++ b/docs/source/en/optimizers.md
@@ -154,7 +154,7 @@ pip install schedulefree
 
 [Schedule Free optimizer (SFO)](https://hf.co/papers/2405.15682) replaces the base optimizers momentum with a combination of averaging and interpolation. Unlike a traditional scheduler, SFO completely removes the need to anneal the learning rate.
 
-SFO supports the RAdam (`schedule_free_radam`), AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. The RAdam scheduler doesn't require `warmup_steps` or `warmup_ratio`.
+SFO supports the RAdam (`schedule_free_radam`), AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. The RAdam scheduler doesn't require `warmup_steps`.
 
 By default, it is recommended to set `lr_scheduler_type="constant"`. Other `lr_scheduler_type` values may also work, but combining SFO optimizers with other learning rate schedules could affect SFOs intended behavior and performance.
diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md
index 250b980be190..cc8a8238022a 100644
--- a/docs/source/en/tasks/audio_classification.md
+++ b/docs/source/en/tasks/audio_classification.md
@@ -220,7 +220,7 @@ At this point, only three steps remain:
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=32,
 ...     num_train_epochs=10,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md
index 4754a91bd482..0af4be8ed6b9 100644
--- a/docs/source/en/tasks/image_classification.md
+++ b/docs/source/en/tasks/image_classification.md
@@ -211,7 +211,7 @@ At this point, only three steps remain:
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=16,
 ...     num_train_epochs=3,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md
index bae638bd84ed..f60a8221caf6 100644
--- a/docs/source/en/tasks/video_classification.md
+++ b/docs/source/en/tasks/video_classification.md
@@ -378,7 +378,7 @@ Most of the training arguments are self-explanatory, but one that is quite impor
 ...     learning_rate=5e-5,
 ...     per_device_train_batch_size=batch_size,
 ...     per_device_eval_batch_size=batch_size,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
diff --git a/docs/source/es/tasks/audio_classification.md b/docs/source/es/tasks/audio_classification.md
index 3b0446143262..bc63a93c88d2 100644
--- a/docs/source/es/tasks/audio_classification.md
+++ b/docs/source/es/tasks/audio_classification.md
@@ -220,7 +220,7 @@ Al llegar a este punto, solo quedan tres pasos:
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=32,
 ...     num_train_epochs=10,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
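Since the task guides above now pass a float to `warmup_steps`, a minimal sketch of the two accepted forms may help reviewers (the `output_dir` value is an arbitrary placeholder; the semantics follow the new docstring further down in this patch):

```python
from transformers import TrainingArguments

# After this change, `warmup_steps` accepts either an absolute step count
# or a float in [0, 1) treated as a fraction of total training steps.
args_by_ratio = TrainingArguments(output_dir="out", warmup_steps=0.1)  # warm up over 10% of steps
args_by_count = TrainingArguments(output_dir="out", warmup_steps=500)  # warm up over exactly 500 steps
```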
diff --git a/docs/source/ja/main_classes/deepspeed.md b/docs/source/ja/main_classes/deepspeed.md
index affb6c0a724c..aaa8191621bc 100644
--- a/docs/source/ja/main_classes/deepspeed.md
+++ b/docs/source/ja/main_classes/deepspeed.md
@@ -1292,7 +1292,7 @@ DeepSpeed は、`LRRangeTest`、`OneCycle`、`WarmupLR`、および`WarmupDecayL
 したがって、スケジューラを設定しない場合、これがデフォルトで設定されるスケジューラになります。
 
 設定ファイルで `scheduler` エントリを設定しない場合、[`Trainer`] は
-`--lr_scheduler_type`、`--learning_rate`、および `--warmup_steps` または `--warmup_ratio` の値を設定します。
+`--lr_scheduler_type`、`--learning_rate`、および `--warmup_steps` の値を設定します。
 🤗 それのトランスフォーマーバージョン。
 
 以下は、`WarmupLR`の自動構成された`scheduler`エントリの例です。
@@ -1316,8 +1316,7 @@
 
 - `warmup_min_lr` の値は `0` です。
 - `warmup_max_lr` と `--learning_rate` の値。
-- `warmup_num_steps` と `--warmup_steps` の値 (指定されている場合)。それ以外の場合は `--warmup_ratio` を使用します
-  トレーニング ステップの数を乗算し、切り上げます。
+- `warmup_num_steps` と `--warmup_steps` の値 (指定されている場合)
 - `total_num_steps` には `--max_steps` の値を指定するか、指定されていない場合は実行時に自動的に導出されます。
   環境、データセットのサイズ、およびその他のコマンド ライン引数 ( `WarmupDecayLR`)。
diff --git a/docs/source/ja/tasks/audio_classification.md b/docs/source/ja/tasks/audio_classification.md
index d37485cbe226..e1831aa50c38 100644
--- a/docs/source/ja/tasks/audio_classification.md
+++ b/docs/source/ja/tasks/audio_classification.md
@@ -219,7 +219,7 @@ MInDS-14 データセットのサンプリング レートは 8khz です (こ
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=32,
 ...     num_train_epochs=10,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
diff --git a/docs/source/ja/tasks/image_classification.md b/docs/source/ja/tasks/image_classification.md
index 32c30dcff7c8..164176a911d5 100644
--- a/docs/source/ja/tasks/image_classification.md
+++ b/docs/source/ja/tasks/image_classification.md
@@ -216,7 +216,7 @@ Datasets、🤗 データセット ライブラリから Food-101 データセ
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=16,
 ...     num_train_epochs=3,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
diff --git a/docs/source/ja/tasks/video_classification.md b/docs/source/ja/tasks/video_classification.md
index e7e7803c9408..32e871f0ab49 100644
--- a/docs/source/ja/tasks/video_classification.md
+++ b/docs/source/ja/tasks/video_classification.md
@@ -360,7 +360,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it
 ...     learning_rate=5e-5,
 ...     per_device_train_batch_size=batch_size,
 ...     per_device_eval_batch_size=batch_size,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
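For readers skimming the DeepSpeed hunks above: the docs describe a `scheduler` entry whose `"auto"` values the [`Trainer`] fills in from `--learning_rate` and `--warmup_steps`. A hedged sketch of such a config as a Python dict (the key names follow DeepSpeed's `WarmupLR` schema; treat the exact shape as illustrative, not part of this patch):

```python
# Illustrative DeepSpeed config fragment: each "auto" below is resolved by
# the Trainer at launch time rather than hard-coded by the user.
ds_config = {
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",     # resolved to 0
            "warmup_max_lr": "auto",     # resolved to --learning_rate
            "warmup_num_steps": "auto",  # resolved from --warmup_steps (int count or float ratio)
        },
    }
}
```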
diff --git a/docs/source/ko/optimizers.md b/docs/source/ko/optimizers.md
index 7b6fcc7b1016..a5bf877ed6e5 100644
--- a/docs/source/ko/optimizers.md
+++ b/docs/source/ko/optimizers.md
@@ -154,7 +154,7 @@ pip install schedulefree
 
 [Schedule Free optimizer (SFO)](https://hf.co/papers/2405.15682)는 기본 옵티마이저의 모멘텀 대신 평균화(averaging)와 보간(interpolation)을 조합하여 사용합니다. 덕분에 기존의 학습률 스케줄러와 달리, SFO는 학습률을 점진적으로 낮추는 절차가 아예 필요 없습니다.
 
-SFO는 RAdam(`schedule_free_radam`), AdamW(`schedule_free_adamw`), SGD(`schedule_free_sgd`) 옵티마이저를 지원합니다. RAdam 스케줄러는 `warmup_steps`나 `warmup_ratio` 설정이 필요하지 않습니다.
+SFO는 RAdam(`schedule_free_radam`), AdamW(`schedule_free_adamw`), SGD(`schedule_free_sgd`) 옵티마이저를 지원합니다. RAdam 스케줄러는 `warmup_steps` 설정이 필요하지 않습니다.
 
 기본적으로 `lr_scheduler_type="constant"`로 설정하는 것을 권장합니다. 다른 `lr_scheduler_type` 값도 동작할 순 있으나, SFO 옵티마이저와 다른 학습률 스케줄을 함께 사용하면 SFO의 의도된 동작과 성능에 영향을 줄 수 있습니다.
diff --git a/docs/source/ko/tasks/audio_classification.md b/docs/source/ko/tasks/audio_classification.md
index 789d7ee88373..983692bc100c 100644
--- a/docs/source/ko/tasks/audio_classification.md
+++ b/docs/source/ko/tasks/audio_classification.md
@@ -221,7 +221,7 @@ MinDS-14 데이터 세트의 샘플링 속도는 8khz이므로(이 정보는 [
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=32,
 ...     num_train_epochs=10,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
diff --git a/docs/source/ko/tasks/image_classification.md b/docs/source/ko/tasks/image_classification.md
index 54490a6f939a..3e1e829ae8d5 100644
--- a/docs/source/ko/tasks/image_classification.md
+++ b/docs/source/ko/tasks/image_classification.md
@@ -212,7 +212,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=16,
 ...     num_train_epochs=3,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
diff --git a/docs/source/ko/tasks/video_classification.md b/docs/source/ko/tasks/video_classification.md
index d39d669f8a6f..b220323aa2e3 100644
--- a/docs/source/ko/tasks/video_classification.md
+++ b/docs/source/ko/tasks/video_classification.md
@@ -357,7 +357,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it
 ...     learning_rate=5e-5,
 ...     per_device_train_batch_size=batch_size,
 ...     per_device_eval_batch_size=batch_size,
-...     warmup_ratio=0.1,
+...     warmup_steps=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
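The deprecation path itself is worth spelling out. Per the `training_args.py` hunks further down, passing the old argument still works but logs a warning and is forwarded to `warmup_steps`; a quick sketch of the expected behavior (`output_dir` is a placeholder):

```python
from transformers import TrainingArguments

# `warmup_ratio` is deprecated: __post_init__ logs a warning and copies its
# value into `warmup_steps`, which now carries the float ratio directly.
args = TrainingArguments(output_dir="out", warmup_ratio=0.1)
assert args.warmup_steps == 0.1
```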
diff --git a/docs/source/zh/main_classes/deepspeed.md b/docs/source/zh/main_classes/deepspeed.md
index 8319f5cad4a3..2e27e04cae45 100644
--- a/docs/source/zh/main_classes/deepspeed.md
+++ b/docs/source/zh/main_classes/deepspeed.md
@@ -1206,7 +1206,7 @@ DeepSpeed支持`LRRangeTest`、`OneCycle`、`WarmupLR`和`WarmupDecayLR`学习
 
 - 通过 `--lr_scheduler_type constant_with_warmup` 实现 `WarmupLR`
 - 通过 `--lr_scheduler_type linear` 实现 `WarmupDecayLR`。这也是 `--lr_scheduler_type` 的默认值,因此,如果不配置调度器,这将是默认配置的调度器。
 
-如果在配置文件中不配置 `scheduler` 条目,[`Trainer`] 将使用 `--lr_scheduler_type`、`--learning_rate` 和 `--warmup_steps` 或 `--warmup_ratio` 的值来配置其🤗 Transformers 版本。
+如果在配置文件中不配置 `scheduler` 条目,[`Trainer`] 将使用 `--lr_scheduler_type`、`--learning_rate` 和 `--warmup_steps` 的值来配置其🤗 Transformers 版本。
 
 以下是 `WarmupLR` 的自动配置示例:
@@ -1227,7 +1227,7 @@
 
 - `warmup_min_lr` 的值为 `0`。
 - `warmup_max_lr` 的值为 `--learning_rate`。
-- `warmup_num_steps` 的值为 `--warmup_steps`(如果提供)。否则,将使用 `--warmup_ratio` 乘以训练步骤的数量,并四舍五入。
+- `warmup_num_steps` 的值为 `--warmup_steps`(如果提供)。
 - `total_num_steps` 的值为 `--max_steps` 或者如果没有提供,将在运行时根据环境、数据集的大小和其他命令行参数(对于 `WarmupDecayLR` 来说需要)自动推导。
 
 当然,您可以接管任何或所有的配置值,并自行设置这些值:
diff --git a/examples/pytorch/audio-classification/README.md b/examples/pytorch/audio-classification/README.md
index 6f9069b331ab..8aacb5fc38e1 100644
--- a/examples/pytorch/audio-classification/README.md
+++ b/examples/pytorch/audio-classification/README.md
@@ -42,7 +42,7 @@ python run_audio_classification.py \
     --learning_rate 3e-5 \
     --max_length_seconds 1 \
     --attention_mask False \
-    --warmup_ratio 0.1 \
+    --warmup_steps 0.1 \
     --num_train_epochs 5 \
     --per_device_train_batch_size 32 \
     --gradient_accumulation_steps 4 \
@@ -84,7 +84,7 @@ python run_audio_classification.py \
     --learning_rate 3e-4 \
     --max_length_seconds 16 \
     --attention_mask False \
-    --warmup_ratio 0.1 \
+    --warmup_steps 0.1 \
     --num_train_epochs 10 \
     --per_device_train_batch_size 8 \
     --gradient_accumulation_steps 4 \
diff --git a/examples/pytorch/image-pretraining/README.md b/examples/pytorch/image-pretraining/README.md
index bca37f24135a..865818f52938 100644
--- a/examples/pytorch/image-pretraining/README.md
+++ b/examples/pytorch/image-pretraining/README.md
@@ -167,7 +167,7 @@ python run_mae.py \
     --lr_scheduler_type cosine \
     --weight_decay 0.05 \
     --num_train_epochs 800 \
-    --warmup_ratio 0.05 \
+    --warmup_steps 0.05 \
     --per_device_train_batch_size 8 \
     --per_device_eval_batch_size 8 \
     --logging_strategy steps \
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index cbd148153ca5..2b28dc98178a 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -753,8 +753,6 @@ def extract_hyperparameters_from_trainer(trainer):
         hyperparameters["optimizer"] = f"Use {optimizer_name} and the args are:\n{optimizer_args}"
 
     hyperparameters["lr_scheduler_type"] = trainer.args.lr_scheduler_type.value
-    if trainer.args.warmup_ratio != 0.0:
-        hyperparameters["lr_scheduler_warmup_ratio"] = trainer.args.warmup_ratio
     if trainer.args.warmup_steps != 0.0:
        hyperparameters["lr_scheduler_warmup_steps"] = trainer.args.warmup_steps
     if trainer.args.max_steps != -1:
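Before the `training_args.py` diff below, it may help to see the resolution arithmetic in isolation. This mirrors the new `get_warmup_steps` body (the step count and loop here are made up for illustration):

```python
import math

num_training_steps = 1000  # hypothetical total
for warmup_steps in (0.1, 500):
    # Floats below 1 are scaled by the total step count and rounded up;
    # values >= 1 are used as-is (truncated to int).
    resolved = int(warmup_steps) if warmup_steps >= 1 else math.ceil(num_training_steps * warmup_steps)
    print(warmup_steps, "->", resolved)  # 0.1 -> 100, then 500 -> 500
```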
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 68200f1af9c2..46b38b26a977 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -300,10 +300,9 @@ class TrainingArguments:
             The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values.
         lr_scheduler_kwargs ('dict', *optional*, defaults to {}):
             The extra arguments for the lr_scheduler. See the documentation of each scheduler for possible values.
-        warmup_ratio (`float`, *optional*, defaults to 0.0):
-            Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
-        warmup_steps (`int`, *optional*, defaults to 0):
-            Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`.
+        warmup_steps (`int` or `float`, *optional*, defaults to 0):
+            Number of steps used for a linear warmup from 0 to `learning_rate`. Should be an integer or a float in range `[0,1)`.
+            If smaller than 1, it is interpreted as the ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
         log_level (`str`, *optional*, defaults to `passive`):
             Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug',
             'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and keeps the
@@ -888,10 +887,14 @@ class TrainingArguments:
             )
         },
     )
-    warmup_ratio: float = field(
-        default=0.0, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."}
+    warmup_ratio: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "This argument is deprecated and will be removed in v5. Use `warmup_steps` instead; it also accepts float values."
+        },
     )
-    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
+
+    warmup_steps: float = field(default=0, metadata={"help": "Linear warmup over warmup_steps. A float in [0,1) is interpreted as a ratio of total training steps."})
     log_level: str = field(
         default="passive",
@@ -1724,16 +1727,12 @@ def __post_init__(self):
         elif not isinstance(self.report_to, list):
             self.report_to = [self.report_to]
 
-        if self.warmup_ratio < 0 or self.warmup_ratio > 1:
-            raise ValueError("warmup_ratio must lie in range [0,1]")
-        elif self.warmup_ratio > 0 and self.warmup_steps > 0:
-            logger.info(
-                "Both warmup_ratio and warmup_steps given, warmup_steps will override any effect of warmup_ratio"
-                " during training"
-            )
+        if self.warmup_ratio is not None:
+            logger.warning("warmup_ratio is deprecated and will be removed in v5. Use `warmup_steps` instead.")
+            self.warmup_steps = self.warmup_ratio
 
-        if not isinstance(self.warmup_steps, int) or self.warmup_steps < 0:
-            raise ValueError("warmup_steps must be of type int and must be 0 or a positive integer.")
+        if self.warmup_steps < 0:
+            raise ValueError("warmup_steps must be a non-negative integer or a float in the range [0,1).")
 
         if isinstance(self.fsdp, bool):
             self.fsdp = [FSDPOption.FULL_SHARD] if self.fsdp else ""
@@ -2275,7 +2274,7 @@ def get_warmup_steps(self, num_training_steps: int):
         Get number of steps used for a linear warmup.
         """
         warmup_steps = (
-            self.warmup_steps if self.warmup_steps > 0 else math.ceil(num_training_steps * self.warmup_ratio)
+            int(self.warmup_steps) if self.warmup_steps >= 1 else math.ceil(num_training_steps * self.warmup_steps)
         )
         return warmup_steps
@@ -2771,8 +2770,8 @@ def set_lr_scheduler(
         name: Union[str, SchedulerType] = "linear",
         num_epochs: float = 3.0,
         max_steps: int = -1,
-        warmup_ratio: float = 0,
-        warmup_steps: int = 0,
+        warmup_steps: float = 0,
+        warmup_ratio: Optional[float] = None,
     ):
         """
         A method that regroups all arguments linked to the learning rate scheduler and its hyperparameters.
@@ -2787,11 +2786,9 @@ def set_lr_scheduler(
                 If set to a positive number, the total number of training steps to perform. Overrides
                 `num_train_epochs`. For a finite dataset, training is reiterated through the dataset (if all data
                 is exhausted) until `max_steps` is reached.
-            warmup_ratio (`float`, *optional*, defaults to 0.0):
-                Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
-            warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of
-                `warmup_ratio`.
+            warmup_steps (`float`, *optional*, defaults to 0):
+                Number of steps used for a linear warmup from 0 to `learning_rate`. Should be an integer or a float in range `[0,1)`.
+                If smaller than 1, it is interpreted as the ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
 
         Example:
 
@@ -2799,15 +2796,18 @@ def set_lr_scheduler(
         ```py
         >>> from transformers import TrainingArguments
 
         >>> args = TrainingArguments("working_dir")
-        >>> args = args.set_lr_scheduler(name="cosine", warmup_ratio=0.05)
-        >>> args.warmup_ratio
+        >>> args = args.set_lr_scheduler(name="cosine", warmup_steps=0.05)
+        >>> args.warmup_steps
         0.05
         ```
         """
+        if warmup_ratio is not None:
+            logger.warning("warmup_ratio is deprecated and will be removed in v5. Use `warmup_steps` instead.")
+            warmup_steps = warmup_ratio
+
         self.lr_scheduler_type = SchedulerType(name)
         self.num_train_epochs = num_epochs
         self.max_steps = max_steps
-        self.warmup_ratio = warmup_ratio
         self.warmup_steps = warmup_steps
         return self
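Finally, an end-to-end check of the new surface, combining the updated `set_lr_scheduler` doctest with `get_warmup_steps` (the 2000-step total is an arbitrary example):

```python
from transformers import TrainingArguments

args = TrainingArguments("working_dir")
args = args.set_lr_scheduler(name="cosine", warmup_steps=0.05)
assert args.warmup_steps == 0.05
# A 0.05 ratio over 2000 training steps resolves to ceil(2000 * 0.05) = 100.
assert args.get_warmup_steps(num_training_steps=2000) == 100
```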