diff --git a/optimum/neuron/accelerate/accelerator.py b/optimum/neuron/accelerate/accelerator.py
index bcb1be71b..e2fba78df 100644
--- a/optimum/neuron/accelerate/accelerator.py
+++ b/optimum/neuron/accelerate/accelerator.py
@@ -222,10 +222,11 @@ def prepare_data_loader(
             num_replicas = parallel_layers.parallel_state.get_data_parallel_size()
             rank = parallel_layers.parallel_state.get_data_parallel_rank()
             force_drop_last = parallel_layers.parallel_state.get_pipeline_model_parallel_size() > 1
-            logger.warning(
-                "Pipeline parallelsim: forcing the dataloader to drop the last incomplete batch because it can "
-                "cause failure if the last batch size is not divisible by the number of microbatches for the pipeline."
-            )
+            if force_drop_last and not data_loader.drop_last:
+                logger.warning(
+                    "Pipeline parallelism: forcing the dataloader to drop the last incomplete batch because it can "
+                    "cause failure if the last batch size is not divisible by the number of microbatches for the pipeline."
+                )
         else:
             num_replicas = xr.world_size()
             rank = xr.global_ordinal()
diff --git a/optimum/neuron/models/training/training_utils.py b/optimum/neuron/models/training/training_utils.py
index 43fda903c..0988829f2 100644
--- a/optimum/neuron/models/training/training_utils.py
+++ b/optimum/neuron/models/training/training_utils.py
@@ -185,13 +185,6 @@ def is_logging_process() -> bool:
     return dp_rank == tp_rank == 0 and pp_rank == pp_size - 1
 
 
-def is_logging_process_method(self) -> bool:
-    """
-    Method version of `is_logging_process`, useful when this is used to patch a method from the Trainer class.
-    """
-    return is_logging_process()
-
-
 def is_custom_modeling_model(model) -> bool:
     from peft import PeftModel
 
diff --git a/tests/training/test_overfit.py b/tests/training/test_overfit.py
index 7f6cb6472..4066f477b 100644
--- a/tests/training/test_overfit.py
+++ b/tests/training/test_overfit.py
@@ -100,7 +100,6 @@ def gen():
         tensor_parallel_size=tp_size,
         pipeline_parallel_size=pp_size,
         do_train=True,
-        do_eval=False,
         learning_rate=learning_rate,
         warmup_ratio=warmup_ratio,
         per_device_train_batch_size=1,
@@ -114,10 +113,7 @@ def gen():
         max_steps=6 if is_precompilation() else num_steps,
         output_dir=output_dir,
         run_name=wandb_run_name,
-        # This will load the weights on every worker at the same time.
-        # By default it is set to 8 to avoid OOM errors, but here the model are small enough to use the maximum size.
-        # This will save some time during weight loading.
-        num_local_ranks_per_step=-1,
+        num_local_ranks_per_step=16,
         **training_kwargs,
     )
 
@@ -127,7 +123,7 @@ def gen():
             model_name_or_path,
             training_args.trn_config,
             torch_dtype=torch.bfloat16,
-            attn_implementation="flash_attention_2" if use_flash_attention_2 else None,
+            attn_implementation="flash_attention_2" if use_flash_attention_2 else "eager",
         )
     else:
         model = model_class.from_pretrained(
@@ -207,8 +203,11 @@ def on_log(self, args, state, control, logs=None, **kwargs):
             50,
         ],
         [
-            "Qwen3ForCausalLM",
-            "Qwen/Qwen3-0.6B",
+            # "Qwen3ForCausalLM",
+            "LlamaForCausalLM",
+            # "Qwen/Qwen3-0.6B",
+            # "michaelbenayoun/qwen3-tiny-4kv-heads-4layers-random",
+            "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random",
             1e-4,
             0.03,
             {},
@@ -227,7 +226,7 @@ def on_log(self, args, state, control, logs=None, **kwargs):
 @pytest.mark.parametrize(
     "world_size,tp_size,pp_size",
     [[32, 2, 4], [32, 8, 1]],
-    ids=["dp=4,tp=2,pp=4", "dp=4,tp=8"],
+    ids=["32_2_4", "32_8_1"],
 )
 @pytest.mark.neuron_parallel_compile
 @is_trainium_test