From d0dfe0689f8737c3cd91656b41122e7433254252 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Wed, 29 Oct 2025 11:33:43 +0100 Subject: [PATCH 01/10] refactor: sync with transformers 4.57.1 --- README.md | 2 +- .../performance/llama3.3-70b.py | 2 +- .../contribute/contribute_for_training.mdx | 24 ++-- docs/source/model_doc/diffusers/flux.mdx | 2 +- .../model_doc/diffusers/pixart_alpha.mdx | 2 +- .../model_doc/diffusers/pixart_sigma.mdx | 2 +- docs/source/quickstart.mdx | 2 +- .../training_tutorials/finetune_llama.mdx | 2 +- .../training_tutorials/finetune_qwen3.mdx | 2 +- examples/training/qwen3/finetune_qwen3.sh | 3 +- optimum/commands/neuron/cache.py | 2 +- optimum/commands/neuron/serve.py | 4 +- optimum/exporters/neuron/__main__.py | 4 +- optimum/neuron/cache/entries/cache_entry.py | 2 +- optimum/neuron/cache/hub_cache.py | 2 +- optimum/neuron/generation/utils.py | 11 +- optimum/neuron/modeling_diffusion.py | 2 +- .../neuron/models/inference/backend/config.py | 6 +- .../modules/attention/attention_base.py | 14 +-- .../modules/decoder/decoder_builder.py | 8 +- .../modules/decoder/decoder_wrapper.py | 4 +- .../modules/kvcache/kv_cache_manager.py | 2 +- .../inference/backend/modules/moe_v2.py | 6 +- .../inference/backend/pretrained_model.py | 12 +- .../inference/granite/modeling_granite.py | 2 +- .../models/inference/llama/modeling_llama.py | 10 +- .../inference/llama4/modeling_llama4.py | 4 +- .../inference/mixtral/modeling_mixtral.py | 2 +- .../neuron/models/inference/modeling_utils.py | 2 +- .../models/inference/qwen2/modeling_qwen2.py | 2 +- .../models/inference/qwen3/modeling_qwen3.py | 2 +- .../inference/qwen3_moe/modeling_qwen3_moe.py | 2 +- .../inference/smollm3/modeling_smollm3.py | 2 +- .../models/training/llama/modeling_llama.py | 20 ++-- .../neuron/models/training/modeling_utils.py | 111 +++++++++++------- .../models/training/qwen3/modeling_qwen3.py | 2 +- optimum/neuron/trainers/sft_trainer.py | 4 +- optimum/neuron/trainers/training_args.py | 9 +- pyproject.toml | 2 +- tools/cache/auto_fill_diffusion_cache.py | 2 +- 40 files changed, 164 insertions(+), 136 deletions(-) diff --git a/README.md b/README.md index 5e7916c4c..e499d696a 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,7 @@ def main(): model = NeuronModelForCausalLM.from_pretrained( model_id, training_args.trn_config, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, attn_implementation="flash_attention_2", # Enable flash attention ) diff --git a/benchmark/text-generation/performance/llama3.3-70b.py b/benchmark/text-generation/performance/llama3.3-70b.py index d87bb5c8a..d4a29c118 100644 --- a/benchmark/text-generation/performance/llama3.3-70b.py +++ b/benchmark/text-generation/performance/llama3.3-70b.py @@ -32,7 +32,7 @@ def main(): assert neuron_config.sequence_length == seq_length, ( f"Model {model_name} is not configured for sequence length {seq_length}." ) - assert neuron_config.torch_dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16." + assert neuron_config.dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16." 
model = NeuronModelForCausalLM.from_pretrained(model_id) except Exception: model = NeuronModelForCausalLM.from_pretrained( diff --git a/docs/source/contribute/contribute_for_training.mdx b/docs/source/contribute/contribute_for_training.mdx index 994dc9991..62475c1e0 100644 --- a/docs/source/contribute/contribute_for_training.mdx +++ b/docs/source/contribute/contribute_for_training.mdx @@ -80,7 +80,7 @@ class YourModelEmbeddings(nn.Module): self.embed_tokens = ParallelEmbedding( config.vocab_size, config.hidden_size, - dtype=config.torch_dtype, + dtype=config.dtype, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, ) ``` @@ -105,7 +105,7 @@ class YourModelMLP(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.down_proj = RowParallelLinear( @@ -114,7 +114,7 @@ class YourModelMLP(nn.Module, CustomModule): bias=False, input_is_parallel=True, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) # Define transformation specs @@ -151,7 +151,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.k_proj = ColumnParallelLinear( config.hidden_size, @@ -159,7 +159,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.v_proj = ColumnParallelLinear( config.hidden_size, @@ -167,7 +167,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.o_proj = RowParallelLinear( @@ -176,7 +176,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, input_is_parallel=True, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) # No transformation specs needed - regular parallel layers @@ -201,7 +201,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) # Define transformation specs for fused QKV @@ -246,7 +246,7 @@ class YourModelAttention(nn.Module, CustomModule): sequence_parallel_enabled=trn_config.sequence_parallel_enabled, kv_size_multiplier=self.kv_size_multiplier, fuse_qkv=trn_config.fuse_qkv, - dtype=config.torch_dtype, + dtype=config.dtype, ) # Define transformation specs for GQA QKV @@ -336,7 +336,7 @@ class YourModelForCausalLM(NeuronModelMixin, YourPreTrainedModel): config.vocab_size, bias=False, gather_output=False, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.post_init() @@ -473,7 +473,7 @@ Update `tests/training/test_modeling_auto.py`: @is_trainium_test def test_auto_model_with_supported_architecture(from_pretrained): trn_config = TrainingNeuronConfig() - kwargs = {"torch_dtype": torch.bfloat16} + kwargs = {"dtype": torch.bfloat16} for model_name_or_path in [ "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random", "michaelbenayoun/granite-tiny-4kv-heads-4layers-random", @@ -487,7 +487,7 @@ def test_auto_model_with_supported_architecture(from_pretrained): @is_trainium_test def 
test_auto_model_for_causal_lm_with_supported_architecture(from_pretrained): trn_config = TrainingNeuronConfig() - kwargs = {"torch_dtype": torch.bfloat16} + kwargs = {"dtype": torch.bfloat16} for model_name_or_path in [ "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random", "michaelbenayoun/granite-tiny-4kv-heads-4layers-random", diff --git a/docs/source/model_doc/diffusers/flux.mdx b/docs/source/model_doc/diffusers/flux.mdx index a47148517..c68b1554e 100644 --- a/docs/source/model_doc/diffusers/flux.mdx +++ b/docs/source/model_doc/diffusers/flux.mdx @@ -40,7 +40,7 @@ if __name__ == "__main__": pipe = NeuronFluxPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, export=True, tensor_parallel_size=8, **compiler_args, diff --git a/docs/source/model_doc/diffusers/pixart_alpha.mdx b/docs/source/model_doc/diffusers/pixart_alpha.mdx index 31dca7023..375f292cb 100644 --- a/docs/source/model_doc/diffusers/pixart_alpha.mdx +++ b/docs/source/model_doc/diffusers/pixart_alpha.mdx @@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtAlphaPipeline compiler_args = {"auto_cast": "none"} input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120} -neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) +neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) # Save locally neuron_model.save_pretrained("pixart_alpha_neuron_512/") diff --git a/docs/source/model_doc/diffusers/pixart_sigma.mdx b/docs/source/model_doc/diffusers/pixart_sigma.mdx index 59ecbefb7..bc52effdb 100644 --- a/docs/source/model_doc/diffusers/pixart_sigma.mdx +++ b/docs/source/model_doc/diffusers/pixart_sigma.mdx @@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtSigmaPipeline compiler_args = {"auto_cast": "none"} input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120} -neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) +neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) # Save locally neuron_model.save_pretrained("pixart_sigma_neuron_512/") diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 95ddfc833..e9739de66 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -79,7 +79,7 @@ def main(): model = NeuronModelForCausalLM.from_pretrained( model_id, training_args.trn_config, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, attn_implementation="flash_attention_2", # Enable flash attention ) diff --git a/docs/source/training_tutorials/finetune_llama.mdx b/docs/source/training_tutorials/finetune_llama.mdx index a28f5c04c..4e7ae6697 100644 --- a/docs/source/training_tutorials/finetune_llama.mdx +++ b/docs/source/training_tutorials/finetune_llama.mdx @@ -138,7 +138,7 @@ dtype = torch.bfloat16 if training_args.bf16 else torch.float32 model = NeuronModelForCausalLM.from_pretrained( model_id, trn_config, - torch_dtype=dtype, + dtype=dtype, # Use FlashAttention2 for better performance and to be able to use larger sequence 
lengths. attn_implementation="flash_attention_2", ) diff --git a/docs/source/training_tutorials/finetune_qwen3.mdx b/docs/source/training_tutorials/finetune_qwen3.mdx index 0c9f4f379..1cf4a2d6e 100644 --- a/docs/source/training_tutorials/finetune_qwen3.mdx +++ b/docs/source/training_tutorials/finetune_qwen3.mdx @@ -137,7 +137,7 @@ dtype = torch.bfloat16 if training_args.bf16 else torch.float32 model = NeuronModelForCausalLM.from_pretrained( model_id, trn_config, - torch_dtype=dtype, + dtype=dtype, # Use FlashAttention2 for better performance and to be able to use larger sequence lengths. attn_implementation="flash_attention_2", ) diff --git a/examples/training/qwen3/finetune_qwen3.sh b/examples/training/qwen3/finetune_qwen3.sh index d64a6572d..b2d7568e3 100755 --- a/examples/training/qwen3/finetune_qwen3.sh +++ b/examples/training/qwen3/finetune_qwen3.sh @@ -13,7 +13,8 @@ TP_DEGREE=8 BS=1 GRADIENT_ACCUMULATION_STEPS=8 LOGGING_STEPS=2 -MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name +# MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name +MODEL_NAME="Qwen/Qwen3-0.6B" # Change this to the desired model name OUTPUT_DIR="$(echo $MODEL_NAME | cut -d'/' -f2)-finetuned" DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE" SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) diff --git a/optimum/commands/neuron/cache.py b/optimum/commands/neuron/cache.py index cb437c347..f19fa1459 100644 --- a/optimum/commands/neuron/cache.py +++ b/optimum/commands/neuron/cache.py @@ -147,7 +147,7 @@ def _list_entries(self): str(entry["batch_size"]), str(entry["sequence_length"]), str(entry.get("tp_degree", entry.get("tensor_parallel_size"))), - str(entry["torch_dtype"]), + str(entry["dtype"]), str(entry["target"]), ) ) diff --git a/optimum/commands/neuron/serve.py b/optimum/commands/neuron/serve.py index d0f7ce0d3..e7ebd8fcf 100644 --- a/optimum/commands/neuron/serve.py +++ b/optimum/commands/neuron/serve.py @@ -101,7 +101,7 @@ def run(self): sequence_length = self.args.sequence_length tensor_parallel_size = self.args.tensor_parallel_size config = AutoConfig.from_pretrained(model_name_or_path) - torch_dtype = DTYPE_MAPPER.pt(config.torch_dtype) + torch_dtype = DTYPE_MAPPER.pt(config.dtype) try: # Look for a NeuronConfig in the model directory neuron_config = NeuronConfig.from_pretrained(model_name_or_path) @@ -202,7 +202,7 @@ def run(self): batch_size = selected_entry["batch_size"] sequence_length = selected_entry["sequence_length"] tensor_parallel_size = selected_entry["tp_degree"] - torch_dtype = DTYPE_MAPPER.pt(selected_entry["torch_dtype"]) + torch_dtype = DTYPE_MAPPER.pt(selected_entry["dtype"]) warning_msg = f"{model_id} is not a neuron model, but a cached configuration is available using" warning_msg += f" instance type {instance_type}," warning_msg += f" batch size = {batch_size}," diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index afa451485..eda69a541 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -581,7 +581,7 @@ def load_models_and_neuron_configs( "trust_remote_code": trust_remote_code, "framework": "pt", "library_name": library_name, - "torch_dtype": torch_dtype, + "dtype": torch_dtype, } if model is None: model = TasksManager.get_model_from_task(**model_kwargs) @@ -878,7 +878,7 @@ def main(): model_name_or_path=args.model, output=args.output, compiler_kwargs=compiler_kwargs, - torch_dtype=args.torch_dtype, + torch_dtype=args.dtype, 
tensor_parallel_size=args.tensor_parallel_size, task=task, dynamic_batch_size=args.dynamic_batch_size, diff --git a/optimum/neuron/cache/entries/cache_entry.py b/optimum/neuron/cache/entries/cache_entry.py index 15bd8c520..b4b7a4680 100644 --- a/optimum/neuron/cache/entries/cache_entry.py +++ b/optimum/neuron/cache/entries/cache_entry.py @@ -28,7 +28,7 @@ "bos_token_id", "pad_token_id", "torchscript", - "torch_dtype", # this has been renamed as `float_dtype` for the check + "dtype", # this has been renamed as `float_dtype` for the check "_commit_hash", "sample_size", "projection_dim", diff --git a/optimum/neuron/cache/hub_cache.py b/optimum/neuron/cache/hub_cache.py index 73605e4a3..48f64647b 100644 --- a/optimum/neuron/cache/hub_cache.py +++ b/optimum/neuron/cache/hub_cache.py @@ -427,7 +427,7 @@ def select_hub_cached_entries( continue if torch_dtype is not None: target_value = DTYPE_MAPPER.pt(torch_dtype) if isinstance(torch_dtype, str) else torch_dtype - entry_value = DTYPE_MAPPER.pt(entry.get("torch_dtype")) + entry_value = DTYPE_MAPPER.pt(entry.get("dtype")) if target_value != entry_value: continue selected.append(entry) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index fd654b492..fc76cde80 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -325,18 +325,17 @@ class NeuronGenerationMixin(GenerationMixin): The class exposes [`~generation.GenerationMixin.generate`], which can be used for: - *greedy decoding* by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and `do_sample=False` - - *contrastive search* by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0` and - `top_k>1` - *multinomial sampling* by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and `do_sample=True` - *beam-search decoding* by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin.beam_sample`] if `num_beams>1` and `do_sample=True` - - *diverse beam-search decoding* by calling [`~generation.GenerationMixin.group_beam_search`], if `num_beams>1` - and `num_beam_groups>1` - - *constrained beam-search decoding* by calling [`~generation.GenerationMixin.constrained_beam_search`], if - `constraints!=None` or `force_words_ids!=None` + + Note: The following strategies have been removed in transformers 4.56.0+: + - constrained beam-search decoding (constraints and force_words_ids) + - group beam-search decoding (num_beam_groups > 1) + - contrastive search (penalty_alpha > 0) You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies). 
diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index 4bf9cb212..45d579a36 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -1193,7 +1193,7 @@ def forward( outputs = self.model(*inputs) if self.config.model_type == "t5" and isinstance(outputs, dict): # Flux text encoder 2 - return [outputs["last_hidden_state"].to(self.config.torch_dtype)] + return [outputs["last_hidden_state"].to(self.config.dtype)] if return_dict and not isinstance(outputs, dict): outputs = ModelOutput(dict(zip(self.neuron_config.outputs, outputs))) diff --git a/optimum/neuron/models/inference/backend/config.py b/optimum/neuron/models/inference/backend/config.py index 38209291f..170c2f026 100644 --- a/optimum/neuron/models/inference/backend/config.py +++ b/optimum/neuron/models/inference/backend/config.py @@ -83,9 +83,9 @@ def __init__( self.batch_size = batch_size self.sequence_length = sequence_length self.tp_degree = tp_degree - self.torch_dtype = torch_dtype - if isinstance(self.torch_dtype, str): - self.torch_dtype = DTYPE_MAPPER.pt(self.torch_dtype) + self.dtype = torch_dtype + if isinstance(self.dtype, str): + self.dtype = DTYPE_MAPPER.pt(self.dtype) self.n_active_tokens = self.sequence_length if n_active_tokens is None else n_active_tokens self.output_logits = output_logits diff --git a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py index 37542e575..5ef35b0ba 100644 --- a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py +++ b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py @@ -99,7 +99,7 @@ def __init__( self.head_dim = self.hidden_size // self.num_attention_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta - self.torch_dtype = neuron_config.torch_dtype + self.dtype = neuron_config.dtype self.rms_norm_eps = config.rms_norm_eps self._qk_scale = qk_scale @@ -111,7 +111,7 @@ def __init__( num_attention_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads, tp_degree=neuron_config.tp_degree, - dtype=self.torch_dtype, + dtype=self.dtype, bias=qkv_proj_bias, gather_output=False, fused_qkv=neuron_config.fused_qkv, @@ -125,12 +125,12 @@ def __init__( num_attention_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads, tp_degree=neuron_config.tp_degree, - dtype=self.torch_dtype, + dtype=self.dtype, bias=o_proj_bias, input_is_parallel=True, layer_name=self.o_proj_layer_name, tensor_model_parallel_group=self.tensor_model_parallel_group, - rpl_reduce_dtype=neuron_config.torch_dtype, + rpl_reduce_dtype=neuron_config.dtype, ) self.num_heads = utils.divide(self.qkv_proj.get_num_attention_heads(), neuron_config.tp_degree) self.num_key_value_heads = utils.divide(self.qkv_proj.get_num_key_value_heads(), neuron_config.tp_degree) @@ -202,13 +202,13 @@ def perform_prefill(self, Q, K, V, q_len, bsz, attention_mask) -> Tensor: Q = ( Q.permute(0, 1, 3, 2) # after permute: batch, num_heads, d_head, seqlen .reshape((bsz * self.num_heads, self.head_dim, q_len)) - .to(self.torch_dtype) + .to(self.dtype) ) Q = Q * self.qk_scale K_active = ( - K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.torch_dtype) + K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.dtype) ) - V_active = V_active.reshape((bsz * self.num_heads, 
q_len, self.head_dim)).to(self.torch_dtype) + V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.dtype) # shape: (B*H)DS attn_output = torch.zeros(bsz * self.num_heads, self.head_dim, q_len, dtype=Q.dtype, device=Q.device) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py b/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py index ced8b09dc..8a11576c1 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py @@ -38,8 +38,8 @@ def __init__( self.max_tokens = max_tokens self.active_tokens = active_tokens - if not self.neuron_config.torch_dtype: - self.neuron_config.torch_dtype = torch.float32 + if not self.neuron_config.dtype: + self.neuron_config.dtype = torch.float32 if config.pad_token_id is None: config.pad_token_id = 0 @@ -88,9 +88,9 @@ def load_module(self): float_model = self.model_cls(self.config, self.neuron_config) float_model.eval() - if self.neuron_config.torch_dtype != torch.float32: + if self.neuron_config.dtype != torch.float32: float_model._apply( - lambda t: t.to(self.neuron_config.torch_dtype) + lambda t: t.to(self.neuron_config.dtype) if t.is_floating_point() and t.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2] else t ) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py b/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py index 6f58e3f5a..e1a34bd92 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py @@ -37,8 +37,8 @@ def __init__( self.model = model self.tag = tag - if not self.neuron_config.torch_dtype: - self.neuron_config.torch_dtype = torch.float32 + if not self.neuron_config.dtype: + self.neuron_config.dtype = torch.float32 if config.pad_token_id is None: config.pad_token_id = 0 diff --git a/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py b/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py index 50ecc4d0a..78bee92c6 100644 --- a/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py +++ b/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py @@ -47,7 +47,7 @@ def __init__(self, config: PretrainedConfig, neuron_config: NxDNeuronConfig, **k self._init_kv_shape(config, neuron_config) num_layer = config.num_hidden_layers - dtype = neuron_config.torch_dtype + dtype = neuron_config.dtype self.past_key_values = nn.ParameterList( [nn.Parameter(torch.zeros(self.kv_shape, dtype=dtype), requires_grad=False) for _ in range(num_layer * 2)] ) diff --git a/optimum/neuron/models/inference/backend/modules/moe_v2.py b/optimum/neuron/models/inference/backend/modules/moe_v2.py index 6c4063925..d9789c10b 100644 --- a/optimum/neuron/models/inference/backend/modules/moe_v2.py +++ b/optimum/neuron/models/inference/backend/modules/moe_v2.py @@ -36,7 +36,7 @@ def initialize_moe_module( glu_mlp=neuron_config.glu_mlp, early_expert_affinity_modulation=early_expert_affinity_modulation, ), - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, ) shared_experts = None if n_shared_experts is not None: @@ -45,8 +45,8 @@ def initialize_moe_module( intermediate_size=config.intermediate_size, num_shared_experts=n_shared_experts, hidden_act=config.hidden_act, - dtype=neuron_config.torch_dtype, - 
reduce_dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, + reduce_dtype=neuron_config.dtype, fused_gate_up_projection=fused_shared_experts, ) diff --git a/optimum/neuron/models/inference/backend/pretrained_model.py b/optimum/neuron/models/inference/backend/pretrained_model.py index 985115de8..5a3a24a6f 100644 --- a/optimum/neuron/models/inference/backend/pretrained_model.py +++ b/optimum/neuron/models/inference/backend/pretrained_model.py @@ -117,7 +117,7 @@ def __init__( self.config = copy.deepcopy(config) self.neuron_config = copy.deepcopy(neuron_config) # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - self.config.torch_dtype = self.neuron_config.torch_dtype + self.config.dtype = self.neuron_config.dtype self._traced_model = traced_model self.graph_builders = graph_builders # Required for loading weights @@ -252,11 +252,11 @@ def checkpoint_loader_fn(self, checkpoint_path, config, neuron_config): """This function loads the model's state dictionary and weights from the hf model""" model_sd = self.get_state_dict(checkpoint_path, config, neuron_config) - if neuron_config.torch_dtype != torch.float32: + if neuron_config.dtype != torch.float32: for name, param in model_sd.items(): - if torch.is_floating_point(param) and param.dtype is not neuron_config.torch_dtype: - logger.debug(f"Converting {name} to {neuron_config.torch_dtype}") - model_sd[name] = param.to(neuron_config.torch_dtype) + if torch.is_floating_point(param) and param.dtype is not neuron_config.dtype: + logger.debug(f"Converting {name} to {neuron_config.dtype}") + model_sd[name] = param.to(neuron_config.dtype) return model_sd @classmethod @@ -344,7 +344,7 @@ def _export( trust_remote_code=trust_remote_code, ).get_text_config() # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - config.torch_dtype = neuron_config.torch_dtype + config.dtype = neuron_config.dtype # Evaluate head_dim if it is defined but set to null (like in Mixtral for transformers 4.54+) if hasattr(config, "head_dim") and config.head_dim is None: config.head_dim = config.hidden_size // config.num_attention_heads diff --git a/optimum/neuron/models/inference/granite/modeling_granite.py b/optimum/neuron/models/inference/granite/modeling_granite.py index 72b25fb19..1863ffe92 100644 --- a/optimum/neuron/models/inference/granite/modeling_granite.py +++ b/optimum/neuron/models/inference/granite/modeling_granite.py @@ -104,7 +104,7 @@ def __init__(self, config: GraniteConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/llama/modeling_llama.py b/optimum/neuron/models/inference/llama/modeling_llama.py index 76b02eaec..18161c941 100644 --- a/optimum/neuron/models/inference/llama/modeling_llama.py +++ b/optimum/neuron/models/inference/llama/modeling_llama.py @@ -84,7 +84,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.intermediate_size, bias=mlp_bias, gather_output=False, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, pad=True, ) self.up_proj = ColumnParallelLinear( @@ -92,7 +92,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.intermediate_size, bias=mlp_bias, gather_output=False, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, pad=True, ) 
self.down_proj = RowParallelLinear( @@ -100,9 +100,9 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.hidden_size, bias=mlp_bias, input_is_parallel=True, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, pad=True, - reduce_dtype=neuron_config.torch_dtype, + reduce_dtype=neuron_config.dtype, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -288,7 +288,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/llama4/modeling_llama4.py b/optimum/neuron/models/inference/llama4/modeling_llama4.py index 5a3473aec..7b29dba66 100644 --- a/optimum/neuron/models/inference/llama4/modeling_llama4.py +++ b/optimum/neuron/models/inference/llama4/modeling_llama4.py @@ -209,7 +209,7 @@ def __init__(self, config: Llama4TextConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) @@ -226,7 +226,7 @@ def __init__(self, config: Llama4TextConfig, neuron_config: NxDNeuronConfig): gather_output=not neuron_config.on_device_sampling, bias=False, pad=True, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, ) diff --git a/optimum/neuron/models/inference/mixtral/modeling_mixtral.py b/optimum/neuron/models/inference/mixtral/modeling_mixtral.py index 92adbc093..e7530aa64 100644 --- a/optimum/neuron/models/inference/mixtral/modeling_mixtral.py +++ b/optimum/neuron/models/inference/mixtral/modeling_mixtral.py @@ -206,7 +206,7 @@ def __init__(self, config: MixtralConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, ) self.layers = nn.ModuleList( diff --git a/optimum/neuron/models/inference/modeling_utils.py b/optimum/neuron/models/inference/modeling_utils.py index 36c31afdb..3be29ae78 100644 --- a/optimum/neuron/models/inference/modeling_utils.py +++ b/optimum/neuron/models/inference/modeling_utils.py @@ -138,7 +138,7 @@ def get_neuron_config( batch_size=batch_size, sequence_length=sequence_length, tensor_parallel_size=tensor_parallel_size, - dtype=DTYPE_MAPPER.pt(config.torch_dtype), + dtype=DTYPE_MAPPER.pt(config.dtype), ) @classmethod diff --git a/optimum/neuron/models/inference/qwen2/modeling_qwen2.py b/optimum/neuron/models/inference/qwen2/modeling_qwen2.py index 7cd0d0f71..a000bc07e 100644 --- a/optimum/neuron/models/inference/qwen2/modeling_qwen2.py +++ b/optimum/neuron/models/inference/qwen2/modeling_qwen2.py @@ -76,7 +76,7 @@ def __init__(self, config: Qwen2Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/qwen3/modeling_qwen3.py b/optimum/neuron/models/inference/qwen3/modeling_qwen3.py index 3fb33928b..b35b87dd9 100644 --- a/optimum/neuron/models/inference/qwen3/modeling_qwen3.py +++ b/optimum/neuron/models/inference/qwen3/modeling_qwen3.py @@ -76,7 +76,7 @@ def __init__(self, config: Qwen3Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + 
dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py b/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py index fc64713f6..76eeda130 100644 --- a/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py +++ b/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py @@ -179,7 +179,7 @@ def __init__(self, config: Qwen3MoeConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, ) self.layers = nn.ModuleList( diff --git a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py index b45560202..d4ad65803 100644 --- a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py +++ b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py @@ -88,7 +88,7 @@ def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/training/llama/modeling_llama.py b/optimum/neuron/models/training/llama/modeling_llama.py index f4cfe4c38..45c913626 100644 --- a/optimum/neuron/models/training/llama/modeling_llama.py +++ b/optimum/neuron/models/training/llama/modeling_llama.py @@ -210,7 +210,7 @@ def __init__(self, config, trn_config: TrainingNeuronConfig): init_method=init_method, sequence_parallel_enabled=self.trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.down_proj = RowParallelLinear( self.intermediate_size, @@ -220,7 +220,7 @@ def __init__(self, config, trn_config: TrainingNeuronConfig): init_method=init_method, sequence_parallel_enabled=self.trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) def forward(self, x): @@ -333,7 +333,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ sequence_parallel_enabled=trn_config.sequence_parallel_enabled, kv_size_multiplier=self.kv_size_multiplier, fuse_qkv=trn_config.fuse_qkv, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) gqa_qkv_specs = GQAQKVColumnParallelLinearSpec( @@ -361,7 +361,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.specs.add_spec( FusedLinearsSpec( @@ -382,7 +382,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.k_proj = ColumnParallelLinear( self.hidden_size, @@ -392,7 +392,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.v_proj = ColumnParallelLinear( self.hidden_size, @@ -402,7 +402,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, 
sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.o_proj = RowParallelLinear( self.num_heads * self.head_dim, @@ -412,7 +412,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.num_heads = neuronx_dist_utils.divide(config.num_attention_heads, tp_size) self.num_key_value_heads = neuronx_dist_utils.divide( @@ -606,7 +606,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig): self.padding_idx, init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.layers = nn.ModuleList( [LlamaDecoderLayer(config, trn_config, layer_idx) for layer_idx in range(config.num_hidden_layers)] @@ -715,7 +715,7 @@ def __init__(self, config, trn_config: TrainingNeuronConfig): init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.vocab_size = config.vocab_size // get_tensor_model_parallel_size() diff --git a/optimum/neuron/models/training/modeling_utils.py b/optimum/neuron/models/training/modeling_utils.py index dfd5fd23c..cef3c7c5b 100644 --- a/optimum/neuron/models/training/modeling_utils.py +++ b/optimum/neuron/models/training/modeling_utils.py @@ -61,7 +61,6 @@ get_state_dict_dtype, load_state_dict, no_init_weights, - set_initialized_submodules, ) from transformers.pytorch_utils import id_tensor_storage from transformers.quantizers import AutoHfQuantizer @@ -213,7 +212,7 @@ def _check_and_adjust_attn_implementation( return attn_implementation def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool: - torch_dtype = self.config.torch_dtype + dtype = self.config.dtype if not self._supports_flash_attn: raise ValueError( @@ -221,9 +220,9 @@ def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool: "https://github.com/huggingface/optimum-neuron/issues" ) - if torch_dtype is None: + if dtype is None: logger.warning_once( - "You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour" + "You are attempting to use Flash Attention 2 without specifying a dtype. This might lead to unexpected behaviour" ) # If no error raise by this point, we can return `True` @@ -447,7 +446,14 @@ def _load_pretrained_model( _loaded_keys = [k[len(prefix) + 1 :] for k in loaded_keys] else: _loaded_keys = loaded_keys - not_initialized_submodules = set_initialized_submodules(model, _loaded_keys) + + # Mark loaded parameters/buffers as initialized (transformers 4.56.0+ approach) + for key in model.state_dict(): + if key in _loaded_keys: + param_or_buffer = model.get_parameter_or_buffer(key) + if param_or_buffer is not None: + param_or_buffer._is_hf_initialized = True + # If we're about to tie the output embeds to the input embeds we don't need to init them if ( hasattr(model.config.get_text_config(decoder=True), "tie_word_embeddings") @@ -458,6 +464,22 @@ def _load_pretrained_model( # Still need to initialize if there is a bias term since biases are not tied. 
if not hasattr(output_embeddings, "bias") or output_embeddings.bias is None: output_embeddings._is_hf_initialized = True + + # Set the flag on modules recursively + def set_is_initialized_for_modules(module): + if ( + all(getattr(child, "_is_hf_initialized", False) for child in module.children()) + and all(getattr(param, "_is_hf_initialized", False) for param in module.parameters(recurse=False)) + and all( + getattr(buffer, "_is_hf_initialized", False) + for buffer in module.buffers(recurse=False) + if buffer not in module._non_persistent_buffers_set + ) + ): + module._is_hf_initialized = True + + model.apply(set_is_initialized_for_modules) + not_initialized_submodules = {name: mod for name, mod in model.named_modules() if not getattr(mod, "_is_hf_initialized", False)} else: not_initialized_submodules = dict(model.named_modules()) @@ -704,7 +726,11 @@ def from_pretrained( from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) _fast_init = kwargs.pop("_fast_init", True) + dtype = kwargs.pop("dtype", None) torch_dtype = kwargs.pop("torch_dtype", None) + # For BC on torch_dtype argument (deprecated in favor of dtype) + if torch_dtype is not None: + dtype = dtype if dtype is not None else torch_dtype low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", None) device_map = kwargs.pop("device_map", None) kwargs.pop("max_memory", None) @@ -859,6 +885,9 @@ def from_pretrained( _from_pipeline=from_pipeline, **kwargs, ) + # dtype is a config attribute, not a model parameter, so we remove it from model_kwargs + # Following the transformers pattern where dtype/torch_dtype are popped early from kwargs + model_kwargs.pop("dtype", None) else: config = copy.deepcopy(config) model_kwargs = kwargs @@ -1172,18 +1201,18 @@ def from_pretrained( # 1. If torch_dtype is not None, we use that dtype # 2. 
If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first # weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype - # We also may have config.torch_dtype available, but we won't rely on it till v5 + # We also may have config.dtype available, but we won't rely on it till v5 dtype_orig = None - if torch_dtype is not None: - if isinstance(torch_dtype, str): - if torch_dtype == "auto": - if hasattr(config, "torch_dtype") and config.torch_dtype is not None: - torch_dtype = config.torch_dtype - logger.info(f"Will use torch_dtype={torch_dtype} as defined in model's config object") + if dtype is not None: + if isinstance(dtype, str): + if dtype == "auto": + if hasattr(config, "dtype") and config.dtype is not None: + dtype = config.dtype + logger.info(f"Will use dtype={dtype} as defined in model's config object") else: if is_sharded and "dtype" in sharded_metadata: - torch_dtype = sharded_metadata["dtype"] + dtype = sharded_metadata["dtype"] elif not is_sharded: # ** Difference from original from_pretrained ** # Here we load the state dict only if we end up in this case, otherwise we defer the @@ -1193,52 +1222,52 @@ def from_pretrained( one_time_state_dict = load_state_dict( resolved_archive_file, weights_only=weights_only ) - torch_dtype = get_state_dict_dtype(one_time_state_dict) + dtype = get_state_dict_dtype(one_time_state_dict) del one_time_state_dict - xm.rendezvous(f"auto torch_dtype_{worker}") + xm.rendezvous(f"auto dtype_{worker}") else: one_state_dict = load_state_dict(resolved_archive_file[0], weights_only=weights_only) - torch_dtype = get_state_dict_dtype(one_state_dict) + dtype = get_state_dict_dtype(one_state_dict) del one_state_dict # free CPU memory logger.info( - "Since the `torch_dtype` attribute can't be found in model's config object, " - "will use torch_dtype={torch_dtype} as derived from model's weights" + "Since the `dtype` attribute can't be found in model's config object, " + "will use dtype={dtype} as derived from model's weights" ) - elif hasattr(torch, torch_dtype): - torch_dtype = getattr(torch, torch_dtype) + elif hasattr(torch, dtype): + dtype = getattr(torch, dtype) for sub_config_key in config.sub_configs.keys(): sub_config = getattr(config, sub_config_key) - sub_config.torch_dtype = torch_dtype - elif isinstance(torch_dtype, torch.dtype): + sub_config.dtype = dtype + elif isinstance(dtype, torch.dtype): for sub_config_key in config.sub_configs.keys(): sub_config = getattr(config, sub_config_key) - sub_config.torch_dtype = torch_dtype - elif isinstance(torch_dtype, dict): - for key, curr_dtype in torch_dtype.items(): + sub_config.dtype = dtype + elif isinstance(dtype, dict): + for key, curr_dtype in dtype.items(): if hasattr(config, key): value = getattr(config, key) - value.torch_dtype = curr_dtype + value.dtype = curr_dtype # main torch dtype for modules that aren't part of any sub-config - torch_dtype = torch_dtype.get("") - config.torch_dtype = torch_dtype - if isinstance(torch_dtype, str) and hasattr(torch, torch_dtype): - torch_dtype = getattr(torch, torch_dtype) - elif torch_dtype is None: - torch_dtype = torch.float32 + dtype = dtype.get("") + config.dtype = dtype + if isinstance(dtype, str) and hasattr(torch, dtype): + dtype = getattr(torch, dtype) + elif dtype is None: + dtype = torch.float32 else: raise ValueError( - f"`torch_dtype` can be one of: `torch.dtype`, `'auto'`, a string of a valid `torch.dtype` or a `dict` with valid `torch_dtype` " - f"for each sub-config 
in composite configs, but received {torch_dtype}" + f"`dtype` can be one of: `torch.dtype`, `'auto'`, a string of a valid `torch.dtype` or a `dict` with valid `dtype` " + f"for each sub-config in composite configs, but received {dtype}" ) - dtype_orig = cls._set_default_torch_dtype(torch_dtype) + dtype_orig = cls._set_default_torch_dtype(dtype) else: # set fp32 as the default dtype for BC default_dtype = str(torch.get_default_dtype()).split(".")[-1] - config.torch_dtype = default_dtype + config.dtype = default_dtype for key in config.sub_configs.keys(): value = getattr(config, key) - value.torch_dtype = default_dtype + value.dtype = default_dtype # ** Difference from original from_pretrained ** # We do not handle `use_keep_in_fp32_modules` here since it is not relevant for us. @@ -1264,9 +1293,9 @@ def from_pretrained( config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained. # ** Difference from original from_pretrained ** - # We make sure that config.torch_dtype is of type torch.dtype. + # We make sure that config.dtype is of type torch.dtype. # We do not change the config inplace since we are working from a deepcopy. - config.torch_dtype = torch_dtype + config.dtype = dtype # ** Difference from original from_pretrained ** # We do not support the `tie_word_embeddings` feature in pipeline parallelism. @@ -1316,7 +1345,7 @@ def from_pretrained( sharded_metadata=sharded_metadata, _fast_init=_fast_init, device_map=device_map, - dtype=torch_dtype, + dtype=dtype, weights_only=weights_only, ) @@ -1419,7 +1448,7 @@ def save_pretrained( # save the string version of dtype to the config, e.g. convert torch.float32 => "float32" # we currently don't use this setting automatically, but may start to use with v5 dtype = get_parameter_dtype(model_to_save) - model_to_save.config.torch_dtype = str(dtype).split(".")[1] + model_to_save.config.dtype = str(dtype).split(".")[1] # Attach architecture to the config model_to_save.config.architectures = [model_to_save.__class__.__name__] diff --git a/optimum/neuron/models/training/qwen3/modeling_qwen3.py b/optimum/neuron/models/training/qwen3/modeling_qwen3.py index b5536d910..8b1eddb6d 100644 --- a/optimum/neuron/models/training/qwen3/modeling_qwen3.py +++ b/optimum/neuron/models/training/qwen3/modeling_qwen3.py @@ -197,7 +197,7 @@ def __init__(self, config: Qwen3Config, trn_config: TrainingNeuronConfig): self.padding_idx, init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.layers = nn.ModuleList( [Qwen3DecoderLayer(config, trn_config, layer_idx) for layer_idx in range(config.num_hidden_layers)] diff --git a/optimum/neuron/trainers/sft_trainer.py b/optimum/neuron/trainers/sft_trainer.py index b69e909d2..c9a481bb4 100644 --- a/optimum/neuron/trainers/sft_trainer.py +++ b/optimum/neuron/trainers/sft_trainer.py @@ -138,7 +138,7 @@ def __init__( raise ValueError("You passed model_init_kwargs to the SFTConfig, but your model is already instantiated.") else: model_init_kwargs = args.model_init_kwargs - torch_dtype = model_init_kwargs.get("torch_dtype") + torch_dtype = model_init_kwargs.get("dtype") if torch_dtype is not None: # Convert to `torch.dtype` if an str is passed if isinstance(torch_dtype, str) and torch_dtype != "auto": @@ -147,7 +147,7 @@ def __init__( raise ValueError( f"Invalid `torch_dtype` passed to the SFTConfig. Expected a string with either `torch.dtype` or 'auto', but got {torch_dtype}." 
) - model_init_kwargs["torch_dtype"] = torch_dtype + model_init_kwargs["dtype"] = torch_dtype if isinstance(model, str): logging.warning( diff --git a/optimum/neuron/trainers/training_args.py b/optimum/neuron/trainers/training_args.py index f0e6f164e..b54704843 100644 --- a/optimum/neuron/trainers/training_args.py +++ b/optimum/neuron/trainers/training_args.py @@ -29,10 +29,9 @@ SchedulerType, get_last_checkpoint, ) +from functools import cached_property + from transformers.training_args import OptimizerNames, _convert_str_dict, default_logdir, trainer_log_levels -from transformers.utils import ( - cached_property, -) from ...utils import logging from ..accelerate import NeuronAcceleratorState, NeuronPartialState @@ -759,8 +758,8 @@ def _dict_torch_dtype_to_str(self, d: dict[str, Any]) -> None: converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"* string, which can then be stored in the json format. """ - if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str): - d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1] + if d.get("dtype", None) is not None and not isinstance(d["dtype"], str): + d["dtype"] = str(d["dtype"]).split(".")[1] for value in d.values(): if isinstance(value, dict): self._dict_torch_dtype_to_str(value) diff --git a/pyproject.toml b/pyproject.toml index 6f82fa577..9c1836bba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ - "transformers ~= 4.55.4", + "transformers ~= 4.57.1", "accelerate == 1.8.1", "optimum ~= 1.24.0", "huggingface_hub >= 0.31.4", diff --git a/tools/cache/auto_fill_diffusion_cache.py b/tools/cache/auto_fill_diffusion_cache.py index 7213dda88..3a00c819f 100644 --- a/tools/cache/auto_fill_diffusion_cache.py +++ b/tools/cache/auto_fill_diffusion_cache.py @@ -219,7 +219,7 @@ def compile_and_cache_model( task=model_config.get("task", None), auto_cast=model_config.get("auto_cast", None), auto_cast_type=model_config.get("auto_cast_type", None), - torch_dtype=model_config.get("torch_dtype", None), + torch_dtype=model_config.get("dtype", None), ) elif args.hf_model_id is None: raise ValueError("You must provide --hf_model_id to compile a model without a config file.") From 6fe874b29e205368dd6dcdb9eb5008620e56a09c Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Wed, 29 Oct 2025 12:10:21 +0100 Subject: [PATCH 02/10] fix: bugs linked to refactor --- optimum/neuron/models/training/modeling_utils.py | 13 +++++++------ optimum/neuron/trainers/training_args.py | 3 +-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/optimum/neuron/models/training/modeling_utils.py b/optimum/neuron/models/training/modeling_utils.py index cef3c7c5b..31efbab17 100644 --- a/optimum/neuron/models/training/modeling_utils.py +++ b/optimum/neuron/models/training/modeling_utils.py @@ -469,7 +469,9 @@ def _load_pretrained_model( def set_is_initialized_for_modules(module): if ( all(getattr(child, "_is_hf_initialized", False) for child in module.children()) - and all(getattr(param, "_is_hf_initialized", False) for param in module.parameters(recurse=False)) + and all( + getattr(param, "_is_hf_initialized", False) for param in module.parameters(recurse=False) + ) and all( getattr(buffer, "_is_hf_initialized", False) for buffer in module.buffers(recurse=False) @@ -479,7 +481,9 @@ def set_is_initialized_for_modules(module): module._is_hf_initialized = True 
model.apply(set_is_initialized_for_modules) - not_initialized_submodules = {name: mod for name, mod in model.named_modules() if not getattr(mod, "_is_hf_initialized", False)} + not_initialized_submodules = { + name: mod for name, mod in model.named_modules() if not getattr(mod, "_is_hf_initialized", False) + } else: not_initialized_submodules = dict(model.named_modules()) @@ -885,9 +889,6 @@ def from_pretrained( _from_pipeline=from_pipeline, **kwargs, ) - # dtype is a config attribute, not a model parameter, so we remove it from model_kwargs - # Following the transformers pattern where dtype/torch_dtype are popped early from kwargs - model_kwargs.pop("dtype", None) else: config = copy.deepcopy(config) model_kwargs = kwargs @@ -1260,7 +1261,7 @@ def from_pretrained( f"for each sub-config in composite configs, but received {dtype}" ) - dtype_orig = cls._set_default_torch_dtype(dtype) + dtype_orig = cls._set_default_dtype(dtype) else: # set fp32 as the default dtype for BC default_dtype = str(torch.get_default_dtype()).split(".")[-1] diff --git a/optimum/neuron/trainers/training_args.py b/optimum/neuron/trainers/training_args.py index b54704843..e692d38e3 100644 --- a/optimum/neuron/trainers/training_args.py +++ b/optimum/neuron/trainers/training_args.py @@ -18,6 +18,7 @@ import os from dataclasses import dataclass, field, fields from enum import Enum +from functools import cached_property from typing import Any import torch @@ -29,8 +30,6 @@ SchedulerType, get_last_checkpoint, ) -from functools import cached_property - from transformers.training_args import OptimizerNames, _convert_str_dict, default_logdir, trainer_log_levels from ...utils import logging From 1e910a52730c3fff937bdb3fdd1b9fbaf8873de6 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Wed, 29 Oct 2025 12:27:38 +0100 Subject: [PATCH 03/10] fix: restore for inference related code --- optimum/commands/neuron/cache.py | 2 +- optimum/commands/neuron/serve.py | 2 +- optimum/neuron/cache/hub_cache.py | 2 +- optimum/neuron/models/inference/backend/config.py | 6 +++--- .../backend/modules/attention/attention_base.py | 4 ++-- .../backend/modules/decoder/decoder_builder.py | 8 ++++---- .../backend/modules/decoder/decoder_wrapper.py | 4 ++-- .../backend/modules/kvcache/kv_cache_manager.py | 2 +- .../models/inference/backend/modules/moe_v2.py | 6 +++--- .../models/inference/backend/pretrained_model.py | 12 ++++++------ .../models/inference/granite/modeling_granite.py | 2 +- .../neuron/models/inference/llama/modeling_llama.py | 10 +++++----- .../models/inference/llama4/modeling_llama4.py | 4 ++-- .../models/inference/mixtral/modeling_mixtral.py | 2 +- .../neuron/models/inference/qwen2/modeling_qwen2.py | 2 +- .../neuron/models/inference/qwen3/modeling_qwen3.py | 2 +- .../models/inference/qwen3_moe/modeling_qwen3_moe.py | 2 +- .../models/inference/smollm3/modeling_smollm3.py | 2 +- 18 files changed, 37 insertions(+), 37 deletions(-) diff --git a/optimum/commands/neuron/cache.py b/optimum/commands/neuron/cache.py index f19fa1459..a8365c89a 100644 --- a/optimum/commands/neuron/cache.py +++ b/optimum/commands/neuron/cache.py @@ -147,7 +147,7 @@ def _list_entries(self): str(entry["batch_size"]), str(entry["sequence_length"]), str(entry.get("tp_degree", entry.get("tensor_parallel_size"))), - str(entry["dtype"]), + str(entry.get("torch_dtype", "unknown")), str(entry["target"]), ) ) diff --git a/optimum/commands/neuron/serve.py b/optimum/commands/neuron/serve.py index e7ebd8fcf..cc972a993 100644 --- a/optimum/commands/neuron/serve.py +++ 
b/optimum/commands/neuron/serve.py @@ -202,7 +202,7 @@ def run(self): batch_size = selected_entry["batch_size"] sequence_length = selected_entry["sequence_length"] tensor_parallel_size = selected_entry["tp_degree"] - torch_dtype = DTYPE_MAPPER.pt(selected_entry["dtype"]) + torch_dtype = DTYPE_MAPPER.pt(selected_entry["torch_dtype"]) warning_msg = f"{model_id} is not a neuron model, but a cached configuration is available using" warning_msg += f" instance type {instance_type}," warning_msg += f" batch size = {batch_size}," diff --git a/optimum/neuron/cache/hub_cache.py b/optimum/neuron/cache/hub_cache.py index 48f64647b..73605e4a3 100644 --- a/optimum/neuron/cache/hub_cache.py +++ b/optimum/neuron/cache/hub_cache.py @@ -427,7 +427,7 @@ def select_hub_cached_entries( continue if torch_dtype is not None: target_value = DTYPE_MAPPER.pt(torch_dtype) if isinstance(torch_dtype, str) else torch_dtype - entry_value = DTYPE_MAPPER.pt(entry.get("dtype")) + entry_value = DTYPE_MAPPER.pt(entry.get("torch_dtype")) if target_value != entry_value: continue selected.append(entry) diff --git a/optimum/neuron/models/inference/backend/config.py b/optimum/neuron/models/inference/backend/config.py index 170c2f026..38209291f 100644 --- a/optimum/neuron/models/inference/backend/config.py +++ b/optimum/neuron/models/inference/backend/config.py @@ -83,9 +83,9 @@ def __init__( self.batch_size = batch_size self.sequence_length = sequence_length self.tp_degree = tp_degree - self.dtype = torch_dtype - if isinstance(self.dtype, str): - self.dtype = DTYPE_MAPPER.pt(self.dtype) + self.torch_dtype = torch_dtype + if isinstance(self.torch_dtype, str): + self.torch_dtype = DTYPE_MAPPER.pt(self.torch_dtype) self.n_active_tokens = self.sequence_length if n_active_tokens is None else n_active_tokens self.output_logits = output_logits diff --git a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py index 5ef35b0ba..9d3c21b7e 100644 --- a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py +++ b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py @@ -99,7 +99,7 @@ def __init__( self.head_dim = self.hidden_size // self.num_attention_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta - self.dtype = neuron_config.dtype + self.dtype = neuron_config.torch_dtype self.rms_norm_eps = config.rms_norm_eps self._qk_scale = qk_scale @@ -130,7 +130,7 @@ def __init__( input_is_parallel=True, layer_name=self.o_proj_layer_name, tensor_model_parallel_group=self.tensor_model_parallel_group, - rpl_reduce_dtype=neuron_config.dtype, + rpl_reduce_dtype=neuron_config.torch_dtype, ) self.num_heads = utils.divide(self.qkv_proj.get_num_attention_heads(), neuron_config.tp_degree) self.num_key_value_heads = utils.divide(self.qkv_proj.get_num_key_value_heads(), neuron_config.tp_degree) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py b/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py index 8a11576c1..ced8b09dc 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py @@ -38,8 +38,8 @@ def __init__( self.max_tokens = max_tokens self.active_tokens = active_tokens - if not self.neuron_config.dtype: - self.neuron_config.dtype = torch.float32 + if not self.neuron_config.torch_dtype: + 
self.neuron_config.torch_dtype = torch.float32 if config.pad_token_id is None: config.pad_token_id = 0 @@ -88,9 +88,9 @@ def load_module(self): float_model = self.model_cls(self.config, self.neuron_config) float_model.eval() - if self.neuron_config.dtype != torch.float32: + if self.neuron_config.torch_dtype != torch.float32: float_model._apply( - lambda t: t.to(self.neuron_config.dtype) + lambda t: t.to(self.neuron_config.torch_dtype) if t.is_floating_point() and t.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2] else t ) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py b/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py index e1a34bd92..6f58e3f5a 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py @@ -37,8 +37,8 @@ def __init__( self.model = model self.tag = tag - if not self.neuron_config.dtype: - self.neuron_config.dtype = torch.float32 + if not self.neuron_config.torch_dtype: + self.neuron_config.torch_dtype = torch.float32 if config.pad_token_id is None: config.pad_token_id = 0 diff --git a/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py b/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py index 78bee92c6..50ecc4d0a 100644 --- a/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py +++ b/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py @@ -47,7 +47,7 @@ def __init__(self, config: PretrainedConfig, neuron_config: NxDNeuronConfig, **k self._init_kv_shape(config, neuron_config) num_layer = config.num_hidden_layers - dtype = neuron_config.dtype + dtype = neuron_config.torch_dtype self.past_key_values = nn.ParameterList( [nn.Parameter(torch.zeros(self.kv_shape, dtype=dtype), requires_grad=False) for _ in range(num_layer * 2)] ) diff --git a/optimum/neuron/models/inference/backend/modules/moe_v2.py b/optimum/neuron/models/inference/backend/modules/moe_v2.py index d9789c10b..6c4063925 100644 --- a/optimum/neuron/models/inference/backend/modules/moe_v2.py +++ b/optimum/neuron/models/inference/backend/modules/moe_v2.py @@ -36,7 +36,7 @@ def initialize_moe_module( glu_mlp=neuron_config.glu_mlp, early_expert_affinity_modulation=early_expert_affinity_modulation, ), - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, ) shared_experts = None if n_shared_experts is not None: @@ -45,8 +45,8 @@ def initialize_moe_module( intermediate_size=config.intermediate_size, num_shared_experts=n_shared_experts, hidden_act=config.hidden_act, - dtype=neuron_config.dtype, - reduce_dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, + reduce_dtype=neuron_config.torch_dtype, fused_gate_up_projection=fused_shared_experts, ) diff --git a/optimum/neuron/models/inference/backend/pretrained_model.py b/optimum/neuron/models/inference/backend/pretrained_model.py index 5a3a24a6f..8cad5b6f2 100644 --- a/optimum/neuron/models/inference/backend/pretrained_model.py +++ b/optimum/neuron/models/inference/backend/pretrained_model.py @@ -117,7 +117,7 @@ def __init__( self.config = copy.deepcopy(config) self.neuron_config = copy.deepcopy(neuron_config) # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - self.config.dtype = self.neuron_config.dtype + self.config.dtype = self.neuron_config.torch_dtype self._traced_model = traced_model self.graph_builders = 
graph_builders # Required for loading weights @@ -252,11 +252,11 @@ def checkpoint_loader_fn(self, checkpoint_path, config, neuron_config): """This function loads the model's state dictionary and weights from the hf model""" model_sd = self.get_state_dict(checkpoint_path, config, neuron_config) - if neuron_config.dtype != torch.float32: + if neuron_config.torch_dtype != torch.float32: for name, param in model_sd.items(): - if torch.is_floating_point(param) and param.dtype is not neuron_config.dtype: - logger.debug(f"Converting {name} to {neuron_config.dtype}") - model_sd[name] = param.to(neuron_config.dtype) + if torch.is_floating_point(param) and param.dtype is not neuron_config.torch_dtype: + logger.debug(f"Converting {name} to {neuron_config.torch_dtype}") + model_sd[name] = param.to(neuron_config.torch_dtype) return model_sd @classmethod @@ -344,7 +344,7 @@ def _export( trust_remote_code=trust_remote_code, ).get_text_config() # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - config.dtype = neuron_config.dtype + config.dtype = neuron_config.torch_dtype # Evaluate head_dim if it is defined but set to null (like in Mixtral for transformers 4.54+) if hasattr(config, "head_dim") and config.head_dim is None: config.head_dim = config.hidden_size // config.num_attention_heads diff --git a/optimum/neuron/models/inference/granite/modeling_granite.py b/optimum/neuron/models/inference/granite/modeling_granite.py index 1863ffe92..72b25fb19 100644 --- a/optimum/neuron/models/inference/granite/modeling_granite.py +++ b/optimum/neuron/models/inference/granite/modeling_granite.py @@ -104,7 +104,7 @@ def __init__(self, config: GraniteConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/llama/modeling_llama.py b/optimum/neuron/models/inference/llama/modeling_llama.py index 18161c941..76b02eaec 100644 --- a/optimum/neuron/models/inference/llama/modeling_llama.py +++ b/optimum/neuron/models/inference/llama/modeling_llama.py @@ -84,7 +84,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.intermediate_size, bias=mlp_bias, gather_output=False, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, pad=True, ) self.up_proj = ColumnParallelLinear( @@ -92,7 +92,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.intermediate_size, bias=mlp_bias, gather_output=False, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, pad=True, ) self.down_proj = RowParallelLinear( @@ -100,9 +100,9 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.hidden_size, bias=mlp_bias, input_is_parallel=True, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, pad=True, - reduce_dtype=neuron_config.dtype, + reduce_dtype=neuron_config.torch_dtype, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -288,7 +288,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/llama4/modeling_llama4.py b/optimum/neuron/models/inference/llama4/modeling_llama4.py index 7b29dba66..5a3473aec 100644 --- 
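The `checkpoint_loader_fn` hunk above casts every floating-point tensor in the checkpoint to `neuron_config.torch_dtype` and leaves integer tensors alone. A self-contained sketch of that pattern, with made-up tensor names:

```python
import torch

def cast_floats(state_dict: dict[str, torch.Tensor], dtype: torch.dtype = torch.bfloat16) -> dict[str, torch.Tensor]:
    # Only floating-point tensors are converted; token ids, masks, etc. keep their dtype.
    for name, param in state_dict.items():
        if torch.is_floating_point(param) and param.dtype is not dtype:
            state_dict[name] = param.to(dtype)
    return state_dict

sd = cast_floats({"embed_tokens.weight": torch.randn(8, 4), "position_ids": torch.arange(4)})
print(sd["embed_tokens.weight"].dtype, sd["position_ids"].dtype)  # torch.bfloat16 torch.int64
```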
a/optimum/neuron/models/inference/llama4/modeling_llama4.py +++ b/optimum/neuron/models/inference/llama4/modeling_llama4.py @@ -209,7 +209,7 @@ def __init__(self, config: Llama4TextConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) @@ -226,7 +226,7 @@ def __init__(self, config: Llama4TextConfig, neuron_config: NxDNeuronConfig): gather_output=not neuron_config.on_device_sampling, bias=False, pad=True, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, ) diff --git a/optimum/neuron/models/inference/mixtral/modeling_mixtral.py b/optimum/neuron/models/inference/mixtral/modeling_mixtral.py index e7530aa64..92adbc093 100644 --- a/optimum/neuron/models/inference/mixtral/modeling_mixtral.py +++ b/optimum/neuron/models/inference/mixtral/modeling_mixtral.py @@ -206,7 +206,7 @@ def __init__(self, config: MixtralConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, ) self.layers = nn.ModuleList( diff --git a/optimum/neuron/models/inference/qwen2/modeling_qwen2.py b/optimum/neuron/models/inference/qwen2/modeling_qwen2.py index a000bc07e..7cd0d0f71 100644 --- a/optimum/neuron/models/inference/qwen2/modeling_qwen2.py +++ b/optimum/neuron/models/inference/qwen2/modeling_qwen2.py @@ -76,7 +76,7 @@ def __init__(self, config: Qwen2Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/qwen3/modeling_qwen3.py b/optimum/neuron/models/inference/qwen3/modeling_qwen3.py index b35b87dd9..3fb33928b 100644 --- a/optimum/neuron/models/inference/qwen3/modeling_qwen3.py +++ b/optimum/neuron/models/inference/qwen3/modeling_qwen3.py @@ -76,7 +76,7 @@ def __init__(self, config: Qwen3Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py b/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py index 76eeda130..fc64713f6 100644 --- a/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py +++ b/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py @@ -179,7 +179,7 @@ def __init__(self, config: Qwen3MoeConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, ) self.layers = nn.ModuleList( diff --git a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py index d4ad65803..b45560202 100644 --- a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py +++ b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py @@ -88,7 +88,7 @@ def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) From 94c837365ad155ee33d4097f34821ea2a49d5eb4 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Wed, 29 Oct 
2025 12:37:15 +0100 Subject: [PATCH 04/10] fix: restore for inference related code --- .../text-generation/performance/llama3.3-70b.py | 2 +- docs/source/model_doc/diffusers/flux.mdx | 2 +- docs/source/model_doc/diffusers/pixart_alpha.mdx | 2 +- docs/source/model_doc/diffusers/pixart_sigma.mdx | 2 +- optimum/commands/neuron/cache.py | 2 +- optimum/commands/neuron/serve.py | 2 +- optimum/exporters/neuron/__main__.py | 4 ++-- optimum/neuron/cache/entries/cache_entry.py | 2 +- optimum/neuron/generation/utils.py | 11 ++++++----- .../backend/modules/attention/attention_base.py | 12 ++++++------ .../models/inference/backend/pretrained_model.py | 4 ++-- 11 files changed, 23 insertions(+), 22 deletions(-) diff --git a/benchmark/text-generation/performance/llama3.3-70b.py b/benchmark/text-generation/performance/llama3.3-70b.py index d4a29c118..d87bb5c8a 100644 --- a/benchmark/text-generation/performance/llama3.3-70b.py +++ b/benchmark/text-generation/performance/llama3.3-70b.py @@ -32,7 +32,7 @@ def main(): assert neuron_config.sequence_length == seq_length, ( f"Model {model_name} is not configured for sequence length {seq_length}." ) - assert neuron_config.dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16." + assert neuron_config.torch_dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16." model = NeuronModelForCausalLM.from_pretrained(model_id) except Exception: model = NeuronModelForCausalLM.from_pretrained( diff --git a/docs/source/model_doc/diffusers/flux.mdx b/docs/source/model_doc/diffusers/flux.mdx index c68b1554e..a47148517 100644 --- a/docs/source/model_doc/diffusers/flux.mdx +++ b/docs/source/model_doc/diffusers/flux.mdx @@ -40,7 +40,7 @@ if __name__ == "__main__": pipe = NeuronFluxPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", - dtype=torch.bfloat16, + torch_dtype=torch.bfloat16, export=True, tensor_parallel_size=8, **compiler_args, diff --git a/docs/source/model_doc/diffusers/pixart_alpha.mdx b/docs/source/model_doc/diffusers/pixart_alpha.mdx index 375f292cb..31dca7023 100644 --- a/docs/source/model_doc/diffusers/pixart_alpha.mdx +++ b/docs/source/model_doc/diffusers/pixart_alpha.mdx @@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtAlphaPipeline compiler_args = {"auto_cast": "none"} input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120} -neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) +neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) # Save locally neuron_model.save_pretrained("pixart_alpha_neuron_512/") diff --git a/docs/source/model_doc/diffusers/pixart_sigma.mdx b/docs/source/model_doc/diffusers/pixart_sigma.mdx index bc52effdb..59ecbefb7 100644 --- a/docs/source/model_doc/diffusers/pixart_sigma.mdx +++ b/docs/source/model_doc/diffusers/pixart_sigma.mdx @@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtSigmaPipeline compiler_args = {"auto_cast": "none"} input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120} -neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) +neuron_model = 
NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) # Save locally neuron_model.save_pretrained("pixart_sigma_neuron_512/") diff --git a/optimum/commands/neuron/cache.py b/optimum/commands/neuron/cache.py index a8365c89a..cb437c347 100644 --- a/optimum/commands/neuron/cache.py +++ b/optimum/commands/neuron/cache.py @@ -147,7 +147,7 @@ def _list_entries(self): str(entry["batch_size"]), str(entry["sequence_length"]), str(entry.get("tp_degree", entry.get("tensor_parallel_size"))), - str(entry.get("torch_dtype", "unknown")), + str(entry["torch_dtype"]), str(entry["target"]), ) ) diff --git a/optimum/commands/neuron/serve.py b/optimum/commands/neuron/serve.py index cc972a993..d0f7ce0d3 100644 --- a/optimum/commands/neuron/serve.py +++ b/optimum/commands/neuron/serve.py @@ -101,7 +101,7 @@ def run(self): sequence_length = self.args.sequence_length tensor_parallel_size = self.args.tensor_parallel_size config = AutoConfig.from_pretrained(model_name_or_path) - torch_dtype = DTYPE_MAPPER.pt(config.dtype) + torch_dtype = DTYPE_MAPPER.pt(config.torch_dtype) try: # Look for a NeuronConfig in the model directory neuron_config = NeuronConfig.from_pretrained(model_name_or_path) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index eda69a541..afa451485 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -581,7 +581,7 @@ def load_models_and_neuron_configs( "trust_remote_code": trust_remote_code, "framework": "pt", "library_name": library_name, - "dtype": torch_dtype, + "torch_dtype": torch_dtype, } if model is None: model = TasksManager.get_model_from_task(**model_kwargs) @@ -878,7 +878,7 @@ def main(): model_name_or_path=args.model, output=args.output, compiler_kwargs=compiler_kwargs, - torch_dtype=args.dtype, + torch_dtype=args.torch_dtype, tensor_parallel_size=args.tensor_parallel_size, task=task, dynamic_batch_size=args.dynamic_batch_size, diff --git a/optimum/neuron/cache/entries/cache_entry.py b/optimum/neuron/cache/entries/cache_entry.py index b4b7a4680..15bd8c520 100644 --- a/optimum/neuron/cache/entries/cache_entry.py +++ b/optimum/neuron/cache/entries/cache_entry.py @@ -28,7 +28,7 @@ "bos_token_id", "pad_token_id", "torchscript", - "dtype", # this has been renamed as `float_dtype` for the check + "torch_dtype", # this has been renamed as `float_dtype` for the check "_commit_hash", "sample_size", "projection_dim", diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index fc76cde80..fd654b492 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -325,17 +325,18 @@ class NeuronGenerationMixin(GenerationMixin): The class exposes [`~generation.GenerationMixin.generate`], which can be used for: - *greedy decoding* by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and `do_sample=False` + - *contrastive search* by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0` and + `top_k>1` - *multinomial sampling* by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and `do_sample=True` - *beam-search decoding* by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin.beam_sample`] if `num_beams>1` and `do_sample=True` - - Note: The following strategies 
have been removed in transformers 4.56.0+: - - constrained beam-search decoding (constraints and force_words_ids) - - group beam-search decoding (num_beam_groups > 1) - - contrastive search (penalty_alpha > 0) + - *diverse beam-search decoding* by calling [`~generation.GenerationMixin.group_beam_search`], if `num_beams>1` + and `num_beam_groups>1` + - *constrained beam-search decoding* by calling [`~generation.GenerationMixin.constrained_beam_search`], if + `constraints!=None` or `force_words_ids!=None` You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies). diff --git a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py index 9d3c21b7e..37542e575 100644 --- a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py +++ b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py @@ -99,7 +99,7 @@ def __init__( self.head_dim = self.hidden_size // self.num_attention_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta - self.dtype = neuron_config.torch_dtype + self.torch_dtype = neuron_config.torch_dtype self.rms_norm_eps = config.rms_norm_eps self._qk_scale = qk_scale @@ -111,7 +111,7 @@ def __init__( num_attention_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads, tp_degree=neuron_config.tp_degree, - dtype=self.dtype, + dtype=self.torch_dtype, bias=qkv_proj_bias, gather_output=False, fused_qkv=neuron_config.fused_qkv, @@ -125,7 +125,7 @@ def __init__( num_attention_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads, tp_degree=neuron_config.tp_degree, - dtype=self.dtype, + dtype=self.torch_dtype, bias=o_proj_bias, input_is_parallel=True, layer_name=self.o_proj_layer_name, @@ -202,13 +202,13 @@ def perform_prefill(self, Q, K, V, q_len, bsz, attention_mask) -> Tensor: Q = ( Q.permute(0, 1, 3, 2) # after permute: batch, num_heads, d_head, seqlen .reshape((bsz * self.num_heads, self.head_dim, q_len)) - .to(self.dtype) + .to(self.torch_dtype) ) Q = Q * self.qk_scale K_active = ( - K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.dtype) + K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.torch_dtype) ) - V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.dtype) + V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.torch_dtype) # shape: (B*H)DS attn_output = torch.zeros(bsz * self.num_heads, self.head_dim, q_len, dtype=Q.dtype, device=Q.device) diff --git a/optimum/neuron/models/inference/backend/pretrained_model.py b/optimum/neuron/models/inference/backend/pretrained_model.py index 8cad5b6f2..985115de8 100644 --- a/optimum/neuron/models/inference/backend/pretrained_model.py +++ b/optimum/neuron/models/inference/backend/pretrained_model.py @@ -117,7 +117,7 @@ def __init__( self.config = copy.deepcopy(config) self.neuron_config = copy.deepcopy(neuron_config) # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - self.config.dtype = self.neuron_config.torch_dtype + self.config.torch_dtype = self.neuron_config.torch_dtype self._traced_model = traced_model self.graph_builders = graph_builders 
# Required for loading weights @@ -344,7 +344,7 @@ def _export( trust_remote_code=trust_remote_code, ).get_text_config() # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - config.dtype = neuron_config.torch_dtype + config.torch_dtype = neuron_config.torch_dtype # Evaluate head_dim if it is defined but set to null (like in Mixtral for transformers 4.54+) if hasattr(config, "head_dim") and config.head_dim is None: config.head_dim = config.hidden_size // config.num_attention_heads From 741e542a86dda83a1a5770f0a237a174472cfaf7 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Oct 2025 13:55:32 +0000 Subject: [PATCH 05/10] fix: handle dict outputs during padding --- optimum/neuron/modeling_traced.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/neuron/modeling_traced.py b/optimum/neuron/modeling_traced.py index 99184f8f9..7f00214c4 100644 --- a/optimum/neuron/modeling_traced.py +++ b/optimum/neuron/modeling_traced.py @@ -612,7 +612,7 @@ def neuron_padding_manager(self, inputs: dict[str, "torch.Tensor"]): @staticmethod def remove_padding( - outputs: list[torch.Tensor], + outputs: list[torch.Tensor] | dict, dims: list[int], indices: list[int], padding_side: Literal["right", "left"] = "right", @@ -633,6 +633,8 @@ def remove_padding( if len(dims) != len(indices): raise ValueError(f"The size of `dims`({len(dims)}) and indices`({len(indices)}) must be equal.") + if isinstance(outputs, dict): + outputs = list(outputs.values()) for dim, indice in zip(dims, indices): if padding_side == "right": outputs = [ From 0bbe203a7f6319b1a96150ab4435b158f5799308 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Oct 2025 14:27:23 +0000 Subject: [PATCH 06/10] fix: update t5 modeling with new cache utils --- .../neuron/models/inference/t5/modeling_t5.py | 48 +++++++++---------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/optimum/neuron/models/inference/t5/modeling_t5.py b/optimum/neuron/models/inference/t5/modeling_t5.py index 15c3e8c67..49b69ff30 100644 --- a/optimum/neuron/models/inference/t5/modeling_t5.py +++ b/optimum/neuron/models/inference/t5/modeling_t5.py @@ -27,6 +27,7 @@ from torch import nn from transformers import T5Config from transformers.activations import ACT2FN +from transformers.cache_utils import EncoderDecoderCache from transformers.models.t5.modeling_t5 import ( T5Attention, T5DenseActDense, @@ -154,7 +155,7 @@ def forward( mask=None, key_value_states=None, position_bias=None, - past_key_value=None, + past_key_values=None, layer_head_mask=None, query_length=None, use_cache=False, @@ -177,38 +178,38 @@ def forward( batch_size, -1, self.num_attention_heads_per_partition, self.key_value_proj_dim ).transpose(1, 2) - if past_key_value is not None: - is_updated = past_key_value.is_updated.get(self.layer_idx) + # Check if an encoder-decoder model is being used.
Otherwise we'll get `DynamicCache` + is_updated = False + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache - curr_past_key_value = past_key_value.cross_attention_cache + curr_past_key_values = past_key_values.cross_attention_cache else: - curr_past_key_value = past_key_value.self_attention_cache + curr_past_key_values = past_key_values.self_attention_cache + else: + curr_past_key_values = past_key_values current_states = key_value_states if is_cross_attention else hidden_states - if is_cross_attention and past_key_value is not None and is_updated: + if is_cross_attention and past_key_values is not None and is_updated: # reuse k,v, cross_attentions - key_states = curr_past_key_value.key_cache[self.layer_idx] - value_states = curr_past_key_value.value_cache[self.layer_idx] + key_states = curr_past_key_values.layers[self.layer_idx].keys + value_states = curr_past_key_values.layers[self.layer_idx].values else: key_states = self.k(current_states) value_states = self.v(current_states) - key_states = key_states.view( - batch_size, -1, self.num_attention_heads_per_partition, self.key_value_proj_dim - ).transpose(1, 2) - value_states = value_states.view( - batch_size, -1, self.num_attention_heads_per_partition, self.key_value_proj_dim - ).transpose(1, 2) - - if past_key_value is not None: + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + if past_key_values is not None: # save all key/value_states to cache to be re-used for fast auto-regressive generation cache_position = cache_position if not is_cross_attention else None - key_states, value_states = curr_past_key_value.update( + key_states, value_states = curr_past_key_values.update( key_states, value_states, self.layer_idx, {"cache_position": cache_position} ) # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls - if is_cross_attention: - past_key_value.is_updated[self.layer_idx] = True + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 scores = torch.matmul(query_states, key_states.transpose(3, 2)) @@ -235,14 +236,9 @@ def forward( causal_mask = mask[:, :, :, : key_states.shape[-2]] position_bias = position_bias + causal_mask - if self.pruned_heads: - mask = torch.ones(position_bias.shape[1]) - mask[list(self.pruned_heads)] = 0 - position_bias_masked = position_bias[:, mask.bool()] - else: - position_bias_masked = position_bias - + position_bias_masked = position_bias scores += position_bias_masked + # (batch_size, n_heads, seq_length, key_length) attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) From fda56be6501c8128b5b577a81dde038f8693702f Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Oct 2025 14:54:59 +0000 Subject: [PATCH 07/10] fix: generate assistant mode --- .../inference/backend/modules/generation/generation_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
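The reworked T5 attention above follows the newer transformers cache utilities: an `EncoderDecoderCache` wraps a self-attention and a cross-attention `DynamicCache`, per-layer keys/values live under `cache.layers[idx]`, and `is_updated` marks cross-attention entries that can be reused. A rough sketch of that layout, with illustrative shapes (not an exact reproduction of the modeling code):

```python
import torch
from transformers.cache_utils import DynamicCache, EncoderDecoderCache

cache = EncoderDecoderCache(DynamicCache(), DynamicCache())
# Fill layer 0 of the cross-attention cache once, then mark it as reusable.
k = torch.zeros(1, 8, 16, 64)  # (batch, heads, encoder_seq, head_dim)
v = torch.zeros(1, 8, 16, 64)
cache.cross_attention_cache.update(k, v, layer_idx=0)
cache.is_updated[0] = True

print(cache.cross_attention_cache.layers[0].keys.shape)  # torch.Size([1, 8, 16, 64])
print(cache.is_updated.get(0))                           # True
```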
a/optimum/neuron/models/inference/backend/modules/generation/generation_utils.py b/optimum/neuron/models/inference/backend/modules/generation/generation_utils.py index c72bd5762..163690355 100644 --- a/optimum/neuron/models/inference/backend/modules/generation/generation_utils.py +++ b/optimum/neuron/models/inference/backend/modules/generation/generation_utils.py @@ -17,7 +17,7 @@ from typing import Any import torch -from transformers import GenerationConfig +from transformers import GenerationConfig, PreTrainedModel from transformers.generation import GenerationMixin, SampleDecoderOnlyOutput from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.stopping_criteria import StoppingCriteriaList @@ -270,14 +270,13 @@ def _update_model_kwargs_for_generation( def _assisted_decoding( self, input_ids: torch.LongTensor, - candidate_generator: "CandidateGenerator", # noqa stopping_criteria: StoppingCriteriaList, generation_config: GenerationConfig, + assistant_model: "PreTrainedModel | None" = None, **model_kwargs, ): pad_token_id = generation_config.pad_token_id eos_token_id = generation_config.eos_token_id - assistant_model = candidate_generator.assistant_model if assistant_model.neuron_config.on_device_sampling: raise ValueError("Assistant model must not use on-device sampling") From b72a90fe437d03ef37b0f713afeda3a833ebf081 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Oct 2025 15:07:20 +0000 Subject: [PATCH 08/10] fix: diffusers caching tool --- tools/cache/auto_fill_diffusion_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/cache/auto_fill_diffusion_cache.py b/tools/cache/auto_fill_diffusion_cache.py index 3a00c819f..72dcbac45 100644 --- a/tools/cache/auto_fill_diffusion_cache.py +++ b/tools/cache/auto_fill_diffusion_cache.py @@ -219,7 +219,7 @@ def compile_and_cache_model( task=model_config.get("task", None), auto_cast=model_config.get("auto_cast", None), auto_cast_type=model_config.get("auto_cast_type", None), - torch_dtype=model_config.get("dtype", None), + torch_dtype=model_config.get("dtype", None) or model_config.get("torch_dtype", None), ) elif args.hf_model_id is None: raise ValueError("You must provide --hf_model_id to compile a model without a config file.") @@ -235,4 +235,5 @@ def compile_and_cache_model( task=args.task, auto_cast=args.auto_cast, auto_cast_type=args.auto_cast_type, + torch_dtype=args.torch_dtype, ) From bf06f2e969260f1328ec77af521381dd19b7cd4f Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Thu, 30 Oct 2025 16:51:32 +0100 Subject: [PATCH 09/10] fix: remove deprecated tests --- tests/training/test_custom_modeling.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/training/test_custom_modeling.py b/tests/training/test_custom_modeling.py index 29746b9d1..bda6d3b89 100644 --- a/tests/training/test_custom_modeling.py +++ b/tests/training/test_custom_modeling.py @@ -614,8 +614,6 @@ def test_each_pp_rank_only_loads_relevant_parameters(set_cache_for_ci): ("flash_attention_2", "flash_attention_2"), ("eager", "eager"), (None, "eager"), - # Unsupported attention implementation - should default to eager - ("sdpa", "eager"), ], ) @distributed_test(world_size=8, tp_size=2, pp_size=1) From df9fdc53df99c74edfbf861d190b8af12f874000 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Thu, 30 Oct 2025 16:58:59 +0100 Subject: [PATCH 10/10] fix: restore file --- examples/training/qwen3/finetune_qwen3.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
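With `_assisted_decoding` now receiving `assistant_model` directly, assisted generation is driven entirely through `generate`. A usage sketch — the model paths are placeholders, and, per the check kept in the patch, the assistant must not use on-device sampling:

```python
from transformers import AutoTokenizer
from optimum.neuron import NeuronModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("./llama-neuron-8b")        # placeholder paths to
model = NeuronModelForCausalLM.from_pretrained("./llama-neuron-8b")   # already-compiled models
assistant = NeuronModelForCausalLM.from_pretrained("./llama-neuron-1b")

inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```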
a/examples/training/qwen3/finetune_qwen3.sh b/examples/training/qwen3/finetune_qwen3.sh index b2d7568e3..d64a6572d 100755 --- a/examples/training/qwen3/finetune_qwen3.sh +++ b/examples/training/qwen3/finetune_qwen3.sh @@ -13,8 +13,7 @@ TP_DEGREE=8 BS=1 GRADIENT_ACCUMULATION_STEPS=8 LOGGING_STEPS=2 -# MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name -MODEL_NAME="Qwen/Qwen3-0.6B" # Change this to the desired model name +MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name OUTPUT_DIR="$(echo $MODEL_NAME | cut -d'/' -f2)-finetuned" DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE" SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
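For reference, a rough Python-side equivalent of the shell variables restored above (TP degree 8, micro-batch size 1, gradient accumulation 8, logging every 2 steps, `Qwen/Qwen3-8B`); the argument names are assumptions about the optimum-neuron training API and may need adjusting:

```python
from optimum.neuron import NeuronTrainingArguments

training_args = NeuronTrainingArguments(
    output_dir="Qwen3-8B-finetuned",
    tensor_parallel_size=8,          # TP_DEGREE
    per_device_train_batch_size=1,   # BS
    gradient_accumulation_steps=8,   # GRADIENT_ACCUMULATION_STEPS
    logging_steps=2,                 # LOGGING_STEPS
    bf16=True,
)
```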