From d0dfe0689f8737c3cd91656b41122e7433254252 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Wed, 29 Oct 2025 11:33:43 +0100 Subject: [PATCH 01/10] refactor: sync with transformers 4.57.1 --- README.md | 2 +- .../performance/llama3.3-70b.py | 2 +- .../contribute/contribute_for_training.mdx | 24 ++-- docs/source/model_doc/diffusers/flux.mdx | 2 +- .../model_doc/diffusers/pixart_alpha.mdx | 2 +- .../model_doc/diffusers/pixart_sigma.mdx | 2 +- docs/source/quickstart.mdx | 2 +- .../training_tutorials/finetune_llama.mdx | 2 +- .../training_tutorials/finetune_qwen3.mdx | 2 +- examples/training/qwen3/finetune_qwen3.sh | 3 +- optimum/commands/neuron/cache.py | 2 +- optimum/commands/neuron/serve.py | 4 +- optimum/exporters/neuron/__main__.py | 4 +- optimum/neuron/cache/entries/cache_entry.py | 2 +- optimum/neuron/cache/hub_cache.py | 2 +- optimum/neuron/generation/utils.py | 11 +- optimum/neuron/modeling_diffusion.py | 2 +- .../neuron/models/inference/backend/config.py | 6 +- .../modules/attention/attention_base.py | 14 +-- .../modules/decoder/decoder_builder.py | 8 +- .../modules/decoder/decoder_wrapper.py | 4 +- .../modules/kvcache/kv_cache_manager.py | 2 +- .../inference/backend/modules/moe_v2.py | 6 +- .../inference/backend/pretrained_model.py | 12 +- .../inference/granite/modeling_granite.py | 2 +- .../models/inference/llama/modeling_llama.py | 10 +- .../inference/llama4/modeling_llama4.py | 4 +- .../inference/mixtral/modeling_mixtral.py | 2 +- .../neuron/models/inference/modeling_utils.py | 2 +- .../models/inference/qwen2/modeling_qwen2.py | 2 +- .../models/inference/qwen3/modeling_qwen3.py | 2 +- .../inference/qwen3_moe/modeling_qwen3_moe.py | 2 +- .../inference/smollm3/modeling_smollm3.py | 2 +- .../models/training/llama/modeling_llama.py | 20 ++-- .../neuron/models/training/modeling_utils.py | 111 +++++++++++------- .../models/training/qwen3/modeling_qwen3.py | 2 +- optimum/neuron/trainers/sft_trainer.py | 4 +- optimum/neuron/trainers/training_args.py | 9 +- pyproject.toml | 2 +- tools/cache/auto_fill_diffusion_cache.py | 2 +- 40 files changed, 164 insertions(+), 136 deletions(-) diff --git a/README.md b/README.md index 5e7916c4c..e499d696a 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,7 @@ def main(): model = NeuronModelForCausalLM.from_pretrained( model_id, training_args.trn_config, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, attn_implementation="flash_attention_2", # Enable flash attention ) diff --git a/benchmark/text-generation/performance/llama3.3-70b.py b/benchmark/text-generation/performance/llama3.3-70b.py index d87bb5c8a..d4a29c118 100644 --- a/benchmark/text-generation/performance/llama3.3-70b.py +++ b/benchmark/text-generation/performance/llama3.3-70b.py @@ -32,7 +32,7 @@ def main(): assert neuron_config.sequence_length == seq_length, ( f"Model {model_name} is not configured for sequence length {seq_length}." ) - assert neuron_config.torch_dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16." + assert neuron_config.dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16." 
model = NeuronModelForCausalLM.from_pretrained(model_id) except Exception: model = NeuronModelForCausalLM.from_pretrained( diff --git a/docs/source/contribute/contribute_for_training.mdx b/docs/source/contribute/contribute_for_training.mdx index 994dc9991..62475c1e0 100644 --- a/docs/source/contribute/contribute_for_training.mdx +++ b/docs/source/contribute/contribute_for_training.mdx @@ -80,7 +80,7 @@ class YourModelEmbeddings(nn.Module): self.embed_tokens = ParallelEmbedding( config.vocab_size, config.hidden_size, - dtype=config.torch_dtype, + dtype=config.dtype, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, ) ``` @@ -105,7 +105,7 @@ class YourModelMLP(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.down_proj = RowParallelLinear( @@ -114,7 +114,7 @@ class YourModelMLP(nn.Module, CustomModule): bias=False, input_is_parallel=True, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) # Define transformation specs @@ -151,7 +151,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.k_proj = ColumnParallelLinear( config.hidden_size, @@ -159,7 +159,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.v_proj = ColumnParallelLinear( config.hidden_size, @@ -167,7 +167,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.o_proj = RowParallelLinear( @@ -176,7 +176,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, input_is_parallel=True, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) # No transformation specs needed - regular parallel layers @@ -201,7 +201,7 @@ class YourModelAttention(nn.Module, CustomModule): bias=False, gather_output=False, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) # Define transformation specs for fused QKV @@ -246,7 +246,7 @@ class YourModelAttention(nn.Module, CustomModule): sequence_parallel_enabled=trn_config.sequence_parallel_enabled, kv_size_multiplier=self.kv_size_multiplier, fuse_qkv=trn_config.fuse_qkv, - dtype=config.torch_dtype, + dtype=config.dtype, ) # Define transformation specs for GQA QKV @@ -336,7 +336,7 @@ class YourModelForCausalLM(NeuronModelMixin, YourPreTrainedModel): config.vocab_size, bias=False, gather_output=False, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.post_init() @@ -473,7 +473,7 @@ Update `tests/training/test_modeling_auto.py`: @is_trainium_test def test_auto_model_with_supported_architecture(from_pretrained): trn_config = TrainingNeuronConfig() - kwargs = {"torch_dtype": torch.bfloat16} + kwargs = {"dtype": torch.bfloat16} for model_name_or_path in [ "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random", "michaelbenayoun/granite-tiny-4kv-heads-4layers-random", @@ -487,7 +487,7 @@ def test_auto_model_with_supported_architecture(from_pretrained): @is_trainium_test def 
test_auto_model_for_causal_lm_with_supported_architecture(from_pretrained): trn_config = TrainingNeuronConfig() - kwargs = {"torch_dtype": torch.bfloat16} + kwargs = {"dtype": torch.bfloat16} for model_name_or_path in [ "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random", "michaelbenayoun/granite-tiny-4kv-heads-4layers-random", diff --git a/docs/source/model_doc/diffusers/flux.mdx b/docs/source/model_doc/diffusers/flux.mdx index a47148517..c68b1554e 100644 --- a/docs/source/model_doc/diffusers/flux.mdx +++ b/docs/source/model_doc/diffusers/flux.mdx @@ -40,7 +40,7 @@ if __name__ == "__main__": pipe = NeuronFluxPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, export=True, tensor_parallel_size=8, **compiler_args, diff --git a/docs/source/model_doc/diffusers/pixart_alpha.mdx b/docs/source/model_doc/diffusers/pixart_alpha.mdx index 31dca7023..375f292cb 100644 --- a/docs/source/model_doc/diffusers/pixart_alpha.mdx +++ b/docs/source/model_doc/diffusers/pixart_alpha.mdx @@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtAlphaPipeline compiler_args = {"auto_cast": "none"} input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120} -neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) +neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) # Save locally neuron_model.save_pretrained("pixart_alpha_neuron_512/") diff --git a/docs/source/model_doc/diffusers/pixart_sigma.mdx b/docs/source/model_doc/diffusers/pixart_sigma.mdx index 59ecbefb7..bc52effdb 100644 --- a/docs/source/model_doc/diffusers/pixart_sigma.mdx +++ b/docs/source/model_doc/diffusers/pixart_sigma.mdx @@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtSigmaPipeline compiler_args = {"auto_cast": "none"} input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120} -neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) +neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) # Save locally neuron_model.save_pretrained("pixart_sigma_neuron_512/") diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 95ddfc833..e9739de66 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -79,7 +79,7 @@ def main(): model = NeuronModelForCausalLM.from_pretrained( model_id, training_args.trn_config, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, attn_implementation="flash_attention_2", # Enable flash attention ) diff --git a/docs/source/training_tutorials/finetune_llama.mdx b/docs/source/training_tutorials/finetune_llama.mdx index a28f5c04c..4e7ae6697 100644 --- a/docs/source/training_tutorials/finetune_llama.mdx +++ b/docs/source/training_tutorials/finetune_llama.mdx @@ -138,7 +138,7 @@ dtype = torch.bfloat16 if training_args.bf16 else torch.float32 model = NeuronModelForCausalLM.from_pretrained( model_id, trn_config, - torch_dtype=dtype, + dtype=dtype, # Use FlashAttention2 for better performance and to be able to use larger sequence 
lengths. attn_implementation="flash_attention_2", ) diff --git a/docs/source/training_tutorials/finetune_qwen3.mdx b/docs/source/training_tutorials/finetune_qwen3.mdx index 0c9f4f379..1cf4a2d6e 100644 --- a/docs/source/training_tutorials/finetune_qwen3.mdx +++ b/docs/source/training_tutorials/finetune_qwen3.mdx @@ -137,7 +137,7 @@ dtype = torch.bfloat16 if training_args.bf16 else torch.float32 model = NeuronModelForCausalLM.from_pretrained( model_id, trn_config, - torch_dtype=dtype, + dtype=dtype, # Use FlashAttention2 for better performance and to be able to use larger sequence lengths. attn_implementation="flash_attention_2", ) diff --git a/examples/training/qwen3/finetune_qwen3.sh b/examples/training/qwen3/finetune_qwen3.sh index d64a6572d..b2d7568e3 100755 --- a/examples/training/qwen3/finetune_qwen3.sh +++ b/examples/training/qwen3/finetune_qwen3.sh @@ -13,7 +13,8 @@ TP_DEGREE=8 BS=1 GRADIENT_ACCUMULATION_STEPS=8 LOGGING_STEPS=2 -MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name +# MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name +MODEL_NAME="Qwen/Qwen3-0.6B" # Change this to the desired model name OUTPUT_DIR="$(echo $MODEL_NAME | cut -d'/' -f2)-finetuned" DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE" SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) diff --git a/optimum/commands/neuron/cache.py b/optimum/commands/neuron/cache.py index cb437c347..f19fa1459 100644 --- a/optimum/commands/neuron/cache.py +++ b/optimum/commands/neuron/cache.py @@ -147,7 +147,7 @@ def _list_entries(self): str(entry["batch_size"]), str(entry["sequence_length"]), str(entry.get("tp_degree", entry.get("tensor_parallel_size"))), - str(entry["torch_dtype"]), + str(entry["dtype"]), str(entry["target"]), ) ) diff --git a/optimum/commands/neuron/serve.py b/optimum/commands/neuron/serve.py index d0f7ce0d3..e7ebd8fcf 100644 --- a/optimum/commands/neuron/serve.py +++ b/optimum/commands/neuron/serve.py @@ -101,7 +101,7 @@ def run(self): sequence_length = self.args.sequence_length tensor_parallel_size = self.args.tensor_parallel_size config = AutoConfig.from_pretrained(model_name_or_path) - torch_dtype = DTYPE_MAPPER.pt(config.torch_dtype) + torch_dtype = DTYPE_MAPPER.pt(config.dtype) try: # Look for a NeuronConfig in the model directory neuron_config = NeuronConfig.from_pretrained(model_name_or_path) @@ -202,7 +202,7 @@ def run(self): batch_size = selected_entry["batch_size"] sequence_length = selected_entry["sequence_length"] tensor_parallel_size = selected_entry["tp_degree"] - torch_dtype = DTYPE_MAPPER.pt(selected_entry["torch_dtype"]) + torch_dtype = DTYPE_MAPPER.pt(selected_entry["dtype"]) warning_msg = f"{model_id} is not a neuron model, but a cached configuration is available using" warning_msg += f" instance type {instance_type}," warning_msg += f" batch size = {batch_size}," diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index afa451485..eda69a541 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -581,7 +581,7 @@ def load_models_and_neuron_configs( "trust_remote_code": trust_remote_code, "framework": "pt", "library_name": library_name, - "torch_dtype": torch_dtype, + "dtype": torch_dtype, } if model is None: model = TasksManager.get_model_from_task(**model_kwargs) @@ -878,7 +878,7 @@ def main(): model_name_or_path=args.model, output=args.output, compiler_kwargs=compiler_kwargs, - torch_dtype=args.torch_dtype, + torch_dtype=args.dtype, 
tensor_parallel_size=args.tensor_parallel_size, task=task, dynamic_batch_size=args.dynamic_batch_size, diff --git a/optimum/neuron/cache/entries/cache_entry.py b/optimum/neuron/cache/entries/cache_entry.py index 15bd8c520..b4b7a4680 100644 --- a/optimum/neuron/cache/entries/cache_entry.py +++ b/optimum/neuron/cache/entries/cache_entry.py @@ -28,7 +28,7 @@ "bos_token_id", "pad_token_id", "torchscript", - "torch_dtype", # this has been renamed as `float_dtype` for the check + "dtype", # this has been renamed as `float_dtype` for the check "_commit_hash", "sample_size", "projection_dim", diff --git a/optimum/neuron/cache/hub_cache.py b/optimum/neuron/cache/hub_cache.py index 73605e4a3..48f64647b 100644 --- a/optimum/neuron/cache/hub_cache.py +++ b/optimum/neuron/cache/hub_cache.py @@ -427,7 +427,7 @@ def select_hub_cached_entries( continue if torch_dtype is not None: target_value = DTYPE_MAPPER.pt(torch_dtype) if isinstance(torch_dtype, str) else torch_dtype - entry_value = DTYPE_MAPPER.pt(entry.get("torch_dtype")) + entry_value = DTYPE_MAPPER.pt(entry.get("dtype")) if target_value != entry_value: continue selected.append(entry) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index fd654b492..fc76cde80 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -325,18 +325,17 @@ class NeuronGenerationMixin(GenerationMixin): The class exposes [`~generation.GenerationMixin.generate`], which can be used for: - *greedy decoding* by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and `do_sample=False` - - *contrastive search* by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0` and - `top_k>1` - *multinomial sampling* by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and `do_sample=True` - *beam-search decoding* by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin.beam_sample`] if `num_beams>1` and `do_sample=True` - - *diverse beam-search decoding* by calling [`~generation.GenerationMixin.group_beam_search`], if `num_beams>1` - and `num_beam_groups>1` - - *constrained beam-search decoding* by calling [`~generation.GenerationMixin.constrained_beam_search`], if - `constraints!=None` or `force_words_ids!=None` + + Note: The following strategies have been removed in transformers 4.56.0+: + - constrained beam-search decoding (constraints and force_words_ids) + - group beam-search decoding (num_beam_groups > 1) + - contrastive search (penalty_alpha > 0) You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies). 
diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index 4bf9cb212..45d579a36 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -1193,7 +1193,7 @@ def forward( outputs = self.model(*inputs) if self.config.model_type == "t5" and isinstance(outputs, dict): # Flux text encoder 2 - return [outputs["last_hidden_state"].to(self.config.torch_dtype)] + return [outputs["last_hidden_state"].to(self.config.dtype)] if return_dict and not isinstance(outputs, dict): outputs = ModelOutput(dict(zip(self.neuron_config.outputs, outputs))) diff --git a/optimum/neuron/models/inference/backend/config.py b/optimum/neuron/models/inference/backend/config.py index 38209291f..170c2f026 100644 --- a/optimum/neuron/models/inference/backend/config.py +++ b/optimum/neuron/models/inference/backend/config.py @@ -83,9 +83,9 @@ def __init__( self.batch_size = batch_size self.sequence_length = sequence_length self.tp_degree = tp_degree - self.torch_dtype = torch_dtype - if isinstance(self.torch_dtype, str): - self.torch_dtype = DTYPE_MAPPER.pt(self.torch_dtype) + self.dtype = torch_dtype + if isinstance(self.dtype, str): + self.dtype = DTYPE_MAPPER.pt(self.dtype) self.n_active_tokens = self.sequence_length if n_active_tokens is None else n_active_tokens self.output_logits = output_logits diff --git a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py index 37542e575..5ef35b0ba 100644 --- a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py +++ b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py @@ -99,7 +99,7 @@ def __init__( self.head_dim = self.hidden_size // self.num_attention_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta - self.torch_dtype = neuron_config.torch_dtype + self.dtype = neuron_config.dtype self.rms_norm_eps = config.rms_norm_eps self._qk_scale = qk_scale @@ -111,7 +111,7 @@ def __init__( num_attention_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads, tp_degree=neuron_config.tp_degree, - dtype=self.torch_dtype, + dtype=self.dtype, bias=qkv_proj_bias, gather_output=False, fused_qkv=neuron_config.fused_qkv, @@ -125,12 +125,12 @@ def __init__( num_attention_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads, tp_degree=neuron_config.tp_degree, - dtype=self.torch_dtype, + dtype=self.dtype, bias=o_proj_bias, input_is_parallel=True, layer_name=self.o_proj_layer_name, tensor_model_parallel_group=self.tensor_model_parallel_group, - rpl_reduce_dtype=neuron_config.torch_dtype, + rpl_reduce_dtype=neuron_config.dtype, ) self.num_heads = utils.divide(self.qkv_proj.get_num_attention_heads(), neuron_config.tp_degree) self.num_key_value_heads = utils.divide(self.qkv_proj.get_num_key_value_heads(), neuron_config.tp_degree) @@ -202,13 +202,13 @@ def perform_prefill(self, Q, K, V, q_len, bsz, attention_mask) -> Tensor: Q = ( Q.permute(0, 1, 3, 2) # after permute: batch, num_heads, d_head, seqlen .reshape((bsz * self.num_heads, self.head_dim, q_len)) - .to(self.torch_dtype) + .to(self.dtype) ) Q = Q * self.qk_scale K_active = ( - K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.torch_dtype) + K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.dtype) ) - V_active = V_active.reshape((bsz * self.num_heads, 
q_len, self.head_dim)).to(self.torch_dtype) + V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.dtype) # shape: (B*H)DS attn_output = torch.zeros(bsz * self.num_heads, self.head_dim, q_len, dtype=Q.dtype, device=Q.device) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py b/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py index ced8b09dc..8a11576c1 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py @@ -38,8 +38,8 @@ def __init__( self.max_tokens = max_tokens self.active_tokens = active_tokens - if not self.neuron_config.torch_dtype: - self.neuron_config.torch_dtype = torch.float32 + if not self.neuron_config.dtype: + self.neuron_config.dtype = torch.float32 if config.pad_token_id is None: config.pad_token_id = 0 @@ -88,9 +88,9 @@ def load_module(self): float_model = self.model_cls(self.config, self.neuron_config) float_model.eval() - if self.neuron_config.torch_dtype != torch.float32: + if self.neuron_config.dtype != torch.float32: float_model._apply( - lambda t: t.to(self.neuron_config.torch_dtype) + lambda t: t.to(self.neuron_config.dtype) if t.is_floating_point() and t.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2] else t ) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py b/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py index 6f58e3f5a..e1a34bd92 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py @@ -37,8 +37,8 @@ def __init__( self.model = model self.tag = tag - if not self.neuron_config.torch_dtype: - self.neuron_config.torch_dtype = torch.float32 + if not self.neuron_config.dtype: + self.neuron_config.dtype = torch.float32 if config.pad_token_id is None: config.pad_token_id = 0 diff --git a/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py b/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py index 50ecc4d0a..78bee92c6 100644 --- a/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py +++ b/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py @@ -47,7 +47,7 @@ def __init__(self, config: PretrainedConfig, neuron_config: NxDNeuronConfig, **k self._init_kv_shape(config, neuron_config) num_layer = config.num_hidden_layers - dtype = neuron_config.torch_dtype + dtype = neuron_config.dtype self.past_key_values = nn.ParameterList( [nn.Parameter(torch.zeros(self.kv_shape, dtype=dtype), requires_grad=False) for _ in range(num_layer * 2)] ) diff --git a/optimum/neuron/models/inference/backend/modules/moe_v2.py b/optimum/neuron/models/inference/backend/modules/moe_v2.py index 6c4063925..d9789c10b 100644 --- a/optimum/neuron/models/inference/backend/modules/moe_v2.py +++ b/optimum/neuron/models/inference/backend/modules/moe_v2.py @@ -36,7 +36,7 @@ def initialize_moe_module( glu_mlp=neuron_config.glu_mlp, early_expert_affinity_modulation=early_expert_affinity_modulation, ), - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, ) shared_experts = None if n_shared_experts is not None: @@ -45,8 +45,8 @@ def initialize_moe_module( intermediate_size=config.intermediate_size, num_shared_experts=n_shared_experts, hidden_act=config.hidden_act, - dtype=neuron_config.torch_dtype, - 
reduce_dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, + reduce_dtype=neuron_config.dtype, fused_gate_up_projection=fused_shared_experts, ) diff --git a/optimum/neuron/models/inference/backend/pretrained_model.py b/optimum/neuron/models/inference/backend/pretrained_model.py index 985115de8..5a3a24a6f 100644 --- a/optimum/neuron/models/inference/backend/pretrained_model.py +++ b/optimum/neuron/models/inference/backend/pretrained_model.py @@ -117,7 +117,7 @@ def __init__( self.config = copy.deepcopy(config) self.neuron_config = copy.deepcopy(neuron_config) # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - self.config.torch_dtype = self.neuron_config.torch_dtype + self.config.dtype = self.neuron_config.dtype self._traced_model = traced_model self.graph_builders = graph_builders # Required for loading weights @@ -252,11 +252,11 @@ def checkpoint_loader_fn(self, checkpoint_path, config, neuron_config): """This function loads the model's state dictionary and weights from the hf model""" model_sd = self.get_state_dict(checkpoint_path, config, neuron_config) - if neuron_config.torch_dtype != torch.float32: + if neuron_config.dtype != torch.float32: for name, param in model_sd.items(): - if torch.is_floating_point(param) and param.dtype is not neuron_config.torch_dtype: - logger.debug(f"Converting {name} to {neuron_config.torch_dtype}") - model_sd[name] = param.to(neuron_config.torch_dtype) + if torch.is_floating_point(param) and param.dtype is not neuron_config.dtype: + logger.debug(f"Converting {name} to {neuron_config.dtype}") + model_sd[name] = param.to(neuron_config.dtype) return model_sd @classmethod @@ -344,7 +344,7 @@ def _export( trust_remote_code=trust_remote_code, ).get_text_config() # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - config.torch_dtype = neuron_config.torch_dtype + config.dtype = neuron_config.dtype # Evaluate head_dim if it is defined but set to null (like in Mixtral for transformers 4.54+) if hasattr(config, "head_dim") and config.head_dim is None: config.head_dim = config.hidden_size // config.num_attention_heads diff --git a/optimum/neuron/models/inference/granite/modeling_granite.py b/optimum/neuron/models/inference/granite/modeling_granite.py index 72b25fb19..1863ffe92 100644 --- a/optimum/neuron/models/inference/granite/modeling_granite.py +++ b/optimum/neuron/models/inference/granite/modeling_granite.py @@ -104,7 +104,7 @@ def __init__(self, config: GraniteConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/llama/modeling_llama.py b/optimum/neuron/models/inference/llama/modeling_llama.py index 76b02eaec..18161c941 100644 --- a/optimum/neuron/models/inference/llama/modeling_llama.py +++ b/optimum/neuron/models/inference/llama/modeling_llama.py @@ -84,7 +84,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.intermediate_size, bias=mlp_bias, gather_output=False, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, pad=True, ) self.up_proj = ColumnParallelLinear( @@ -92,7 +92,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.intermediate_size, bias=mlp_bias, gather_output=False, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, pad=True, ) 
self.down_proj = RowParallelLinear( @@ -100,9 +100,9 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.hidden_size, bias=mlp_bias, input_is_parallel=True, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, pad=True, - reduce_dtype=neuron_config.torch_dtype, + reduce_dtype=neuron_config.dtype, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -288,7 +288,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/llama4/modeling_llama4.py b/optimum/neuron/models/inference/llama4/modeling_llama4.py index 5a3473aec..7b29dba66 100644 --- a/optimum/neuron/models/inference/llama4/modeling_llama4.py +++ b/optimum/neuron/models/inference/llama4/modeling_llama4.py @@ -209,7 +209,7 @@ def __init__(self, config: Llama4TextConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) @@ -226,7 +226,7 @@ def __init__(self, config: Llama4TextConfig, neuron_config: NxDNeuronConfig): gather_output=not neuron_config.on_device_sampling, bias=False, pad=True, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, ) diff --git a/optimum/neuron/models/inference/mixtral/modeling_mixtral.py b/optimum/neuron/models/inference/mixtral/modeling_mixtral.py index 92adbc093..e7530aa64 100644 --- a/optimum/neuron/models/inference/mixtral/modeling_mixtral.py +++ b/optimum/neuron/models/inference/mixtral/modeling_mixtral.py @@ -206,7 +206,7 @@ def __init__(self, config: MixtralConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, ) self.layers = nn.ModuleList( diff --git a/optimum/neuron/models/inference/modeling_utils.py b/optimum/neuron/models/inference/modeling_utils.py index 36c31afdb..3be29ae78 100644 --- a/optimum/neuron/models/inference/modeling_utils.py +++ b/optimum/neuron/models/inference/modeling_utils.py @@ -138,7 +138,7 @@ def get_neuron_config( batch_size=batch_size, sequence_length=sequence_length, tensor_parallel_size=tensor_parallel_size, - dtype=DTYPE_MAPPER.pt(config.torch_dtype), + dtype=DTYPE_MAPPER.pt(config.dtype), ) @classmethod diff --git a/optimum/neuron/models/inference/qwen2/modeling_qwen2.py b/optimum/neuron/models/inference/qwen2/modeling_qwen2.py index 7cd0d0f71..a000bc07e 100644 --- a/optimum/neuron/models/inference/qwen2/modeling_qwen2.py +++ b/optimum/neuron/models/inference/qwen2/modeling_qwen2.py @@ -76,7 +76,7 @@ def __init__(self, config: Qwen2Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/qwen3/modeling_qwen3.py b/optimum/neuron/models/inference/qwen3/modeling_qwen3.py index 3fb33928b..b35b87dd9 100644 --- a/optimum/neuron/models/inference/qwen3/modeling_qwen3.py +++ b/optimum/neuron/models/inference/qwen3/modeling_qwen3.py @@ -76,7 +76,7 @@ def __init__(self, config: Qwen3Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + 
dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py b/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py index fc64713f6..76eeda130 100644 --- a/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py +++ b/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py @@ -179,7 +179,7 @@ def __init__(self, config: Qwen3MoeConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, ) self.layers = nn.ModuleList( diff --git a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py index b45560202..d4ad65803 100644 --- a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py +++ b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py @@ -88,7 +88,7 @@ def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.torch_dtype, + dtype=neuron_config.dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/training/llama/modeling_llama.py b/optimum/neuron/models/training/llama/modeling_llama.py index f4cfe4c38..45c913626 100644 --- a/optimum/neuron/models/training/llama/modeling_llama.py +++ b/optimum/neuron/models/training/llama/modeling_llama.py @@ -210,7 +210,7 @@ def __init__(self, config, trn_config: TrainingNeuronConfig): init_method=init_method, sequence_parallel_enabled=self.trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.down_proj = RowParallelLinear( self.intermediate_size, @@ -220,7 +220,7 @@ def __init__(self, config, trn_config: TrainingNeuronConfig): init_method=init_method, sequence_parallel_enabled=self.trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) def forward(self, x): @@ -333,7 +333,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ sequence_parallel_enabled=trn_config.sequence_parallel_enabled, kv_size_multiplier=self.kv_size_multiplier, fuse_qkv=trn_config.fuse_qkv, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) gqa_qkv_specs = GQAQKVColumnParallelLinearSpec( @@ -361,7 +361,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.specs.add_spec( FusedLinearsSpec( @@ -382,7 +382,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.k_proj = ColumnParallelLinear( self.hidden_size, @@ -392,7 +392,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.v_proj = ColumnParallelLinear( self.hidden_size, @@ -402,7 +402,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, 
sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.o_proj = RowParallelLinear( self.num_heads * self.head_dim, @@ -412,7 +412,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig, layer_ init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.num_heads = neuronx_dist_utils.divide(config.num_attention_heads, tp_size) self.num_key_value_heads = neuronx_dist_utils.divide( @@ -606,7 +606,7 @@ def __init__(self, config: LlamaConfig, trn_config: TrainingNeuronConfig): self.padding_idx, init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.layers = nn.ModuleList( [LlamaDecoderLayer(config, trn_config, layer_idx) for layer_idx in range(config.num_hidden_layers)] @@ -715,7 +715,7 @@ def __init__(self, config, trn_config: TrainingNeuronConfig): init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, sequence_dimension=0, - dtype=self.config.torch_dtype, + dtype=self.config.dtype, ) self.vocab_size = config.vocab_size // get_tensor_model_parallel_size() diff --git a/optimum/neuron/models/training/modeling_utils.py b/optimum/neuron/models/training/modeling_utils.py index dfd5fd23c..cef3c7c5b 100644 --- a/optimum/neuron/models/training/modeling_utils.py +++ b/optimum/neuron/models/training/modeling_utils.py @@ -61,7 +61,6 @@ get_state_dict_dtype, load_state_dict, no_init_weights, - set_initialized_submodules, ) from transformers.pytorch_utils import id_tensor_storage from transformers.quantizers import AutoHfQuantizer @@ -213,7 +212,7 @@ def _check_and_adjust_attn_implementation( return attn_implementation def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool: - torch_dtype = self.config.torch_dtype + dtype = self.config.dtype if not self._supports_flash_attn: raise ValueError( @@ -221,9 +220,9 @@ def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool: "https://github.com/huggingface/optimum-neuron/issues" ) - if torch_dtype is None: + if dtype is None: logger.warning_once( - "You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour" + "You are attempting to use Flash Attention 2 without specifying a dtype. This might lead to unexpected behaviour" ) # If no error raise by this point, we can return `True` @@ -447,7 +446,14 @@ def _load_pretrained_model( _loaded_keys = [k[len(prefix) + 1 :] for k in loaded_keys] else: _loaded_keys = loaded_keys - not_initialized_submodules = set_initialized_submodules(model, _loaded_keys) + + # Mark loaded parameters/buffers as initialized (transformers 4.56.0+ approach) + for key in model.state_dict(): + if key in _loaded_keys: + param_or_buffer = model.get_parameter_or_buffer(key) + if param_or_buffer is not None: + param_or_buffer._is_hf_initialized = True + # If we're about to tie the output embeds to the input embeds we don't need to init them if ( hasattr(model.config.get_text_config(decoder=True), "tie_word_embeddings") @@ -458,6 +464,22 @@ def _load_pretrained_model( # Still need to initialize if there is a bias term since biases are not tied. 
if not hasattr(output_embeddings, "bias") or output_embeddings.bias is None: output_embeddings._is_hf_initialized = True + + # Set the flag on modules recursively + def set_is_initialized_for_modules(module): + if ( + all(getattr(child, "_is_hf_initialized", False) for child in module.children()) + and all(getattr(param, "_is_hf_initialized", False) for param in module.parameters(recurse=False)) + and all( + getattr(buffer, "_is_hf_initialized", False) + for buffer in module.buffers(recurse=False) + if buffer not in module._non_persistent_buffers_set + ) + ): + module._is_hf_initialized = True + + model.apply(set_is_initialized_for_modules) + not_initialized_submodules = {name: mod for name, mod in model.named_modules() if not getattr(mod, "_is_hf_initialized", False)} else: not_initialized_submodules = dict(model.named_modules()) @@ -704,7 +726,11 @@ def from_pretrained( from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) _fast_init = kwargs.pop("_fast_init", True) + dtype = kwargs.pop("dtype", None) torch_dtype = kwargs.pop("torch_dtype", None) + # For BC on torch_dtype argument (deprecated in favor of dtype) + if torch_dtype is not None: + dtype = dtype if dtype is not None else torch_dtype low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", None) device_map = kwargs.pop("device_map", None) kwargs.pop("max_memory", None) @@ -859,6 +885,9 @@ def from_pretrained( _from_pipeline=from_pipeline, **kwargs, ) + # dtype is a config attribute, not a model parameter, so we remove it from model_kwargs + # Following the transformers pattern where dtype/torch_dtype are popped early from kwargs + model_kwargs.pop("dtype", None) else: config = copy.deepcopy(config) model_kwargs = kwargs @@ -1172,18 +1201,18 @@ def from_pretrained( # 1. If torch_dtype is not None, we use that dtype # 2. 
If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first # weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype - # We also may have config.torch_dtype available, but we won't rely on it till v5 + # We also may have config.dtype available, but we won't rely on it till v5 dtype_orig = None - if torch_dtype is not None: - if isinstance(torch_dtype, str): - if torch_dtype == "auto": - if hasattr(config, "torch_dtype") and config.torch_dtype is not None: - torch_dtype = config.torch_dtype - logger.info(f"Will use torch_dtype={torch_dtype} as defined in model's config object") + if dtype is not None: + if isinstance(dtype, str): + if dtype == "auto": + if hasattr(config, "dtype") and config.dtype is not None: + dtype = config.dtype + logger.info(f"Will use dtype={dtype} as defined in model's config object") else: if is_sharded and "dtype" in sharded_metadata: - torch_dtype = sharded_metadata["dtype"] + dtype = sharded_metadata["dtype"] elif not is_sharded: # ** Difference from original from_pretrained ** # Here we load the state dict only if we end up in this case, otherwise we defer the @@ -1193,52 +1222,52 @@ def from_pretrained( one_time_state_dict = load_state_dict( resolved_archive_file, weights_only=weights_only ) - torch_dtype = get_state_dict_dtype(one_time_state_dict) + dtype = get_state_dict_dtype(one_time_state_dict) del one_time_state_dict - xm.rendezvous(f"auto torch_dtype_{worker}") + xm.rendezvous(f"auto dtype_{worker}") else: one_state_dict = load_state_dict(resolved_archive_file[0], weights_only=weights_only) - torch_dtype = get_state_dict_dtype(one_state_dict) + dtype = get_state_dict_dtype(one_state_dict) del one_state_dict # free CPU memory logger.info( - "Since the `torch_dtype` attribute can't be found in model's config object, " - "will use torch_dtype={torch_dtype} as derived from model's weights" + "Since the `dtype` attribute can't be found in model's config object, " + "will use dtype={dtype} as derived from model's weights" ) - elif hasattr(torch, torch_dtype): - torch_dtype = getattr(torch, torch_dtype) + elif hasattr(torch, dtype): + dtype = getattr(torch, dtype) for sub_config_key in config.sub_configs.keys(): sub_config = getattr(config, sub_config_key) - sub_config.torch_dtype = torch_dtype - elif isinstance(torch_dtype, torch.dtype): + sub_config.dtype = dtype + elif isinstance(dtype, torch.dtype): for sub_config_key in config.sub_configs.keys(): sub_config = getattr(config, sub_config_key) - sub_config.torch_dtype = torch_dtype - elif isinstance(torch_dtype, dict): - for key, curr_dtype in torch_dtype.items(): + sub_config.dtype = dtype + elif isinstance(dtype, dict): + for key, curr_dtype in dtype.items(): if hasattr(config, key): value = getattr(config, key) - value.torch_dtype = curr_dtype + value.dtype = curr_dtype # main torch dtype for modules that aren't part of any sub-config - torch_dtype = torch_dtype.get("") - config.torch_dtype = torch_dtype - if isinstance(torch_dtype, str) and hasattr(torch, torch_dtype): - torch_dtype = getattr(torch, torch_dtype) - elif torch_dtype is None: - torch_dtype = torch.float32 + dtype = dtype.get("") + config.dtype = dtype + if isinstance(dtype, str) and hasattr(torch, dtype): + dtype = getattr(torch, dtype) + elif dtype is None: + dtype = torch.float32 else: raise ValueError( - f"`torch_dtype` can be one of: `torch.dtype`, `'auto'`, a string of a valid `torch.dtype` or a `dict` with valid `torch_dtype` " - f"for each sub-config 
in composite configs, but received {torch_dtype}" + f"`dtype` can be one of: `torch.dtype`, `'auto'`, a string of a valid `torch.dtype` or a `dict` with valid `dtype` " + f"for each sub-config in composite configs, but received {dtype}" ) - dtype_orig = cls._set_default_torch_dtype(torch_dtype) + dtype_orig = cls._set_default_torch_dtype(dtype) else: # set fp32 as the default dtype for BC default_dtype = str(torch.get_default_dtype()).split(".")[-1] - config.torch_dtype = default_dtype + config.dtype = default_dtype for key in config.sub_configs.keys(): value = getattr(config, key) - value.torch_dtype = default_dtype + value.dtype = default_dtype # ** Difference from original from_pretrained ** # We do not handle `use_keep_in_fp32_modules` here since it is not relevant for us. @@ -1264,9 +1293,9 @@ def from_pretrained( config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained. # ** Difference from original from_pretrained ** - # We make sure that config.torch_dtype is of type torch.dtype. + # We make sure that config.dtype is of type torch.dtype. # We do not change the config inplace since we are working from a deepcopy. - config.torch_dtype = torch_dtype + config.dtype = dtype # ** Difference from original from_pretrained ** # We do not support the `tie_word_embeddings` feature in pipeline parallelism. @@ -1316,7 +1345,7 @@ def from_pretrained( sharded_metadata=sharded_metadata, _fast_init=_fast_init, device_map=device_map, - dtype=torch_dtype, + dtype=dtype, weights_only=weights_only, ) @@ -1419,7 +1448,7 @@ def save_pretrained( # save the string version of dtype to the config, e.g. convert torch.float32 => "float32" # we currently don't use this setting automatically, but may start to use with v5 dtype = get_parameter_dtype(model_to_save) - model_to_save.config.torch_dtype = str(dtype).split(".")[1] + model_to_save.config.dtype = str(dtype).split(".")[1] # Attach architecture to the config model_to_save.config.architectures = [model_to_save.__class__.__name__] diff --git a/optimum/neuron/models/training/qwen3/modeling_qwen3.py b/optimum/neuron/models/training/qwen3/modeling_qwen3.py index b5536d910..8b1eddb6d 100644 --- a/optimum/neuron/models/training/qwen3/modeling_qwen3.py +++ b/optimum/neuron/models/training/qwen3/modeling_qwen3.py @@ -197,7 +197,7 @@ def __init__(self, config: Qwen3Config, trn_config: TrainingNeuronConfig): self.padding_idx, init_method=init_method, sequence_parallel_enabled=trn_config.sequence_parallel_enabled, - dtype=config.torch_dtype, + dtype=config.dtype, ) self.layers = nn.ModuleList( [Qwen3DecoderLayer(config, trn_config, layer_idx) for layer_idx in range(config.num_hidden_layers)] diff --git a/optimum/neuron/trainers/sft_trainer.py b/optimum/neuron/trainers/sft_trainer.py index b69e909d2..c9a481bb4 100644 --- a/optimum/neuron/trainers/sft_trainer.py +++ b/optimum/neuron/trainers/sft_trainer.py @@ -138,7 +138,7 @@ def __init__( raise ValueError("You passed model_init_kwargs to the SFTConfig, but your model is already instantiated.") else: model_init_kwargs = args.model_init_kwargs - torch_dtype = model_init_kwargs.get("torch_dtype") + torch_dtype = model_init_kwargs.get("dtype") if torch_dtype is not None: # Convert to `torch.dtype` if an str is passed if isinstance(torch_dtype, str) and torch_dtype != "auto": @@ -147,7 +147,7 @@ def __init__( raise ValueError( f"Invalid `torch_dtype` passed to the SFTConfig. Expected a string with either `torch.dtype` or 'auto', but got {torch_dtype}." 
) - model_init_kwargs["torch_dtype"] = torch_dtype + model_init_kwargs["dtype"] = torch_dtype if isinstance(model, str): logging.warning( diff --git a/optimum/neuron/trainers/training_args.py b/optimum/neuron/trainers/training_args.py index f0e6f164e..b54704843 100644 --- a/optimum/neuron/trainers/training_args.py +++ b/optimum/neuron/trainers/training_args.py @@ -29,10 +29,9 @@ SchedulerType, get_last_checkpoint, ) +from functools import cached_property + from transformers.training_args import OptimizerNames, _convert_str_dict, default_logdir, trainer_log_levels -from transformers.utils import ( - cached_property, -) from ...utils import logging from ..accelerate import NeuronAcceleratorState, NeuronPartialState @@ -759,8 +758,8 @@ def _dict_torch_dtype_to_str(self, d: dict[str, Any]) -> None: converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"* string, which can then be stored in the json format. """ - if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str): - d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1] + if d.get("dtype", None) is not None and not isinstance(d["dtype"], str): + d["dtype"] = str(d["dtype"]).split(".")[1] for value in d.values(): if isinstance(value, dict): self._dict_torch_dtype_to_str(value) diff --git a/pyproject.toml b/pyproject.toml index 6f82fa577..9c1836bba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ - "transformers ~= 4.55.4", + "transformers ~= 4.57.1", "accelerate == 1.8.1", "optimum ~= 1.24.0", "huggingface_hub >= 0.31.4", diff --git a/tools/cache/auto_fill_diffusion_cache.py b/tools/cache/auto_fill_diffusion_cache.py index 7213dda88..3a00c819f 100644 --- a/tools/cache/auto_fill_diffusion_cache.py +++ b/tools/cache/auto_fill_diffusion_cache.py @@ -219,7 +219,7 @@ def compile_and_cache_model( task=model_config.get("task", None), auto_cast=model_config.get("auto_cast", None), auto_cast_type=model_config.get("auto_cast_type", None), - torch_dtype=model_config.get("torch_dtype", None), + torch_dtype=model_config.get("dtype", None), ) elif args.hf_model_id is None: raise ValueError("You must provide --hf_model_id to compile a model without a config file.") From 6fe874b29e205368dd6dcdb9eb5008620e56a09c Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Wed, 29 Oct 2025 12:10:21 +0100 Subject: [PATCH 02/10] fix: bugs linked to refactor --- optimum/neuron/models/training/modeling_utils.py | 13 +++++++------ optimum/neuron/trainers/training_args.py | 3 +-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/optimum/neuron/models/training/modeling_utils.py b/optimum/neuron/models/training/modeling_utils.py index cef3c7c5b..31efbab17 100644 --- a/optimum/neuron/models/training/modeling_utils.py +++ b/optimum/neuron/models/training/modeling_utils.py @@ -469,7 +469,9 @@ def _load_pretrained_model( def set_is_initialized_for_modules(module): if ( all(getattr(child, "_is_hf_initialized", False) for child in module.children()) - and all(getattr(param, "_is_hf_initialized", False) for param in module.parameters(recurse=False)) + and all( + getattr(param, "_is_hf_initialized", False) for param in module.parameters(recurse=False) + ) and all( getattr(buffer, "_is_hf_initialized", False) for buffer in module.buffers(recurse=False) @@ -479,7 +481,9 @@ def set_is_initialized_for_modules(module): module._is_hf_initialized = True 
model.apply(set_is_initialized_for_modules) - not_initialized_submodules = {name: mod for name, mod in model.named_modules() if not getattr(mod, "_is_hf_initialized", False)} + not_initialized_submodules = { + name: mod for name, mod in model.named_modules() if not getattr(mod, "_is_hf_initialized", False) + } else: not_initialized_submodules = dict(model.named_modules()) @@ -885,9 +889,6 @@ def from_pretrained( _from_pipeline=from_pipeline, **kwargs, ) - # dtype is a config attribute, not a model parameter, so we remove it from model_kwargs - # Following the transformers pattern where dtype/torch_dtype are popped early from kwargs - model_kwargs.pop("dtype", None) else: config = copy.deepcopy(config) model_kwargs = kwargs @@ -1260,7 +1261,7 @@ def from_pretrained( f"for each sub-config in composite configs, but received {dtype}" ) - dtype_orig = cls._set_default_torch_dtype(dtype) + dtype_orig = cls._set_default_dtype(dtype) else: # set fp32 as the default dtype for BC default_dtype = str(torch.get_default_dtype()).split(".")[-1] diff --git a/optimum/neuron/trainers/training_args.py b/optimum/neuron/trainers/training_args.py index b54704843..e692d38e3 100644 --- a/optimum/neuron/trainers/training_args.py +++ b/optimum/neuron/trainers/training_args.py @@ -18,6 +18,7 @@ import os from dataclasses import dataclass, field, fields from enum import Enum +from functools import cached_property from typing import Any import torch @@ -29,8 +30,6 @@ SchedulerType, get_last_checkpoint, ) -from functools import cached_property - from transformers.training_args import OptimizerNames, _convert_str_dict, default_logdir, trainer_log_levels from ...utils import logging From 1e910a52730c3fff937bdb3fdd1b9fbaf8873de6 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Wed, 29 Oct 2025 12:27:38 +0100 Subject: [PATCH 03/10] fix: restore for inference related code --- optimum/commands/neuron/cache.py | 2 +- optimum/commands/neuron/serve.py | 2 +- optimum/neuron/cache/hub_cache.py | 2 +- optimum/neuron/models/inference/backend/config.py | 6 +++--- .../backend/modules/attention/attention_base.py | 4 ++-- .../backend/modules/decoder/decoder_builder.py | 8 ++++---- .../backend/modules/decoder/decoder_wrapper.py | 4 ++-- .../backend/modules/kvcache/kv_cache_manager.py | 2 +- .../models/inference/backend/modules/moe_v2.py | 6 +++--- .../models/inference/backend/pretrained_model.py | 12 ++++++------ .../models/inference/granite/modeling_granite.py | 2 +- .../neuron/models/inference/llama/modeling_llama.py | 10 +++++----- .../models/inference/llama4/modeling_llama4.py | 4 ++-- .../models/inference/mixtral/modeling_mixtral.py | 2 +- .../neuron/models/inference/qwen2/modeling_qwen2.py | 2 +- .../neuron/models/inference/qwen3/modeling_qwen3.py | 2 +- .../models/inference/qwen3_moe/modeling_qwen3_moe.py | 2 +- .../models/inference/smollm3/modeling_smollm3.py | 2 +- 18 files changed, 37 insertions(+), 37 deletions(-) diff --git a/optimum/commands/neuron/cache.py b/optimum/commands/neuron/cache.py index f19fa1459..a8365c89a 100644 --- a/optimum/commands/neuron/cache.py +++ b/optimum/commands/neuron/cache.py @@ -147,7 +147,7 @@ def _list_entries(self): str(entry["batch_size"]), str(entry["sequence_length"]), str(entry.get("tp_degree", entry.get("tensor_parallel_size"))), - str(entry["dtype"]), + str(entry.get("torch_dtype", "unknown")), str(entry["target"]), ) ) diff --git a/optimum/commands/neuron/serve.py b/optimum/commands/neuron/serve.py index e7ebd8fcf..cc972a993 100644 --- a/optimum/commands/neuron/serve.py +++ 
b/optimum/commands/neuron/serve.py @@ -202,7 +202,7 @@ def run(self): batch_size = selected_entry["batch_size"] sequence_length = selected_entry["sequence_length"] tensor_parallel_size = selected_entry["tp_degree"] - torch_dtype = DTYPE_MAPPER.pt(selected_entry["dtype"]) + torch_dtype = DTYPE_MAPPER.pt(selected_entry["torch_dtype"]) warning_msg = f"{model_id} is not a neuron model, but a cached configuration is available using" warning_msg += f" instance type {instance_type}," warning_msg += f" batch size = {batch_size}," diff --git a/optimum/neuron/cache/hub_cache.py b/optimum/neuron/cache/hub_cache.py index 48f64647b..73605e4a3 100644 --- a/optimum/neuron/cache/hub_cache.py +++ b/optimum/neuron/cache/hub_cache.py @@ -427,7 +427,7 @@ def select_hub_cached_entries( continue if torch_dtype is not None: target_value = DTYPE_MAPPER.pt(torch_dtype) if isinstance(torch_dtype, str) else torch_dtype - entry_value = DTYPE_MAPPER.pt(entry.get("dtype")) + entry_value = DTYPE_MAPPER.pt(entry.get("torch_dtype")) if target_value != entry_value: continue selected.append(entry) diff --git a/optimum/neuron/models/inference/backend/config.py b/optimum/neuron/models/inference/backend/config.py index 170c2f026..38209291f 100644 --- a/optimum/neuron/models/inference/backend/config.py +++ b/optimum/neuron/models/inference/backend/config.py @@ -83,9 +83,9 @@ def __init__( self.batch_size = batch_size self.sequence_length = sequence_length self.tp_degree = tp_degree - self.dtype = torch_dtype - if isinstance(self.dtype, str): - self.dtype = DTYPE_MAPPER.pt(self.dtype) + self.torch_dtype = torch_dtype + if isinstance(self.torch_dtype, str): + self.torch_dtype = DTYPE_MAPPER.pt(self.torch_dtype) self.n_active_tokens = self.sequence_length if n_active_tokens is None else n_active_tokens self.output_logits = output_logits diff --git a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py index 5ef35b0ba..9d3c21b7e 100644 --- a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py +++ b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py @@ -99,7 +99,7 @@ def __init__( self.head_dim = self.hidden_size // self.num_attention_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta - self.dtype = neuron_config.dtype + self.dtype = neuron_config.torch_dtype self.rms_norm_eps = config.rms_norm_eps self._qk_scale = qk_scale @@ -130,7 +130,7 @@ def __init__( input_is_parallel=True, layer_name=self.o_proj_layer_name, tensor_model_parallel_group=self.tensor_model_parallel_group, - rpl_reduce_dtype=neuron_config.dtype, + rpl_reduce_dtype=neuron_config.torch_dtype, ) self.num_heads = utils.divide(self.qkv_proj.get_num_attention_heads(), neuron_config.tp_degree) self.num_key_value_heads = utils.divide(self.qkv_proj.get_num_key_value_heads(), neuron_config.tp_degree) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py b/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py index 8a11576c1..ced8b09dc 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py @@ -38,8 +38,8 @@ def __init__( self.max_tokens = max_tokens self.active_tokens = active_tokens - if not self.neuron_config.dtype: - self.neuron_config.dtype = torch.float32 + if not self.neuron_config.torch_dtype: + 
self.neuron_config.torch_dtype = torch.float32 if config.pad_token_id is None: config.pad_token_id = 0 @@ -88,9 +88,9 @@ def load_module(self): float_model = self.model_cls(self.config, self.neuron_config) float_model.eval() - if self.neuron_config.dtype != torch.float32: + if self.neuron_config.torch_dtype != torch.float32: float_model._apply( - lambda t: t.to(self.neuron_config.dtype) + lambda t: t.to(self.neuron_config.torch_dtype) if t.is_floating_point() and t.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2] else t ) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py b/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py index e1a34bd92..6f58e3f5a 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/decoder_wrapper.py @@ -37,8 +37,8 @@ def __init__( self.model = model self.tag = tag - if not self.neuron_config.dtype: - self.neuron_config.dtype = torch.float32 + if not self.neuron_config.torch_dtype: + self.neuron_config.torch_dtype = torch.float32 if config.pad_token_id is None: config.pad_token_id = 0 diff --git a/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py b/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py index 78bee92c6..50ecc4d0a 100644 --- a/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py +++ b/optimum/neuron/models/inference/backend/modules/kvcache/kv_cache_manager.py @@ -47,7 +47,7 @@ def __init__(self, config: PretrainedConfig, neuron_config: NxDNeuronConfig, **k self._init_kv_shape(config, neuron_config) num_layer = config.num_hidden_layers - dtype = neuron_config.dtype + dtype = neuron_config.torch_dtype self.past_key_values = nn.ParameterList( [nn.Parameter(torch.zeros(self.kv_shape, dtype=dtype), requires_grad=False) for _ in range(num_layer * 2)] ) diff --git a/optimum/neuron/models/inference/backend/modules/moe_v2.py b/optimum/neuron/models/inference/backend/modules/moe_v2.py index d9789c10b..6c4063925 100644 --- a/optimum/neuron/models/inference/backend/modules/moe_v2.py +++ b/optimum/neuron/models/inference/backend/modules/moe_v2.py @@ -36,7 +36,7 @@ def initialize_moe_module( glu_mlp=neuron_config.glu_mlp, early_expert_affinity_modulation=early_expert_affinity_modulation, ), - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, ) shared_experts = None if n_shared_experts is not None: @@ -45,8 +45,8 @@ def initialize_moe_module( intermediate_size=config.intermediate_size, num_shared_experts=n_shared_experts, hidden_act=config.hidden_act, - dtype=neuron_config.dtype, - reduce_dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, + reduce_dtype=neuron_config.torch_dtype, fused_gate_up_projection=fused_shared_experts, ) diff --git a/optimum/neuron/models/inference/backend/pretrained_model.py b/optimum/neuron/models/inference/backend/pretrained_model.py index 5a3a24a6f..8cad5b6f2 100644 --- a/optimum/neuron/models/inference/backend/pretrained_model.py +++ b/optimum/neuron/models/inference/backend/pretrained_model.py @@ -117,7 +117,7 @@ def __init__( self.config = copy.deepcopy(config) self.neuron_config = copy.deepcopy(neuron_config) # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - self.config.dtype = self.neuron_config.dtype + self.config.dtype = self.neuron_config.torch_dtype self._traced_model = traced_model self.graph_builders = 
graph_builders # Required for loading weights @@ -252,11 +252,11 @@ def checkpoint_loader_fn(self, checkpoint_path, config, neuron_config): """This function loads the model's state dictionary and weights from the hf model""" model_sd = self.get_state_dict(checkpoint_path, config, neuron_config) - if neuron_config.dtype != torch.float32: + if neuron_config.torch_dtype != torch.float32: for name, param in model_sd.items(): - if torch.is_floating_point(param) and param.dtype is not neuron_config.dtype: - logger.debug(f"Converting {name} to {neuron_config.dtype}") - model_sd[name] = param.to(neuron_config.dtype) + if torch.is_floating_point(param) and param.dtype is not neuron_config.torch_dtype: + logger.debug(f"Converting {name} to {neuron_config.torch_dtype}") + model_sd[name] = param.to(neuron_config.torch_dtype) return model_sd @classmethod @@ -344,7 +344,7 @@ def _export( trust_remote_code=trust_remote_code, ).get_text_config() # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - config.dtype = neuron_config.dtype + config.dtype = neuron_config.torch_dtype # Evaluate head_dim if it is defined but set to null (like in Mixtral for transformers 4.54+) if hasattr(config, "head_dim") and config.head_dim is None: config.head_dim = config.hidden_size // config.num_attention_heads diff --git a/optimum/neuron/models/inference/granite/modeling_granite.py b/optimum/neuron/models/inference/granite/modeling_granite.py index 1863ffe92..72b25fb19 100644 --- a/optimum/neuron/models/inference/granite/modeling_granite.py +++ b/optimum/neuron/models/inference/granite/modeling_granite.py @@ -104,7 +104,7 @@ def __init__(self, config: GraniteConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/llama/modeling_llama.py b/optimum/neuron/models/inference/llama/modeling_llama.py index 18161c941..76b02eaec 100644 --- a/optimum/neuron/models/inference/llama/modeling_llama.py +++ b/optimum/neuron/models/inference/llama/modeling_llama.py @@ -84,7 +84,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.intermediate_size, bias=mlp_bias, gather_output=False, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, pad=True, ) self.up_proj = ColumnParallelLinear( @@ -92,7 +92,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.intermediate_size, bias=mlp_bias, gather_output=False, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, pad=True, ) self.down_proj = RowParallelLinear( @@ -100,9 +100,9 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): self.hidden_size, bias=mlp_bias, input_is_parallel=True, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, pad=True, - reduce_dtype=neuron_config.dtype, + reduce_dtype=neuron_config.torch_dtype, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -288,7 +288,7 @@ def __init__(self, config: LlamaConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/llama4/modeling_llama4.py b/optimum/neuron/models/inference/llama4/modeling_llama4.py index 7b29dba66..5a3473aec 100644 --- 
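The `checkpoint_loader_fn` hunk above casts every floating-point tensor in the checkpoint to `neuron_config.torch_dtype` and leaves integer tensors alone. A self-contained sketch of that pattern, with made-up tensor names:

```python
import torch

def cast_floats(state_dict: dict[str, torch.Tensor], dtype: torch.dtype = torch.bfloat16) -> dict[str, torch.Tensor]:
    # Only floating-point tensors are converted; token ids, masks, etc. keep their dtype.
    for name, param in state_dict.items():
        if torch.is_floating_point(param) and param.dtype is not dtype:
            state_dict[name] = param.to(dtype)
    return state_dict

sd = cast_floats({"embed_tokens.weight": torch.randn(8, 4), "position_ids": torch.arange(4)})
print(sd["embed_tokens.weight"].dtype, sd["position_ids"].dtype)  # torch.bfloat16 torch.int64
```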
a/optimum/neuron/models/inference/llama4/modeling_llama4.py +++ b/optimum/neuron/models/inference/llama4/modeling_llama4.py @@ -209,7 +209,7 @@ def __init__(self, config: Llama4TextConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) @@ -226,7 +226,7 @@ def __init__(self, config: Llama4TextConfig, neuron_config: NxDNeuronConfig): gather_output=not neuron_config.on_device_sampling, bias=False, pad=True, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, ) diff --git a/optimum/neuron/models/inference/mixtral/modeling_mixtral.py b/optimum/neuron/models/inference/mixtral/modeling_mixtral.py index e7530aa64..92adbc093 100644 --- a/optimum/neuron/models/inference/mixtral/modeling_mixtral.py +++ b/optimum/neuron/models/inference/mixtral/modeling_mixtral.py @@ -206,7 +206,7 @@ def __init__(self, config: MixtralConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, ) self.layers = nn.ModuleList( diff --git a/optimum/neuron/models/inference/qwen2/modeling_qwen2.py b/optimum/neuron/models/inference/qwen2/modeling_qwen2.py index a000bc07e..7cd0d0f71 100644 --- a/optimum/neuron/models/inference/qwen2/modeling_qwen2.py +++ b/optimum/neuron/models/inference/qwen2/modeling_qwen2.py @@ -76,7 +76,7 @@ def __init__(self, config: Qwen2Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/qwen3/modeling_qwen3.py b/optimum/neuron/models/inference/qwen3/modeling_qwen3.py index b35b87dd9..3fb33928b 100644 --- a/optimum/neuron/models/inference/qwen3/modeling_qwen3.py +++ b/optimum/neuron/models/inference/qwen3/modeling_qwen3.py @@ -76,7 +76,7 @@ def __init__(self, config: Qwen3Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) diff --git a/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py b/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py index 76eeda130..fc64713f6 100644 --- a/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py +++ b/optimum/neuron/models/inference/qwen3_moe/modeling_qwen3_moe.py @@ -179,7 +179,7 @@ def __init__(self, config: Qwen3MoeConfig, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, ) self.layers = nn.ModuleList( diff --git a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py index d4ad65803..b45560202 100644 --- a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py +++ b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py @@ -88,7 +88,7 @@ def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig): config.vocab_size, config.hidden_size, config.pad_token_id, - dtype=neuron_config.dtype, + dtype=neuron_config.torch_dtype, shard_across_embedding=True, pad=True, ) From 94c837365ad155ee33d4097f34821ea2a49d5eb4 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Wed, 29 Oct 
2025 12:37:15 +0100 Subject: [PATCH 04/10] fix: restore for inference related code --- .../text-generation/performance/llama3.3-70b.py | 2 +- docs/source/model_doc/diffusers/flux.mdx | 2 +- docs/source/model_doc/diffusers/pixart_alpha.mdx | 2 +- docs/source/model_doc/diffusers/pixart_sigma.mdx | 2 +- optimum/commands/neuron/cache.py | 2 +- optimum/commands/neuron/serve.py | 2 +- optimum/exporters/neuron/__main__.py | 4 ++-- optimum/neuron/cache/entries/cache_entry.py | 2 +- optimum/neuron/generation/utils.py | 11 ++++++----- .../backend/modules/attention/attention_base.py | 12 ++++++------ .../models/inference/backend/pretrained_model.py | 4 ++-- 11 files changed, 23 insertions(+), 22 deletions(-) diff --git a/benchmark/text-generation/performance/llama3.3-70b.py b/benchmark/text-generation/performance/llama3.3-70b.py index d4a29c118..d87bb5c8a 100644 --- a/benchmark/text-generation/performance/llama3.3-70b.py +++ b/benchmark/text-generation/performance/llama3.3-70b.py @@ -32,7 +32,7 @@ def main(): assert neuron_config.sequence_length == seq_length, ( f"Model {model_name} is not configured for sequence length {seq_length}." ) - assert neuron_config.dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16." + assert neuron_config.torch_dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16." model = NeuronModelForCausalLM.from_pretrained(model_id) except Exception: model = NeuronModelForCausalLM.from_pretrained( diff --git a/docs/source/model_doc/diffusers/flux.mdx b/docs/source/model_doc/diffusers/flux.mdx index c68b1554e..a47148517 100644 --- a/docs/source/model_doc/diffusers/flux.mdx +++ b/docs/source/model_doc/diffusers/flux.mdx @@ -40,7 +40,7 @@ if __name__ == "__main__": pipe = NeuronFluxPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", - dtype=torch.bfloat16, + torch_dtype=torch.bfloat16, export=True, tensor_parallel_size=8, **compiler_args, diff --git a/docs/source/model_doc/diffusers/pixart_alpha.mdx b/docs/source/model_doc/diffusers/pixart_alpha.mdx index 375f292cb..31dca7023 100644 --- a/docs/source/model_doc/diffusers/pixart_alpha.mdx +++ b/docs/source/model_doc/diffusers/pixart_alpha.mdx @@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtAlphaPipeline compiler_args = {"auto_cast": "none"} input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120} -neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) +neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) # Save locally neuron_model.save_pretrained("pixart_alpha_neuron_512/") diff --git a/docs/source/model_doc/diffusers/pixart_sigma.mdx b/docs/source/model_doc/diffusers/pixart_sigma.mdx index bc52effdb..59ecbefb7 100644 --- a/docs/source/model_doc/diffusers/pixart_sigma.mdx +++ b/docs/source/model_doc/diffusers/pixart_sigma.mdx @@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtSigmaPipeline compiler_args = {"auto_cast": "none"} input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120} -neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) +neuron_model = 
NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes) # Save locally neuron_model.save_pretrained("pixart_sigma_neuron_512/") diff --git a/optimum/commands/neuron/cache.py b/optimum/commands/neuron/cache.py index a8365c89a..cb437c347 100644 --- a/optimum/commands/neuron/cache.py +++ b/optimum/commands/neuron/cache.py @@ -147,7 +147,7 @@ def _list_entries(self): str(entry["batch_size"]), str(entry["sequence_length"]), str(entry.get("tp_degree", entry.get("tensor_parallel_size"))), - str(entry.get("torch_dtype", "unknown")), + str(entry["torch_dtype"]), str(entry["target"]), ) ) diff --git a/optimum/commands/neuron/serve.py b/optimum/commands/neuron/serve.py index cc972a993..d0f7ce0d3 100644 --- a/optimum/commands/neuron/serve.py +++ b/optimum/commands/neuron/serve.py @@ -101,7 +101,7 @@ def run(self): sequence_length = self.args.sequence_length tensor_parallel_size = self.args.tensor_parallel_size config = AutoConfig.from_pretrained(model_name_or_path) - torch_dtype = DTYPE_MAPPER.pt(config.dtype) + torch_dtype = DTYPE_MAPPER.pt(config.torch_dtype) try: # Look for a NeuronConfig in the model directory neuron_config = NeuronConfig.from_pretrained(model_name_or_path) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index eda69a541..afa451485 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -581,7 +581,7 @@ def load_models_and_neuron_configs( "trust_remote_code": trust_remote_code, "framework": "pt", "library_name": library_name, - "dtype": torch_dtype, + "torch_dtype": torch_dtype, } if model is None: model = TasksManager.get_model_from_task(**model_kwargs) @@ -878,7 +878,7 @@ def main(): model_name_or_path=args.model, output=args.output, compiler_kwargs=compiler_kwargs, - torch_dtype=args.dtype, + torch_dtype=args.torch_dtype, tensor_parallel_size=args.tensor_parallel_size, task=task, dynamic_batch_size=args.dynamic_batch_size, diff --git a/optimum/neuron/cache/entries/cache_entry.py b/optimum/neuron/cache/entries/cache_entry.py index b4b7a4680..15bd8c520 100644 --- a/optimum/neuron/cache/entries/cache_entry.py +++ b/optimum/neuron/cache/entries/cache_entry.py @@ -28,7 +28,7 @@ "bos_token_id", "pad_token_id", "torchscript", - "dtype", # this has been renamed as `float_dtype` for the check + "torch_dtype", # this has been renamed as `float_dtype` for the check "_commit_hash", "sample_size", "projection_dim", diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index fc76cde80..fd654b492 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -325,17 +325,18 @@ class NeuronGenerationMixin(GenerationMixin): The class exposes [`~generation.GenerationMixin.generate`], which can be used for: - *greedy decoding* by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and `do_sample=False` + - *contrastive search* by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0` and + `top_k>1` - *multinomial sampling* by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and `do_sample=True` - *beam-search decoding* by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin.beam_sample`] if `num_beams>1` and `do_sample=True` - - Note: The following strategies 
have been removed in transformers 4.56.0+: - - constrained beam-search decoding (constraints and force_words_ids) - - group beam-search decoding (num_beam_groups > 1) - - contrastive search (penalty_alpha > 0) + - *diverse beam-search decoding* by calling [`~generation.GenerationMixin.group_beam_search`], if `num_beams>1` + and `num_beam_groups>1` + - *constrained beam-search decoding* by calling [`~generation.GenerationMixin.constrained_beam_search`], if + `constraints!=None` or `force_words_ids!=None` You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies). diff --git a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py index 9d3c21b7e..37542e575 100644 --- a/optimum/neuron/models/inference/backend/modules/attention/attention_base.py +++ b/optimum/neuron/models/inference/backend/modules/attention/attention_base.py @@ -99,7 +99,7 @@ def __init__( self.head_dim = self.hidden_size // self.num_attention_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta - self.dtype = neuron_config.torch_dtype + self.torch_dtype = neuron_config.torch_dtype self.rms_norm_eps = config.rms_norm_eps self._qk_scale = qk_scale @@ -111,7 +111,7 @@ def __init__( num_attention_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads, tp_degree=neuron_config.tp_degree, - dtype=self.dtype, + dtype=self.torch_dtype, bias=qkv_proj_bias, gather_output=False, fused_qkv=neuron_config.fused_qkv, @@ -125,7 +125,7 @@ def __init__( num_attention_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads, tp_degree=neuron_config.tp_degree, - dtype=self.dtype, + dtype=self.torch_dtype, bias=o_proj_bias, input_is_parallel=True, layer_name=self.o_proj_layer_name, @@ -202,13 +202,13 @@ def perform_prefill(self, Q, K, V, q_len, bsz, attention_mask) -> Tensor: Q = ( Q.permute(0, 1, 3, 2) # after permute: batch, num_heads, d_head, seqlen .reshape((bsz * self.num_heads, self.head_dim, q_len)) - .to(self.dtype) + .to(self.torch_dtype) ) Q = Q * self.qk_scale K_active = ( - K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.dtype) + K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.torch_dtype) ) - V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.dtype) + V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.torch_dtype) # shape: (B*H)DS attn_output = torch.zeros(bsz * self.num_heads, self.head_dim, q_len, dtype=Q.dtype, device=Q.device) diff --git a/optimum/neuron/models/inference/backend/pretrained_model.py b/optimum/neuron/models/inference/backend/pretrained_model.py index 8cad5b6f2..985115de8 100644 --- a/optimum/neuron/models/inference/backend/pretrained_model.py +++ b/optimum/neuron/models/inference/backend/pretrained_model.py @@ -117,7 +117,7 @@ def __init__( self.config = copy.deepcopy(config) self.neuron_config = copy.deepcopy(neuron_config) # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - self.config.dtype = self.neuron_config.torch_dtype + self.config.torch_dtype = self.neuron_config.torch_dtype self._traced_model = traced_model self.graph_builders = graph_builders 
# Required for loading weights @@ -344,7 +344,7 @@ def _export( trust_remote_code=trust_remote_code, ).get_text_config() # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type - config.dtype = neuron_config.torch_dtype + config.torch_dtype = neuron_config.torch_dtype # Evaluate head_dim if it is defined but set to null (like in Mixtral for transformers 4.54+) if hasattr(config, "head_dim") and config.head_dim is None: config.head_dim = config.hidden_size // config.num_attention_heads From 741e542a86dda83a1a5770f0a237a174472cfaf7 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Oct 2025 13:55:32 +0000 Subject: [PATCH 05/10] fix: handle dict outputs during padding --- optimum/neuron/modeling_traced.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/neuron/modeling_traced.py b/optimum/neuron/modeling_traced.py index 99184f8f9..7f00214c4 100644 --- a/optimum/neuron/modeling_traced.py +++ b/optimum/neuron/modeling_traced.py @@ -612,7 +612,7 @@ def neuron_padding_manager(self, inputs: dict[str, "torch.Tensor"]): @staticmethod def remove_padding( - outputs: list[torch.Tensor], + outputs: list[torch.Tensor] | dict, dims: list[int], indices: list[int], padding_side: Literal["right", "left"] = "right", @@ -633,6 +633,8 @@ def remove_padding( if len(dims) != len(indices): raise ValueError(f"The size of `dims`({len(dims)}) and indices`({len(indices)}) must be equal.") + if isinstance(outputs, dict): + outputs = list(outputs.values()) for dim, indice in zip(dims, indices): if padding_side == "right": outputs = [ From 0bbe203a7f6319b1a96150ab4435b158f5799308 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Oct 2025 14:27:23 +0000 Subject: [PATCH 06/10] fix: update t5 modeling with new cache utils --- .../neuron/models/inference/t5/modeling_t5.py | 48 +++++++++---------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/optimum/neuron/models/inference/t5/modeling_t5.py b/optimum/neuron/models/inference/t5/modeling_t5.py index 15c3e8c67..49b69ff30 100644 --- a/optimum/neuron/models/inference/t5/modeling_t5.py +++ b/optimum/neuron/models/inference/t5/modeling_t5.py @@ -27,6 +27,7 @@ from torch import nn from transformers import T5Config from transformers.activations import ACT2FN +from transformers.cache_utils import EncoderDecoderCache from transformers.models.t5.modeling_t5 import ( T5Attention, T5DenseActDense, @@ -154,7 +155,7 @@ def forward( mask=None, key_value_states=None, position_bias=None, - past_key_value=None, + past_key_values=None, layer_head_mask=None, query_length=None, use_cache=False, @@ -177,38 +178,38 @@ def forward( batch_size, -1, self.num_attention_heads_per_partition, self.key_value_proj_dim ).transpose(1, 2) - if past_key_value is not None: - is_updated = past_key_value.is_updated.get(self.layer_idx) + # Check if an encoder-decoder model is being used.
Otherwise we'll get `DynamicCache` + is_updated = False + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache - curr_past_key_value = past_key_value.cross_attention_cache + curr_past_key_values = past_key_values.cross_attention_cache else: - curr_past_key_value = past_key_value.self_attention_cache + curr_past_key_values = past_key_values.self_attention_cache + else: + curr_past_key_values = past_key_values current_states = key_value_states if is_cross_attention else hidden_states - if is_cross_attention and past_key_value is not None and is_updated: + if is_cross_attention and past_key_values is not None and is_updated: # reuse k,v, cross_attentions - key_states = curr_past_key_value.key_cache[self.layer_idx] - value_states = curr_past_key_value.value_cache[self.layer_idx] + key_states = curr_past_key_values.layers[self.layer_idx].keys + value_states = curr_past_key_values.layers[self.layer_idx].values else: key_states = self.k(current_states) value_states = self.v(current_states) - key_states = key_states.view( - batch_size, -1, self.num_attention_heads_per_partition, self.key_value_proj_dim - ).transpose(1, 2) - value_states = value_states.view( - batch_size, -1, self.num_attention_heads_per_partition, self.key_value_proj_dim - ).transpose(1, 2) - - if past_key_value is not None: + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + if past_key_values is not None: # save all key/value_states to cache to be re-used for fast auto-regressive generation cache_position = cache_position if not is_cross_attention else None - key_states, value_states = curr_past_key_value.update( + key_states, value_states = curr_past_key_values.update( key_states, value_states, self.layer_idx, {"cache_position": cache_position} ) # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls - if is_cross_attention: - past_key_value.is_updated[self.layer_idx] = True + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 scores = torch.matmul(query_states, key_states.transpose(3, 2)) @@ -235,14 +236,9 @@ def forward( causal_mask = mask[:, :, :, : key_states.shape[-2]] position_bias = position_bias + causal_mask - if self.pruned_heads: - mask = torch.ones(position_bias.shape[1]) - mask[list(self.pruned_heads)] = 0 - position_bias_masked = position_bias[:, mask.bool()] - else: - position_bias_masked = position_bias - + position_bias_masked = position_bias scores += position_bias_masked + # (batch_size, n_heads, seq_length, key_length) attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) From fda56be6501c8128b5b577a81dde038f8693702f Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Oct 2025 14:54:59 +0000 Subject: [PATCH 07/10] fix: generate assistant mode --- .../inference/backend/modules/generation/generation_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
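The reworked T5 attention above follows the newer transformers cache utilities: an `EncoderDecoderCache` wraps a self-attention and a cross-attention `DynamicCache`, per-layer keys/values live under `cache.layers[idx]`, and `is_updated` marks cross-attention entries that can be reused. A rough sketch of that layout, with illustrative shapes (not an exact reproduction of the modeling code):

```python
import torch
from transformers.cache_utils import DynamicCache, EncoderDecoderCache

cache = EncoderDecoderCache(DynamicCache(), DynamicCache())
# Fill layer 0 of the cross-attention cache once, then mark it as reusable.
k = torch.zeros(1, 8, 16, 64)  # (batch, heads, encoder_seq, head_dim)
v = torch.zeros(1, 8, 16, 64)
cache.cross_attention_cache.update(k, v, layer_idx=0)
cache.is_updated[0] = True

print(cache.cross_attention_cache.layers[0].keys.shape)  # torch.Size([1, 8, 16, 64])
print(cache.is_updated.get(0))                           # True
```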
a/optimum/neuron/models/inference/backend/modules/generation/generation_utils.py b/optimum/neuron/models/inference/backend/modules/generation/generation_utils.py index c72bd5762..163690355 100644 --- a/optimum/neuron/models/inference/backend/modules/generation/generation_utils.py +++ b/optimum/neuron/models/inference/backend/modules/generation/generation_utils.py @@ -17,7 +17,7 @@ from typing import Any import torch -from transformers import GenerationConfig +from transformers import GenerationConfig, PreTrainedModel from transformers.generation import GenerationMixin, SampleDecoderOnlyOutput from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.stopping_criteria import StoppingCriteriaList @@ -270,14 +270,13 @@ def _update_model_kwargs_for_generation( def _assisted_decoding( self, input_ids: torch.LongTensor, - candidate_generator: "CandidateGenerator", # noqa stopping_criteria: StoppingCriteriaList, generation_config: GenerationConfig, + assistant_model: "PreTrainedModel | None" = None, **model_kwargs, ): pad_token_id = generation_config.pad_token_id eos_token_id = generation_config.eos_token_id - assistant_model = candidate_generator.assistant_model if assistant_model.neuron_config.on_device_sampling: raise ValueError("Assistant model must not use on-device sampling") From b72a90fe437d03ef37b0f713afeda3a833ebf081 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Oct 2025 15:07:20 +0000 Subject: [PATCH 08/10] fix: diffusers caching tool --- tools/cache/auto_fill_diffusion_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/cache/auto_fill_diffusion_cache.py b/tools/cache/auto_fill_diffusion_cache.py index 3a00c819f..72dcbac45 100644 --- a/tools/cache/auto_fill_diffusion_cache.py +++ b/tools/cache/auto_fill_diffusion_cache.py @@ -219,7 +219,7 @@ def compile_and_cache_model( task=model_config.get("task", None), auto_cast=model_config.get("auto_cast", None), auto_cast_type=model_config.get("auto_cast_type", None), - torch_dtype=model_config.get("dtype", None), + torch_dtype=model_config.get("dtype", None) or model_config.get("torch_dtype", None), ) elif args.hf_model_id is None: raise ValueError("You must provide --hf_model_id to compile a model without a config file.") @@ -235,4 +235,5 @@ def compile_and_cache_model( task=args.task, auto_cast=args.auto_cast, auto_cast_type=args.auto_cast_type, + torch_dtype=args.torch_dtype, ) From bf06f2e969260f1328ec77af521381dd19b7cd4f Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Thu, 30 Oct 2025 16:51:32 +0100 Subject: [PATCH 09/10] fix: remove deprecated tests --- tests/training/test_custom_modeling.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/training/test_custom_modeling.py b/tests/training/test_custom_modeling.py index 29746b9d1..bda6d3b89 100644 --- a/tests/training/test_custom_modeling.py +++ b/tests/training/test_custom_modeling.py @@ -614,8 +614,6 @@ def test_each_pp_rank_only_loads_relevant_parameters(set_cache_for_ci): ("flash_attention_2", "flash_attention_2"), ("eager", "eager"), (None, "eager"), - # Unsupported attention implementation - should default to eager - ("sdpa", "eager"), ], ) @distributed_test(world_size=8, tp_size=2, pp_size=1) From df9fdc53df99c74edfbf861d190b8af12f874000 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Thu, 30 Oct 2025 16:58:59 +0100 Subject: [PATCH 10/10] fix: restore file --- examples/training/qwen3/finetune_qwen3.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
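With `_assisted_decoding` now receiving `assistant_model` directly, assisted generation is driven entirely through `generate`. A usage sketch — the model paths are placeholders, and, per the check kept in the patch, the assistant must not use on-device sampling:

```python
from transformers import AutoTokenizer
from optimum.neuron import NeuronModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("./llama-neuron-8b")        # placeholder paths to
model = NeuronModelForCausalLM.from_pretrained("./llama-neuron-8b")   # already-compiled models
assistant = NeuronModelForCausalLM.from_pretrained("./llama-neuron-1b")

inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```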
a/examples/training/qwen3/finetune_qwen3.sh b/examples/training/qwen3/finetune_qwen3.sh index b2d7568e3..d64a6572d 100755 --- a/examples/training/qwen3/finetune_qwen3.sh +++ b/examples/training/qwen3/finetune_qwen3.sh @@ -13,8 +13,7 @@ TP_DEGREE=8 BS=1 GRADIENT_ACCUMULATION_STEPS=8 LOGGING_STEPS=2 -# MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name -MODEL_NAME="Qwen/Qwen3-0.6B" # Change this to the desired model name +MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name OUTPUT_DIR="$(echo $MODEL_NAME | cut -d'/' -f2)-finetuned" DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE" SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
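For reference, a rough Python-side equivalent of the shell variables restored above (TP degree 8, micro-batch size 1, gradient accumulation 8, logging every 2 steps, `Qwen/Qwen3-8B`); the argument names are assumptions about the optimum-neuron training API and may need adjusting:

```python
from optimum.neuron import NeuronTrainingArguments

training_args = NeuronTrainingArguments(
    output_dir="Qwen3-8B-finetuned",
    tensor_parallel_size=8,          # TP_DEGREE
    per_device_train_batch_size=1,   # BS
    gradient_accumulation_steps=8,   # GRADIENT_ACCUMULATION_STEPS
    logging_steps=2,                 # LOGGING_STEPS
    bf16=True,
)
```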