Merged
Changes from 2 commits
README.md: 2 changes (1 addition & 1 deletion)
@@ -115,7 +115,7 @@ def main():
     model = NeuronModelForCausalLM.from_pretrained(
         model_id,
         training_args.trn_config,
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         attn_implementation="flash_attention_2", # Enable flash attention
     )

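Not part of the diff: the rename mirrors upstream transformers, which accepts `dtype=` in place of the deprecated `torch_dtype=` keyword in recent releases (4.56.0+, as noted later in this PR). A minimal runnable sketch with an illustrative tiny model id:

```python
import torch
from transformers import AutoModelForCausalLM

# `dtype=` is the new name of the loading keyword; older releases used `torch_dtype=`.
model = AutoModelForCausalLM.from_pretrained(
    "sshleifer/tiny-gpt2",  # illustrative model id, not taken from the PR
    dtype=torch.bfloat16,
)
print(next(model.parameters()).dtype)  # expected: torch.bfloat16
```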
benchmark/text-generation/performance/llama3.3-70b.py: 2 changes (1 addition & 1 deletion)
@@ -32,7 +32,7 @@ def main():
         assert neuron_config.sequence_length == seq_length, (
             f"Model {model_name} is not configured for sequence length {seq_length}."
         )
-        assert neuron_config.torch_dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16."
+        assert neuron_config.dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16."
         model = NeuronModelForCausalLM.from_pretrained(model_id)
     except Exception:
         model = NeuronModelForCausalLM.from_pretrained(
docs/source/contribute/contribute_for_training.mdx: 24 changes (12 additions & 12 deletions)
@@ -80,7 +80,7 @@ class YourModelEmbeddings(nn.Module):
         self.embed_tokens = ParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
         )
 ```
@@ -105,7 +105,7 @@ class YourModelMLP(nn.Module, CustomModule):
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         self.down_proj = RowParallelLinear(
@@ -114,7 +114,7 @@ class YourModelMLP(nn.Module, CustomModule):
             bias=False,
             input_is_parallel=True,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         # Define transformation specs
@@ -151,23 +151,23 @@ class YourModelAttention(nn.Module, CustomModule):
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )
         self.k_proj = ColumnParallelLinear(
             config.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )
         self.v_proj = ColumnParallelLinear(
             config.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         self.o_proj = RowParallelLinear(
@@ -176,7 +176,7 @@ class YourModelAttention(nn.Module, CustomModule):
             bias=False,
             input_is_parallel=True,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         # No transformation specs needed - regular parallel layers
@@ -201,7 +201,7 @@ class YourModelAttention(nn.Module, CustomModule):
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         # Define transformation specs for fused QKV
@@ -246,7 +246,7 @@ class YourModelAttention(nn.Module, CustomModule):
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
             kv_size_multiplier=self.kv_size_multiplier,
             fuse_qkv=trn_config.fuse_qkv,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         # Define transformation specs for GQA QKV
@@ -336,7 +336,7 @@ class YourModelForCausalLM(NeuronModelMixin, YourPreTrainedModel):
             config.vocab_size,
             bias=False,
             gather_output=False,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         self.post_init()
@@ -473,7 +473,7 @@ Update `tests/training/test_modeling_auto.py`:
 @is_trainium_test
 def test_auto_model_with_supported_architecture(from_pretrained):
     trn_config = TrainingNeuronConfig()
-    kwargs = {"torch_dtype": torch.bfloat16}
+    kwargs = {"dtype": torch.bfloat16}
     for model_name_or_path in [
         "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random",
         "michaelbenayoun/granite-tiny-4kv-heads-4layers-random",
@@ -487,7 +487,7 @@ def test_auto_model_with_supported_architecture(from_pretrained):
 @is_trainium_test
 def test_auto_model_for_causal_lm_with_supported_architecture(from_pretrained):
     trn_config = TrainingNeuronConfig()
-    kwargs = {"torch_dtype": torch.bfloat16}
+    kwargs = {"dtype": torch.bfloat16}
     for model_name_or_path in [
         "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random",
         "michaelbenayoun/granite-tiny-4kv-heads-4layers-random",
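Not part of the diff: the snippets above simply thread the config's float dtype into layer constructors. The same idea with plain `torch.nn`, as a tiny self-contained illustration (the parallel layers used above come from neuronx-distributed and are not shown here):

```python
import torch
import torch.nn as nn

class TinyConfig:
    # stands in for `PretrainedConfig.dtype` (formerly `torch_dtype`)
    dtype = torch.bfloat16
    hidden_size = 16

config = TinyConfig()
# The layer's parameters are created directly in the configured dtype.
proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False, dtype=config.dtype)
print(proj.weight.dtype)  # torch.bfloat16
```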
docs/source/model_doc/diffusers/flux.mdx: 2 changes (1 addition & 1 deletion)
@@ -40,7 +40,7 @@ if __name__ == "__main__":

     pipe = NeuronFluxPipeline.from_pretrained(
         "black-forest-labs/FLUX.1-dev",
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         export=True,
         tensor_parallel_size=8,
         **compiler_args,
docs/source/model_doc/diffusers/pixart_alpha.mdx: 2 changes (1 addition & 1 deletion)
@@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtAlphaPipeline
 compiler_args = {"auto_cast": "none"}
 input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120}

-neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes)
+neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes)

 # Save locally
 neuron_model.save_pretrained("pixart_alpha_neuron_512/")
docs/source/model_doc/diffusers/pixart_sigma.mdx: 2 changes (1 addition & 1 deletion)
@@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtSigmaPipeline
 compiler_args = {"auto_cast": "none"}
 input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120}

-neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes)
+neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes)

 # Save locally
 neuron_model.save_pretrained("pixart_sigma_neuron_512/")
docs/source/quickstart.mdx: 2 changes (1 addition & 1 deletion)
@@ -79,7 +79,7 @@ def main():
     model = NeuronModelForCausalLM.from_pretrained(
         model_id,
         training_args.trn_config,
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         attn_implementation="flash_attention_2", # Enable flash attention
     )

docs/source/training_tutorials/finetune_llama.mdx: 2 changes (1 addition & 1 deletion)
@@ -138,7 +138,7 @@ dtype = torch.bfloat16 if training_args.bf16 else torch.float32
 model = NeuronModelForCausalLM.from_pretrained(
     model_id,
     trn_config,
-    torch_dtype=dtype,
+    dtype=dtype,
     # Use FlashAttention2 for better performance and to be able to use larger sequence lengths.
     attn_implementation="flash_attention_2",
 )
docs/source/training_tutorials/finetune_qwen3.mdx: 2 changes (1 addition & 1 deletion)
@@ -137,7 +137,7 @@ dtype = torch.bfloat16 if training_args.bf16 else torch.float32
 model = NeuronModelForCausalLM.from_pretrained(
     model_id,
     trn_config,
-    torch_dtype=dtype,
+    dtype=dtype,
     # Use FlashAttention2 for better performance and to be able to use larger sequence lengths.
     attn_implementation="flash_attention_2",
 )
examples/training/qwen3/finetune_qwen3.sh: 3 changes (2 additions & 1 deletion)
Member Author comment on this change: "To restore back"
@@ -13,7 +13,8 @@ TP_DEGREE=8
 BS=1
 GRADIENT_ACCUMULATION_STEPS=8
 LOGGING_STEPS=2
-MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name
+# MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name
+MODEL_NAME="Qwen/Qwen3-0.6B" # Change this to the desired model name
 OUTPUT_DIR="$(echo $MODEL_NAME | cut -d'/' -f2)-finetuned"
 DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE"
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
optimum/commands/neuron/cache.py: 2 changes (1 addition & 1 deletion)
@@ -147,7 +147,7 @@ def _list_entries(self):
                     str(entry["batch_size"]),
                     str(entry["sequence_length"]),
                     str(entry.get("tp_degree", entry.get("tensor_parallel_size"))),
-                    str(entry["torch_dtype"]),
+                    str(entry["dtype"]),
                     str(entry["target"]),
                 )
             )
optimum/commands/neuron/serve.py: 4 changes (2 additions & 2 deletions)
@@ -101,7 +101,7 @@ def run(self):
         sequence_length = self.args.sequence_length
         tensor_parallel_size = self.args.tensor_parallel_size
         config = AutoConfig.from_pretrained(model_name_or_path)
-        torch_dtype = DTYPE_MAPPER.pt(config.torch_dtype)
+        torch_dtype = DTYPE_MAPPER.pt(config.dtype)
         try:
             # Look for a NeuronConfig in the model directory
             neuron_config = NeuronConfig.from_pretrained(model_name_or_path)
@@ -202,7 +202,7 @@ def run(self):
             batch_size = selected_entry["batch_size"]
             sequence_length = selected_entry["sequence_length"]
             tensor_parallel_size = selected_entry["tp_degree"]
-            torch_dtype = DTYPE_MAPPER.pt(selected_entry["torch_dtype"])
+            torch_dtype = DTYPE_MAPPER.pt(selected_entry["dtype"])
             warning_msg = f"{model_id} is not a neuron model, but a cached configuration is available using"
             warning_msg += f" instance type {instance_type},"
             warning_msg += f" batch size = {batch_size},"
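Not part of the diff: depending on the installed transformers version, a loaded config may expose the float dtype as `dtype` (newer releases) or `torch_dtype` (older ones). A small version-tolerant sketch of the read performed above, with an illustrative model id:

```python
import torch
from transformers import AutoConfig

config = AutoConfig.from_pretrained("gpt2")  # illustrative model id
# Prefer the new attribute, fall back to the old one, then to float32.
dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None) or torch.float32
print(dtype)
```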
optimum/exporters/neuron/__main__.py: 4 changes (2 additions & 2 deletions)
@@ -581,7 +581,7 @@ def load_models_and_neuron_configs(
         "trust_remote_code": trust_remote_code,
         "framework": "pt",
         "library_name": library_name,
-        "torch_dtype": torch_dtype,
+        "dtype": torch_dtype,
     }
     if model is None:
         model = TasksManager.get_model_from_task(**model_kwargs)
@@ -878,7 +878,7 @@ def main():
         model_name_or_path=args.model,
         output=args.output,
         compiler_kwargs=compiler_kwargs,
-        torch_dtype=args.torch_dtype,
+        torch_dtype=args.dtype,
         tensor_parallel_size=args.tensor_parallel_size,
         task=task,
         dynamic_batch_size=args.dynamic_batch_size,
optimum/neuron/cache/entries/cache_entry.py: 2 changes (1 addition & 1 deletion)
@@ -28,7 +28,7 @@
     "bos_token_id",
     "pad_token_id",
     "torchscript",
-    "torch_dtype", # this has been renamed as `float_dtype` for the check
+    "dtype", # this has been renamed as `float_dtype` for the check
     "_commit_hash",
     "sample_size",
     "projection_dim",
optimum/neuron/cache/hub_cache.py: 2 changes (1 addition & 1 deletion)
@@ -427,7 +427,7 @@ def select_hub_cached_entries(
             continue
         if torch_dtype is not None:
             target_value = DTYPE_MAPPER.pt(torch_dtype) if isinstance(torch_dtype, str) else torch_dtype
-            entry_value = DTYPE_MAPPER.pt(entry.get("torch_dtype"))
+            entry_value = DTYPE_MAPPER.pt(entry.get("dtype"))
             if target_value != entry_value:
                 continue
         selected.append(entry)
optimum/neuron/generation/utils.py: 11 changes (5 additions & 6 deletions)
@@ -325,18 +325,17 @@ class NeuronGenerationMixin(GenerationMixin):
     The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
         - *greedy decoding* by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and
           `do_sample=False`
-        - *contrastive search* by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0` and
-          `top_k>1`
         - *multinomial sampling* by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and
           `do_sample=True`
         - *beam-search decoding* by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and
           `do_sample=False`
         - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin.beam_sample`] if `num_beams>1`
           and `do_sample=True`
-        - *diverse beam-search decoding* by calling [`~generation.GenerationMixin.group_beam_search`], if `num_beams>1`
-          and `num_beam_groups>1`
-        - *constrained beam-search decoding* by calling [`~generation.GenerationMixin.constrained_beam_search`], if
-          `constraints!=None` or `force_words_ids!=None`

+    Note: The following strategies have been removed in transformers 4.56.0+:
+        - constrained beam-search decoding (constraints and force_words_ids)
+        - group beam-search decoding (num_beam_groups > 1)
+        - contrastive search (penalty_alpha > 0)

     You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To
     learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
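Not part of the diff: the strategies that remain are selected purely through `generate()` arguments, as the updated docstring describes. A small runnable sketch with an illustrative tiny model id:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")  # illustrative model id
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
inputs = tokenizer("Hello", return_tensors="pt")

# Greedy decoding: num_beams=1, do_sample=False
greedy = model.generate(**inputs, max_new_tokens=5, num_beams=1, do_sample=False)
# Multinomial sampling: num_beams=1, do_sample=True
sampled = model.generate(**inputs, max_new_tokens=5, num_beams=1, do_sample=True)
# Beam-search decoding: num_beams>1, do_sample=False
beam = model.generate(**inputs, max_new_tokens=5, num_beams=2, do_sample=False)
print(tokenizer.batch_decode(greedy))
```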
optimum/neuron/modeling_diffusion.py: 2 changes (1 addition & 1 deletion)
@@ -1193,7 +1193,7 @@ def forward(

         outputs = self.model(*inputs)
         if self.config.model_type == "t5" and isinstance(outputs, dict): # Flux text encoder 2
-            return [outputs["last_hidden_state"].to(self.config.torch_dtype)]
+            return [outputs["last_hidden_state"].to(self.config.dtype)]

         if return_dict and not isinstance(outputs, dict):
             outputs = ModelOutput(dict(zip(self.neuron_config.outputs, outputs)))
optimum/neuron/models/inference/backend/config.py: 6 changes (3 additions & 3 deletions)
@@ -83,9 +83,9 @@ def __init__(
         self.batch_size = batch_size
         self.sequence_length = sequence_length
         self.tp_degree = tp_degree
-        self.torch_dtype = torch_dtype
-        if isinstance(self.torch_dtype, str):
-            self.torch_dtype = DTYPE_MAPPER.pt(self.torch_dtype)
+        self.dtype = torch_dtype
+        if isinstance(self.dtype, str):
+            self.dtype = DTYPE_MAPPER.pt(self.dtype)
         self.n_active_tokens = self.sequence_length if n_active_tokens is None else n_active_tokens
         self.output_logits = output_logits

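Not part of the diff: the string-to-`torch.dtype` normalization kept here is small enough to sketch in isolation; `getattr(torch, ...)` below is an illustrative stand-in for the internal `DTYPE_MAPPER` helper, not its actual implementation:

```python
import torch

def normalize_dtype(dtype):
    """Accept either a torch.dtype or its string name (e.g. "bfloat16")."""
    if isinstance(dtype, str):
        return getattr(torch, dtype)  # "bfloat16" -> torch.bfloat16
    return dtype

assert normalize_dtype("bfloat16") is torch.bfloat16
assert normalize_dtype(torch.float32) is torch.float32
```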
(file name not shown)
@@ -99,7 +99,7 @@ def __init__(
         self.head_dim = self.hidden_size // self.num_attention_heads
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
-        self.torch_dtype = neuron_config.torch_dtype
+        self.dtype = neuron_config.dtype
         self.rms_norm_eps = config.rms_norm_eps
         self._qk_scale = qk_scale

@@ -111,7 +111,7 @@ def __init__(
             num_attention_heads=self.num_attention_heads,
             num_key_value_heads=self.num_key_value_heads,
             tp_degree=neuron_config.tp_degree,
-            dtype=self.torch_dtype,
+            dtype=self.dtype,
             bias=qkv_proj_bias,
             gather_output=False,
             fused_qkv=neuron_config.fused_qkv,
@@ -125,12 +125,12 @@ def __init__(
             num_attention_heads=self.num_attention_heads,
             num_key_value_heads=self.num_key_value_heads,
             tp_degree=neuron_config.tp_degree,
-            dtype=self.torch_dtype,
+            dtype=self.dtype,
             bias=o_proj_bias,
             input_is_parallel=True,
             layer_name=self.o_proj_layer_name,
             tensor_model_parallel_group=self.tensor_model_parallel_group,
-            rpl_reduce_dtype=neuron_config.torch_dtype,
+            rpl_reduce_dtype=neuron_config.dtype,
         )
         self.num_heads = utils.divide(self.qkv_proj.get_num_attention_heads(), neuron_config.tp_degree)
         self.num_key_value_heads = utils.divide(self.qkv_proj.get_num_key_value_heads(), neuron_config.tp_degree)
@@ -202,13 +202,13 @@ def perform_prefill(self, Q, K, V, q_len, bsz, attention_mask) -> Tensor:
         Q = (
             Q.permute(0, 1, 3, 2) # after permute: batch, num_heads, d_head, seqlen
             .reshape((bsz * self.num_heads, self.head_dim, q_len))
-            .to(self.torch_dtype)
+            .to(self.dtype)
         )
         Q = Q * self.qk_scale
         K_active = (
-            K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.torch_dtype)
+            K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.dtype)
         )
-        V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.torch_dtype)
+        V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.dtype)
         # shape: (B*H)DS
         attn_output = torch.zeros(bsz * self.num_heads, self.head_dim, q_len, dtype=Q.dtype, device=Q.device)

(file name not shown)
@@ -38,8 +38,8 @@ def __init__(
         self.max_tokens = max_tokens
         self.active_tokens = active_tokens

-        if not self.neuron_config.torch_dtype:
-            self.neuron_config.torch_dtype = torch.float32
+        if not self.neuron_config.dtype:
+            self.neuron_config.dtype = torch.float32

         if config.pad_token_id is None:
             config.pad_token_id = 0
@@ -88,9 +88,9 @@ def load_module(self):
         float_model = self.model_cls(self.config, self.neuron_config)
         float_model.eval()

-        if self.neuron_config.torch_dtype != torch.float32:
+        if self.neuron_config.dtype != torch.float32:
             float_model._apply(
-                lambda t: t.to(self.neuron_config.torch_dtype)
+                lambda t: t.to(self.neuron_config.dtype)
                 if t.is_floating_point() and t.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]
                 else t
             )
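Not part of the diff: a runnable sketch of the same selective `_apply` cast on a plain module, which moves floating-point tensors to the target dtype while leaving integer and float8 tensors untouched (assumes torch >= 2.1 for the float8 dtypes):

```python
import torch
import torch.nn as nn

target_dtype = torch.bfloat16
model = nn.Linear(4, 4)  # illustrative module

# Cast only regular floating-point tensors; skip float8 (and non-float) tensors.
model._apply(
    lambda t: t.to(target_dtype)
    if t.is_floating_point() and t.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]
    else t
)
print(model.weight.dtype)  # torch.bfloat16
```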
(file name not shown)
@@ -37,8 +37,8 @@ def __init__(
         self.model = model
         self.tag = tag

-        if not self.neuron_config.torch_dtype:
-            self.neuron_config.torch_dtype = torch.float32
+        if not self.neuron_config.dtype:
+            self.neuron_config.dtype = torch.float32

         if config.pad_token_id is None:
             config.pad_token_id = 0
(file name not shown)
@@ -47,7 +47,7 @@ def __init__(self, config: PretrainedConfig, neuron_config: NxDNeuronConfig, **k
         self._init_kv_shape(config, neuron_config)

         num_layer = config.num_hidden_layers
-        dtype = neuron_config.torch_dtype
+        dtype = neuron_config.dtype
         self.past_key_values = nn.ParameterList(
             [nn.Parameter(torch.zeros(self.kv_shape, dtype=dtype), requires_grad=False) for _ in range(num_layer * 2)]
         )