feat: Add support for YARN in NemotronNAS models (#4906)

amirkl94 · web-flow · commit de9779900c4c · 2025-06-29T09:45:49.000+03:00
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py
@@ -391,8 +391,7 @@ def create_rope_const_params(self, interleave: bool = True):
                 )
 
         if self.scale_type == RotaryScalingType.yarn:
-            rope_inv_freq = None
-            _, rope_cos_sin = RopeEmbeddingUtils.create_sinusoidal_positions_yarn(
+            rope_inv_freq, rope_cos_sin = RopeEmbeddingUtils.create_sinusoidal_positions_yarn(
                 self.max_positions,
                 self.dim,
                 self.theta,
diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -110,7 +110,7 @@ def __init__(
             self.qk_rope_head_dim = None
             self.v_head_dim = None
 
-        self.rotary_inv_freq, self.rotary_cos_sin = rope_params.create_rope_const_params(
+        self.rotary_inv_freq, self.rotary_cos_sin = self.rope_params.create_rope_const_params(
         )
 
         self.num_heads = num_heads
diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_nas.py b/tensorrt_llm/_torch/models/modeling_nemotron_nas.py
@@ -4,7 +4,7 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from tensorrt_llm.functional import PositionEmbeddingType
+from tensorrt_llm.functional import PositionEmbeddingType, RotaryScalingType
 from tensorrt_llm.lora_manager import HfLoraLoader
 from tensorrt_llm.models.convert_utils import split_matrix_tp
 
@@ -48,19 +48,28 @@ def _create_linear_from_configs(model_config: ModelConfig[PretrainedConfig],
 
 
 class NemotronNASAttention(Attention):
+    NON_NEOX_TYPES = ("mistral_yarn", "rope_llama4")
 
     def __init__(self, model_config: ModelConfig[PretrainedConfig],
                  layer_idx: int):
         config = model_config.pretrained_config
+        is_neox = getattr(model_config.pretrained_config,
+                          "position_embedding_type",
+                          None) not in self.NON_NEOX_TYPES
+        rope = RopeParams.from_config(config)
+        if rope.scale_type == RotaryScalingType.yarn:
+            rope.mscale_all_dim = 0.0
+
         super().__init__(
             hidden_size=config.hidden_size,
             num_attention_heads=config.num_attention_heads,
             num_key_value_heads=config.num_key_value_heads[layer_idx],
             max_position_embeddings=config.max_position_embeddings,
             bias=False,
             pos_embd_params=PositionalEmbeddingParams(
-                type=PositionEmbeddingType.rope_gpt_neox,
-                rope=RopeParams.from_config(config),
+                type=PositionEmbeddingType.rope_gpt_neox
+                if is_neox else PositionEmbeddingType.rope_gptj,
+                rope=rope,
             ),
             layer_idx=layer_idx,
             dtype=config.torch_dtype,

Original file line number	Diff line number	Diff line change
`@@ -391,8 +391,7 @@ def create_rope_const_params(self, interleave: bool = True):`
`391`	`391`	`)`
`392`	`392`
`393`	`393`	`if self.scale_type == RotaryScalingType.yarn:`
`394`		`- rope_inv_freq = None`
`395`		`- _, rope_cos_sin = RopeEmbeddingUtils.create_sinusoidal_positions_yarn(`
	`394`	`+ rope_inv_freq, rope_cos_sin = RopeEmbeddingUtils.create_sinusoidal_positions_yarn(`
`396`	`395`	`self.max_positions,`
`397`	`396`	`self.dim,`
`398`	`397`	`self.theta,`
Original file line number	Diff line number	Diff line change
`@@ -110,7 +110,7 @@ def __init__(`
`110`	`110`	`self.qk_rope_head_dim = None`
`111`	`111`	`self.v_head_dim = None`
`112`	`112`
`113`		`- self.rotary_inv_freq, self.rotary_cos_sin = rope_params.create_rope_const_params(`
	`113`	`+ self.rotary_inv_freq, self.rotary_cos_sin = self.rope_params.create_rope_const_params(`
`114`	`114`	`)`
`115`	`115`
`116`	`116`	`self.num_heads = num_heads`