Skip to content

Commit dd4f892

Browse files
nv-guomingz and videodanchik
authored and committed
[TRTLLM-8577][feat] Clean the Qwen3-next code by removing Qwen3NextConfig (NVIDIA#10228)
Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
Signed-off-by: Daniil Kulko <kulkodaniil@gmail.com>
1 parent 1e6f1c1 commit dd4f892

File tree

1 file changed

+1
-250
lines changed

1 file changed

+1
-250
lines changed

tensorrt_llm/_torch/models/modeling_qwen3_next.py

Lines changed: 1 addition & 250 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,7 @@
2323
import triton
2424
import triton.language as tl
2525
from torch import nn
26-
from transformers.configuration_utils import PretrainedConfig
27-
from transformers.modeling_rope_utils import rope_config_validation
26+
from transformers import Qwen3NextConfig
2827

2928
from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
3029
BaseWeightMapper
@@ -71,254 +70,6 @@ def divide(numerator, denominator):
7170
return numerator // denominator
7271

7372

74-
class Qwen3NextConfig(PretrainedConfig):
75-
r"""
76-
This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
77-
Qwen3-Next model according to the specified arguments, defining the model architecture.
78-
Instantiating a configuration with the defaults will yield a similar configuration to that of
79-
Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
80-
81-
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
82-
documentation from [`PretrainedConfig`] for more information.
83-
84-
85-
Args:
86-
vocab_size (`int`, *optional*, defaults to 151936):
87-
Vocabulary size of the model. Defines the number of different tokens that can be represented by the
88-
`inputs_ids`.
89-
hidden_size (`int`, *optional*, defaults to 2048):
90-
Dimension of the hidden representations.
91-
intermediate_size (`int`, *optional*, defaults to 5632):
92-
Dimension of the MLP representations.
93-
num_hidden_layers (`int`, *optional*, defaults to 48):
94-
Number of hidden layers in the Transformer encoder.
95-
num_attention_heads (`int`, *optional*, defaults to 16):
96-
Number of attention heads for each attention layer in the Transformer encoder.
97-
num_key_value_heads (`int`, *optional*, defaults to 2):
98-
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
99-
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
100-
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
101-
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
102-
by meanpooling all the original heads within that group. For more details checkout [this
103-
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
104-
hidden_act (`str`, *optional*, defaults to `"silu"`):
105-
The non-linear activation function in the decoder.
106-
max_position_embeddings (`int`, *optional*, defaults to 32768):
107-
The maximum sequence length that this model might ever be used with.
108-
initializer_range (`float`, *optional*, defaults to 0.02):
109-
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
110-
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
111-
The epsilon used by the rms normalization layers.
112-
use_cache (`bool`, *optional*, defaults to `True`):
113-
Whether or not the model should return the last key/values attentions (not used by all models). Only
114-
relevant if `config.is_decoder=True`.
115-
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
116-
Whether the model's input and output word embeddings should be tied.
117-
rope_theta (`float`, *optional*, defaults to 10000.0):
118-
The base period of the RoPE embeddings.
119-
rope_scaling (`Dict`, *optional*):
120-
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
121-
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
122-
accordingly.
123-
Expected contents:
124-
`rope_type` (`str`):
125-
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
126-
'llama3'], with 'default' being the original RoPE implementation.
127-
`factor` (`float`, *optional*):
128-
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
129-
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
130-
original maximum pre-trained length.
131-
`original_max_position_embeddings` (`int`, *optional*):
132-
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
133-
pretraining.
134-
`attention_factor` (`float`, *optional*):
135-
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
136-
computation. If unspecified, it defaults to value recommended by the implementation, using the
137-
`factor` field to infer the suggested value.
138-
`beta_fast` (`float`, *optional*):
139-
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
140-
ramp function. If unspecified, it defaults to 32.
141-
`beta_slow` (`float`, *optional*):
142-
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
143-
ramp function. If unspecified, it defaults to 1.
144-
`short_factor` (`List[float]`, *optional*):
145-
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
146-
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
147-
size divided by the number of attention heads divided by 2
148-
`long_factor` (`List[float]`, *optional*):
149-
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
150-
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
151-
size divided by the number of attention heads divided by 2
152-
`low_freq_factor` (`float`, *optional*):
153-
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
154-
`high_freq_factor` (`float`, *optional*):
155-
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
156-
partial_rotary_factor (`float`, *optional*, defaults to 0.25):
157-
Percentage of the query and keys which will have rotary embedding.
158-
attention_bias (`bool`, *optional*, defaults to `False`):
159-
Whether to use a bias in the query, key, value and output projection layers during self-attention.
160-
attention_dropout (`float`, *optional*, defaults to 0.0):
161-
The dropout ratio for the attention probabilities.
162-
head_dim (`int`, *optional*, defaults to 256):
163-
Projection weights dimension in multi-head attention.
164-
linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
165-
Kernel size of the convolution used in linear attention layers.
166-
linear_key_head_dim (`int`, *optional*, defaults to 128):
167-
Dimension of each key head in linear attention.
168-
linear_value_head_dim (`int`, *optional*, defaults to 128):
169-
Dimension of each value head in linear attention.
170-
linear_num_key_heads (`int`, *optional*, defaults to 16):
171-
Number of key heads used in linear attention layers.
172-
linear_num_value_heads (`int`, *optional*, defaults to 32):
173-
Number of value heads used in linear attention layers.
174-
decoder_sparse_step (`int`, *optional*, defaults to 1):
175-
The frequency of the MoE layer.
176-
moe_intermediate_size (`int`, *optional*, defaults to 512):
177-
Intermediate size of the routed expert.
178-
shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
179-
Intermediate size of the shared expert.
180-
num_experts_per_tok (`int`, *optional*, defaults to 10):
181-
Number of selected experts.
182-
num_experts (`int`, *optional*, defaults to 512):
183-
Number of routed experts.
184-
norm_topk_prob (`bool`, *optional*, defaults to `True`):
185-
Whether to normalize the topk probabilities.
186-
output_router_logits (`bool`, *optional*, defaults to `False`):
187-
Whether or not the router logits should be returned by the model. Enabling this will also
188-
allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
189-
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
190-
The aux loss factor for the total loss.
191-
mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
192-
Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
193-
The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
194-
If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
195-
layer_types (`list[str]`, *optional*):
196-
Types of each layer (attention or linear).
197-
198-
```python
199-
>>> from transformers import Qwen3NextModel, Qwen3NextConfig
200-
201-
>>> # Initializing a Qwen3Next style configuration
202-
>>> configuration = Qwen3NextConfig()
203-
204-
>>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
205-
>>> model = Qwen3NextModel(configuration)
206-
207-
>>> # Accessing the model configuration
208-
>>> configuration = model.config
209-
```
210-
"""
211-
212-
model_type = "qwen3_next"
213-
keys_to_ignore_at_inference = ["past_key_values"]
214-
215-
base_model_tp_plan = {
216-
"layers.*.self_attn.q_proj": "colwise",
217-
"layers.*.self_attn.k_proj": "colwise",
218-
"layers.*.self_attn.v_proj": "colwise",
219-
"layers.*.self_attn.o_proj": "rowwise",
220-
"layers.*.mlp.experts.*.gate_proj": "colwise",
221-
"layers.*.mlp.experts.*.up_proj": "colwise",
222-
"layers.*.mlp.experts.*.down_proj": "rowwise",
223-
"layers.*.mlp.shared_experts.gate_proj": "colwise",
224-
"layers.*.mlp.shared_experts.up_proj": "colwise",
225-
"layers.*.mlp.shared_experts.down_proj": "rowwise",
226-
"layers.*.mlp.gate_proj": "colwise",
227-
"layers.*.mlp.up_proj": "colwise",
228-
"layers.*.mlp.down_proj": "rowwise",
229-
}
230-
base_model_pp_plan = {
231-
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
232-
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
233-
"norm": (["hidden_states"], ["hidden_states"]),
234-
}
235-
236-
def __init__(
237-
self,
238-
vocab_size=151936,
239-
hidden_size=2048,
240-
intermediate_size=5632,
241-
num_hidden_layers=48,
242-
num_attention_heads=16,
243-
num_key_value_heads=2,
244-
hidden_act="silu",
245-
max_position_embeddings=32768,
246-
initializer_range=0.02,
247-
rms_norm_eps=1e-6,
248-
use_cache=True,
249-
tie_word_embeddings=False,
250-
rope_theta=10000.0,
251-
rope_scaling=None,
252-
partial_rotary_factor=0.25,
253-
attention_bias=False,
254-
attention_dropout=0.0,
255-
head_dim=256,
256-
linear_conv_kernel_dim=4,
257-
linear_key_head_dim=128,
258-
linear_value_head_dim=128,
259-
linear_num_key_heads=16,
260-
linear_num_value_heads=32,
261-
decoder_sparse_step=1,
262-
moe_intermediate_size=512,
263-
shared_expert_intermediate_size=512,
264-
num_experts_per_tok=10,
265-
num_experts=512,
266-
norm_topk_prob=True,
267-
output_router_logits=False,
268-
router_aux_loss_coef=0.001,
269-
mlp_only_layers=[],
270-
layer_types=None,
271-
**kwargs,
272-
):
273-
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
274-
self.vocab_size = vocab_size
275-
self.max_position_embeddings = max_position_embeddings
276-
self.hidden_size = hidden_size
277-
self.intermediate_size = intermediate_size
278-
self.num_hidden_layers = num_hidden_layers
279-
self.num_attention_heads = num_attention_heads
280-
self.num_key_value_heads = num_key_value_heads
281-
self.hidden_act = hidden_act
282-
self.initializer_range = initializer_range
283-
self.rms_norm_eps = rms_norm_eps
284-
self.use_cache = use_cache
285-
self.rope_theta = rope_theta
286-
self.rope_scaling = rope_scaling
287-
self.partial_rotary_factor = partial_rotary_factor
288-
self.attention_bias = attention_bias
289-
self.attention_dropout = attention_dropout
290-
self.head_dim = head_dim
291-
rope_config_validation(self)
292-
293-
self.layer_types = layer_types
294-
if self.layer_types is None:
295-
interval_pattern = kwargs.get("full_attention_interval", 4)
296-
self.layer_types = [
297-
"linear_attention" if bool(
298-
(i + 1) % interval_pattern) else "full_attention"
299-
for i in range(self.num_hidden_layers)
300-
]
301-
# layer_type_validation(self.layer_types, self.num_hidden_layers)
302-
303-
# linear attention part
304-
self.linear_conv_kernel_dim = linear_conv_kernel_dim
305-
self.linear_key_head_dim = linear_key_head_dim
306-
self.linear_value_head_dim = linear_value_head_dim
307-
self.linear_num_key_heads = linear_num_key_heads
308-
self.linear_num_value_heads = linear_num_value_heads
309-
310-
# MoE arguments
311-
self.decoder_sparse_step = decoder_sparse_step
312-
self.moe_intermediate_size = moe_intermediate_size
313-
self.shared_expert_intermediate_size = shared_expert_intermediate_size
314-
self.num_experts_per_tok = num_experts_per_tok
315-
self.num_experts = num_experts
316-
self.norm_topk_prob = norm_topk_prob
317-
self.output_router_logits = output_router_logits
318-
self.router_aux_loss_coef = router_aux_loss_coef
319-
self.mlp_only_layers = mlp_only_layers
320-
321-
32273
class Qwen3NextGate(nn.Module):
32374

32475
def __init__(

0 commit comments

Comments (0)