|
23 | 23 | import triton |
24 | 24 | import triton.language as tl |
25 | 25 | from torch import nn |
26 | | -from transformers.configuration_utils import PretrainedConfig |
27 | | -from transformers.modeling_rope_utils import rope_config_validation |
| 26 | +from transformers import Qwen3NextConfig |
28 | 27 |
|
29 | 28 | from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \ |
30 | 29 | BaseWeightMapper |
@@ -71,254 +70,6 @@ def divide(numerator, denominator): |
71 | 70 | return numerator // denominator |
72 | 71 |
|
73 | 72 |
|
class Qwen3NextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
    Qwen3-Next model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of
    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids`.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf).
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*, defaults to 256):
            Projection weights dimension in multi-head attention.
        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
            Kernel size of the convolution used in linear attention layers.
        linear_key_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each key head in linear attention.
        linear_value_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each value head in linear attention.
        linear_num_key_heads (`int`, *optional*, defaults to 16):
            Number of key heads used in linear attention layers.
        linear_num_value_heads (`int`, *optional*, defaults to 32):
            Number of value heads used in linear attention layers.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the routed expert.
        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the shared expert.
        num_experts_per_tok (`int`, *optional*, defaults to 10):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 512):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        mlp_only_layers (`list[int]`, *optional*):
            Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
            `None` (the default) is stored as a new empty list.
        layer_types (`list[str]`, *optional*):
            Types of each layer (attention or linear). When `None`, the list is generated from the
            `full_attention_interval` keyword argument (defaults to 4): layer `i` is `"full_attention"` when
            `(i + 1)` is a multiple of the interval and `"linear_attention"` otherwise.

    ```python
    >>> from transformers import Qwen3NextModel, Qwen3NextConfig

    >>> # Initializing a Qwen3Next style configuration
    >>> configuration = Qwen3NextConfig()

    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
    >>> model = Qwen3NextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "qwen3_next"
    keys_to_ignore_at_inference = ["past_key_values"]

    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.*.gate_proj": "colwise",
        "layers.*.mlp.experts.*.up_proj": "colwise",
        "layers.*.mlp.experts.*.down_proj": "rowwise",
        "layers.*.mlp.shared_experts.gate_proj": "colwise",
        "layers.*.mlp.shared_experts.up_proj": "colwise",
        "layers.*.mlp.shared_experts.down_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.25,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        decoder_sparse_step=1,
        moe_intermediate_size=512,
        shared_expert_intermediate_size=512,
        num_experts_per_tok=10,
        num_experts=512,
        norm_topk_prob=True,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        layer_types=None,
        **kwargs,
    ):
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        # Runs only after the rope-related attributes above have been
        # assigned on this instance.
        rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            # No explicit layout given: every `interval_pattern`-th layer
            # (1-based) is full attention, all others are linear attention.
            interval_pattern = kwargs.get("full_attention_interval", 4)
            self.layer_types = [
                "linear_attention" if bool(
                    (i + 1) % interval_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        # NOTE: `layer_types` is not validated here; the
        # `layer_type_validation(self.layer_types, self.num_hidden_layers)`
        # call is disabled in this copy of the config.

        # linear attention part
        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        # A `None` sentinel replaces the former `[]` default so that
        # instances never share (and accidentally mutate) one list object.
        self.mlp_only_layers = ([] if mlp_only_layers is None else
                                mlp_only_layers)
321 | | - |
322 | 73 | class Qwen3NextGate(nn.Module): |
323 | 74 |
|
324 | 75 | def __init__( |
|
0 commit comments