Commit 117a23c

update kernel
1 parent 77b4a8e commit 117a23c

23 files changed, +310 -2158 lines changed
@@ -1,3 +1,50 @@
 {
-  "8448": null
+  "1": {
+    "BLK_HEADS": 64,
+    "num_warps": 1
+  },
+  "100": {
+    "BLK_HEADS": 4,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLK_HEADS": 8,
+    "num_warps": 1
+  },
+  "128": {
+    "BLK_HEADS": 16,
+    "num_warps": 4
+  },
+  "16": {
+    "BLK_HEADS": 8,
+    "num_warps": 2
+  },
+  "2048": {
+    "BLK_HEADS": 16,
+    "num_warps": 1
+  },
+  "256": {
+    "BLK_HEADS": 32,
+    "num_warps": 2
+  },
+  "32": {
+    "BLK_HEADS": 8,
+    "num_warps": 1
+  },
+  "4096": {
+    "BLK_HEADS": 16,
+    "num_warps": 4
+  },
+  "64": {
+    "BLK_HEADS": 64,
+    "num_warps": 2
+  },
+  "8": {
+    "BLK_HEADS": 8,
+    "num_warps": 2
+  },
+  "8448": {
+    "BLK_HEADS": 32,
+    "num_warps": 4
+  }
 }
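The keys in this config appear to be problem sizes mapped to tuned Triton launch parameters (BLK_HEADS, num_warps). A minimal sketch of how such a file could be loaded and queried; the helper names and the nearest-size fallback policy are illustrative assumptions, not lightllm's actual lookup logic:

import json

def load_kernel_config(path):
    # Map each size key to its tuned launch parameters,
    # e.g. 1024 -> {"BLK_HEADS": 8, "num_warps": 1}.
    with open(path) as f:
        return {int(k): v for k, v in json.load(f).items()}

def pick_launch_params(config, size):
    # Exact match first; otherwise fall back to the nearest tuned size
    # (an assumed policy for sizes that were not benchmarked).
    if size in config:
        return config[size]
    nearest = min(config, key=lambda k: abs(k - size))
    return config[nearest]

# Usage: pick_launch_params(load_kernel_config("cfg.json"), 1024)
# -> {"BLK_HEADS": 8, "num_warps": 1}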

lightllm/models/qwen3next/layer_infer/transformer_layer_infer.py

Lines changed: 1 addition & 0 deletions
@@ -270,6 +270,7 @@ def _linear_attn(
             bias=layer_weight.linear_conv1d.mm_param.bias,
             query_start_loc=infer_state.b1_cu_q_seq_len,
             cache_indices=buffer_idx,
+            has_initial_state=infer_state.b_ready_cache_len > 0,
             conv_states=conv_states.transpose(1, 2),
             activation=self.activation,
         )
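The new has_initial_state argument is a per-request boolean mask derived from b_ready_cache_len: requests that already have cached tokens resume from their stored convolution state, while fresh prompts start from zeros. A small illustration of how the mask is computed (the example values are made up):

import torch

# Per-request count of tokens already in the cache: 0 means a fresh
# prompt, > 0 means this request resumes from previously cached state.
b_ready_cache_len = torch.tensor([0, 5, 0, 12])

# One flag per request: load prior conv state only where tokens are cached.
has_initial_state = b_ready_cache_len > 0
print(has_initial_state)  # tensor([False,  True, False,  True])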

lightllm/models/qwen3next/model.py

Lines changed: 10 additions & 6 deletions
@@ -1,3 +1,4 @@
+import os
 import torch
 from typing import Optional
 from typing_extensions import override
@@ -62,11 +63,11 @@ def _init_mem_manager(self):
         start_args: StartArgs = get_env_start_args()

         mtp_step = start_args.mtp_step
-        linear_attn_cache_size = start_args.linear_attn_cache_size
-        if linear_attn_cache_size is not None:
+        mamba_cache_size = start_args.mamba_cache_size
+        if mamba_cache_size is not None:
             assert (
-                linear_attn_cache_size >= start_args.running_max_req_size
-            ), "linear_attn_cache_size must be greater than running_max_req_size"
+                mamba_cache_size >= start_args.running_max_req_size
+            ), "mamba_cache_size must be greater than running_max_req_size"

         self.num_linear_k_heads = self.config["linear_num_key_heads"]
         self.num_linear_v_heads = self.config["linear_num_value_heads"]
@@ -78,9 +79,12 @@ def _init_mem_manager(self):
             self.head_linear_k_dim * self.num_linear_k_heads * 2 + self.head_linear_v_dim * self.num_linear_v_heads
         )

+        ssm_dtype_dict = {"bfloat16": torch.bfloat16, "float32": torch.float32}
+        assert start_args.mamba_ssm_data_type in ssm_dtype_dict
+
         self.mem_manager = Qwen3NextMemoryManager(
             full_attn_cache_size=self.max_total_token_num,
-            linear_attn_cache_size=linear_attn_cache_size,
+            linear_attn_cache_size=mamba_cache_size,
             dtype=self.data_type,
             num_kv_heads=self.num_kv_heads,
             head_dim=self.config["head_dim"],
@@ -89,7 +93,7 @@ def _init_mem_manager(self):
             full_attention_interval=self.config["full_attention_interval"],
             conv_state_dtype=self.data_type,
             conv_state_shape=(conv_kernel_size - 1 + mtp_step, conv_dim // self.tp_world_size_),
-            ssm_state_dtype=self.data_type,
+            ssm_state_dtype=ssm_dtype_dict[start_args.mamba_ssm_data_type],
             ssm_state_shape=(
                 # mtp_step + 1,
                 self.num_linear_v_heads // self.tp_world_size_,
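Both options are read from the server start arguments. A hedged argparse sketch of how the renamed and new flags might be declared; the flag names are inferred from the attribute accesses above, and the defaults are assumptions, not lightllm's actual CLI definition:

import argparse

parser = argparse.ArgumentParser()
# Renamed from --linear_attn_cache_size; when set, it must be at least
# running_max_req_size (enforced by the assert above).
parser.add_argument("--mamba_cache_size", type=int, default=None)
# Storage dtype for the SSM state cache; the model code maps this string
# to torch.bfloat16 or torch.float32 via ssm_dtype_dict.
parser.add_argument("--mamba_ssm_data_type", type=str, default="bfloat16",
                    choices=["bfloat16", "float32"])

args = parser.parse_args(["--mamba_ssm_data_type", "float32"])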

lightllm/models/qwen3next/triton_kernel/fla/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -6,3 +6,6 @@
 # The original source code was licensed under the MIT license and included
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+# Adapted from
+# https://github.com/vllm-project/vllm

lightllm/models/qwen3next/triton_kernel/fla_bak/__init__.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

lightllm/models/qwen3next/triton_kernel/fla_bak/chunk.py

Lines changed: 0 additions & 225 deletions
This file was deleted.
