Try fix: add safe_exp and use it for gate-difference exponentials in FLA chunk kernels

sufubao · sufubao · commit 7216a2acc844 · 2025-12-12T16:33:28.000+08:00
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_delta_h.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_delta_h.py
@@ -14,7 +14,7 @@
 import triton.language as tl
 
 from .index import prepare_chunk_indices, prepare_chunk_offsets
-from .op import exp
+from .op import exp, safe_exp
 from .utils import use_cuda_graph
 from lightllm.common.triton_utils.autotuner import autotune
 
@@ -150,19 +150,18 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
 
         last_idx = min((i_t + 1) * BT, T) - 1
         if USE_G:
-            m_t = (i_t * BT + tl.arange(0, BT)) < T
             b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
             p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
             b_g = tl.load(p_g, boundary_check=(0,))
-            b_v = b_v * tl.where(m_t, exp(b_g_last - b_g), 0)[:, None]
+            b_v = b_v * safe_exp(b_g_last - b_g)[:, None]
             b_g_last = exp(b_g_last)
-            b_h1 *= b_g_last
+            b_h1 = b_h1 * b_g_last
             if K > 64:
-                b_h2 *= b_g_last
+                b_h2 = b_h2 * b_g_last
             if K > 128:
-                b_h3 *= b_g_last
+                b_h3 = b_h3 * b_g_last
             if K > 192:
-                b_h4 *= b_g_last
+                b_h4 = b_h4 * b_g_last
 
         if USE_GK:
             o_k1 = tl.arange(0, 64)
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_o.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_o.py
@@ -16,7 +16,7 @@
 import triton.language as tl
 
 from .index import prepare_chunk_indices
-from .op import exp
+from .op import exp, safe_exp
 from .utils import FLA_GDN_FIX_BT, check_shared_mem, is_nvidia_hopper
 from lightllm.common.triton_utils.autotuner import autotune
 
@@ -103,7 +103,7 @@ def chunk_fwd_kernel_o(
         p_g = tl.make_block_ptr(g, (T,), (H,), (i_t * BT,), (BT,), (0,))
         b_g = tl.load(p_g, boundary_check=(0,))
         b_o = b_o * exp(b_g)[:, None]
-        b_A = b_A * exp(b_g[:, None] - b_g[None, :])
+        b_A = b_A * safe_exp(b_g[:, None] - b_g[None, :])
 
     o_t = i_t * BT + tl.arange(0, BT)
     m_t = o_t < T
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_scaled_dot_kkt.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_scaled_dot_kkt.py
@@ -14,7 +14,7 @@
 import triton.language as tl
 
 from .index import prepare_chunk_indices
-from .op import exp
+from .op import exp, safe_exp
 from lightllm.common.triton_utils.autotuner import autotune
 
 triton.set_allocator
@@ -80,7 +80,7 @@ def chunk_scaled_dot_kkt_fwd_kernel(
         p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
         b_g = tl.load(p_g, boundary_check=(0,))
         b_g_diff = b_g[:, None] - b_g[None, :]
-        b_A = b_A * exp(b_g_diff)
+        b_A = b_A * safe_exp(b_g_diff)
 
     b_A *= b_beta[:, None]
     m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t)
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/op.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/op.py
@@ -19,6 +19,16 @@
 log2 = tl.log2
 
 
+@triton.jit
+def safe_exp(x):
+    """
+    Return exp(x) for elements with x <= 0; other elements are set to -inf first.
+    Assuming `exp` maps -inf to 0, elements failing `x <= 0` therefore yield 0.
+    exp is never evaluated on a positive argument, so the result cannot overflow.
+    """
+    return exp(tl.where(x <= 0, x, float("-inf")))
+
+
 if not is_gather_supported:
 
     @triton.jit