
Commit 21a93fb

[TRTLLM-9992][perf] Enable PDL for CuteDSL kernels and overlap MoeOutputMemset (NVIDIA#10043)
Signed-off-by: Enwei Zhu <[email protected]>
1 parent 3f25db9 · commit 21a93fb

14 files changed: +259 −183 lines
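Note: the new .utils helpers that every file below imports (TRTLLM_ENABLE_PDL, griddepcontrol_wait, griddepcontrol_launch_dependents) are not part of this commit view. As a rough sketch of what such wrappers could look like, assuming they emit the PTX griddepcontrol instructions through the same llvm.inline_asm pattern used by the atomic-add helpers later in this diff, and assuming the flag is read from an environment variable of the same name:

    # Hypothetical sketch; the real definitions live in the .utils module, which is not shown here.
    import os

    from cutlass._mlir.dialects import llvm
    from cutlass.cutlass_dsl import dsl_user_op

    # Assumption: PDL is gated by an environment variable; the default used here is a guess.
    TRTLLM_ENABLE_PDL = os.environ.get("TRTLLM_ENABLE_PDL", "1") == "1"


    @dsl_user_op
    def griddepcontrol_wait(*, loc=None, ip=None):
        # Block until the preceding grid (the PDL parent) has flushed its global writes.
        llvm.inline_asm(None, [], "griddepcontrol.wait;", "",
                        has_side_effects=True, loc=loc, ip=ip)


    @dsl_user_op
    def griddepcontrol_launch_dependents(*, loc=None, ip=None):
        # Allow grids that depend on this one to start launching early.
        llvm.inline_asm(None, [], "griddepcontrol.launch_dependents;", "",
                        has_side_effects=True, loc=loc, ip=ip)

The placement is the same in every kernel touched here: use_pdl=TRTLLM_ENABLE_PDL on the launch, griddepcontrol_wait() right after the CTA-wide entry barrier, and griddepcontrol_launch_dependents() once the epilogue pipeline has drained.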

tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py

Lines changed: 0 additions & 26 deletions
@@ -2178,32 +2178,6 @@ def _(
             device=input_scale.device)
     return output, output_scale

-class FusedMoEInputsHelper:
-
-    def __init__(self, num_experts: int, top_k: int, num_local_experts: int,
-                 local_expert_offset: int):
-        self.num_experts = num_experts
-        self.top_k = top_k
-        self.num_local_experts = num_local_experts
-        self.local_expert_offset = local_expert_offset
-
-    def infer_shape_num_tokens(self, input_shapes: List[torch.Size]) -> int:
-        return input_shapes[0][0]
-
-    def inputs_pre_hook(self,
-                        inputs: List[torch.Tensor]) -> List[torch.Tensor]:
-        x, x_sf, token_selected_experts, token_final_scales, *others = inputs
-        num_tokens = token_selected_experts.size(0)
-        new_token_final_scales, new_token_selected_experts = torch.randn(
-            num_tokens,
-            self.num_experts,
-            device=token_selected_experts.device).topk(self.top_k, dim=-1)
-        new_token_selected_experts = new_token_selected_experts.to(
-            token_selected_experts.dtype)
-        new_token_final_scales = new_token_final_scales.softmax(dim=-1).to(
-            token_final_scales.dtype)
-        return x, x_sf, new_token_selected_experts, new_token_final_scales, *others
-
 class Sm100BlockScaledFusedMoERunner(TunableRunner):
     tuning_config_cache = dict()
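For reference, the removed inputs_pre_hook synthesized random but self-consistent routing tensors for autotuning. The same pattern stands alone as below (shapes are illustrative); drawing top-k over per-expert logits keeps the expert indices valid and lets the final scales be softmax-normalized over exactly the selected experts:

    import torch

    num_tokens, num_experts, top_k = 128, 64, 8

    # topk over random logits returns (scores, expert indices) that agree with each other.
    scores, selected_experts = torch.randn(num_tokens, num_experts).topk(top_k, dim=-1)
    final_scales = scores.softmax(dim=-1)

    assert selected_experts.shape == (num_tokens, top_k)
    assert int(selected_experts.max()) < num_experts
    assert torch.allclose(final_scales.sum(dim=-1), torch.ones(num_tokens))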

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py

Lines changed: 14 additions & 35 deletions
@@ -35,44 +35,18 @@
 import cutlass.utils as utils
 import cutlass.utils.blackwell_helpers as sm100_utils
 import cutlass.utils.blockscaled_layout as blockscaled_utils
-from cutlass._mlir.dialects import math, nvvm
+from cutlass._mlir.dialects import math
 from cutlass.cute.nvgpu import cpasync, tcgen05
-from cutlass.cute.typing import Float32
-from cutlass.cutlass_dsl import T, dsl_user_op

 from .custom_pipeline import PipelineCpAsyncUmma
-from .utils import is_power_of_2
-
-
-@dsl_user_op
-def fmin(
-    a: Union[float, Float32], b: Union[float, Float32], *, nan=False, loc=None, ip=None
-) -> Float32:
-    return Float32(
-        nvvm.fmin(
-            T.f32(),
-            Float32(a).ir_value(loc=loc, ip=ip),
-            Float32(b).ir_value(loc=loc, ip=ip),
-            nan=nan,
-            loc=loc,
-            ip=ip,
-        )
-    )
-
-
-def sigmoid_f32(a: Union[float, Float32], fastmath: bool = False) -> Union[float, Float32]:
-    """
-    Compute the sigmoid of the input tensor.
-    """
-    return cute.arch.rcp_approx(1.0 + cute.math.exp(-a, fastmath=fastmath))
-
-
-def silu_f32(a: Union[float, Float32], fastmath: bool = False) -> Union[float, Float32]:
-    """
-    Compute the silu of the input tensor.
-    """
-    return a * sigmoid_f32(a, fastmath=fastmath)
-
+from .utils import (
+    TRTLLM_ENABLE_PDL,
+    fmin,
+    griddepcontrol_launch_dependents,
+    griddepcontrol_wait,
+    is_power_of_2,
+    silu_f32,
+)

 """
 High-performance persistent blockscaled contiguous grouped dense GEMM with gather and SwiGLU fusion

@@ -819,6 +793,7 @@ class SharedStorage:
             smem=self.shared_storage.size_in_bytes(),
             stream=stream,
             min_blocks_per_mp=1,
+            use_pdl=TRTLLM_ENABLE_PDL,
         )
         return

@@ -1148,6 +1123,8 @@ def kernel(
         else:
             self.cta_sync_barrier.arrive_and_wait()

+        griddepcontrol_wait()
+
         #
         # Specialized Schedule warp
         #

@@ -2282,6 +2259,8 @@ def kernel(
             #
             c_pipeline.producer_tail()

+        griddepcontrol_launch_dependents()
+
     def epilog_tmem_copy_and_partition(
         self,
         tidx: cutlass.Int32,
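The silu_f32 and sigmoid_f32 helpers removed above now come from .utils; they implement the activation applied by the fused SwiGLU epilogue. A plain PyTorch reference of the same math is handy when checking the kernel numerically. The gate/up operand split and any clamping via fmin inside the fused kernel are not visible in this hunk, so the tensor names below are illustrative:

    import torch

    def swiglu_reference(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
        # silu(x) = x * sigmoid(x); the fused kernel applies this to the GEMM output
        # before the result is written back.
        return torch.nn.functional.silu(gate) * up

    x = torch.randn(4, 8, dtype=torch.float32)
    gate, up = x.chunk(2, dim=-1)
    manual = gate * torch.sigmoid(gate) * up  # same as silu_f32(gate) * up, up to rounding
    assert torch.allclose(swiglu_reference(gate, up), manual, atol=1e-6)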

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py

Lines changed: 11 additions & 1 deletion
@@ -52,7 +52,12 @@
 import cutlass.utils.blockscaled_layout as blockscaled_utils
 from cutlass.cute.nvgpu import cpasync, tcgen05

-from .utils import is_power_of_2
+from .utils import (
+    TRTLLM_ENABLE_PDL,
+    griddepcontrol_launch_dependents,
+    griddepcontrol_wait,
+    is_power_of_2,
+)


 class Sm100BlockScaledContiguousGroupedGemmKernel:

@@ -597,6 +602,7 @@ class SharedStorage:
             smem=self.shared_storage.size_in_bytes(),
             stream=stream,
             min_blocks_per_mp=1,
+            use_pdl=TRTLLM_ENABLE_PDL,
         )
         return

@@ -933,6 +939,8 @@ def kernel(
         else:
             self.cta_sync_barrier.arrive_and_wait()

+        griddepcontrol_wait()
+
         #
         # Specialized Schedule warp
         #

@@ -1597,6 +1605,8 @@ def kernel(
             #
             c_pipeline.producer_tail()

+        griddepcontrol_launch_dependents()
+
     def epilog_tmem_copy_and_partition(
         self,
         tidx: cutlass.Int32,
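The three changes above repeat in every kernel in this commit: the consolidated .utils import, use_pdl=TRTLLM_ENABLE_PDL at launch, and the two griddepcontrol calls inside kernel. Assuming the flag is read from an environment variable of the same name (the .utils source is not shown here), comparing runs with and without PDL would only require toggling it before the modules are imported:

    import os

    # Hypothetical toggle; set before importing tensorrt_llm so the module-level
    # constant in .utils picks it up. "1" enables programmatic dependent launch.
    os.environ["TRTLLM_ENABLE_PDL"] = "0"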

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py

Lines changed: 16 additions & 68 deletions
@@ -35,11 +35,17 @@
 import cutlass.utils as utils
 import cutlass.utils.blackwell_helpers as sm100_utils
 import cutlass.utils.blockscaled_layout as blockscaled_utils
-from cutlass._mlir.dialects import llvm
 from cutlass.cute.nvgpu import cpasync, tcgen05
-from cutlass.cutlass_dsl import Int32, T, dsl_user_op

-from .utils import is_power_of_2
+from .utils import (
+    TRTLLM_ENABLE_PDL,
+    atomic_add_func,
+    griddepcontrol_launch_dependents,
+    griddepcontrol_wait,
+    is_power_of_2,
+    vectorized_atomic_add_bf16x8,
+    vectorized_atomic_add_fp32x2,
+)

 """
 High-performance persistent blockscaled contiguous grouped dense GEMM (C = alpha * (SFA * A) * (SFB * B)) example for

@@ -259,8 +265,8 @@ def hooked_PersistentTileSchedulerParams_init(


 def hooked_get_cluster_work_idx_with_fastdivmod(
-    self, current_work_linear_idx: Int32, *, loc=None, ip=None
-) -> Tuple[Int32, Int32, Int32]:
+    self, current_work_linear_idx: cutlass.Int32, *, loc=None, ip=None
+) -> Tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32]:
     work_iteration, work_unit_id = divmod(current_work_linear_idx, self.params.batch_fdd)

     if self.params._raster_along_m:

@@ -287,69 +293,6 @@ def hooked_get_cluster_work_idx_with_fastdivmod(
     )


-# TODO(zhichenj): try to move these to NVVM wrapper or helper functions
-@dsl_user_op
-def vectorized_atomic_add_bf16x8(rOut_epi_packed, scatter_out_offset, loc=None, ip=None):
-    llvm.inline_asm(
-        None,
-        [
-            scatter_out_offset.iterator.llvm_ptr,
-            llvm.bitcast(T.i32(), rOut_epi_packed[0, None].load().ir_value()),
-            llvm.bitcast(T.i32(), rOut_epi_packed[1, None].load().ir_value()),
-            llvm.bitcast(T.i32(), rOut_epi_packed[2, None].load().ir_value()),
-            llvm.bitcast(T.i32(), rOut_epi_packed[3, None].load().ir_value()),
-        ],
-        "red.global.v4.bf16x2.add.noftz [$0], {$1, $2, $3, $4};",
-        "l,r,r,r,r",
-        has_side_effects=True,
-    )
-
-
-@dsl_user_op
-def vectorized_atomic_add_fp32x2(rOut_epi_packed, scatter_out_offset, loc=None, ip=None):
-    llvm.inline_asm(
-        None,
-        [
-            scatter_out_offset.iterator.llvm_ptr,
-            rOut_epi_packed[0].ir_value(),
-            rOut_epi_packed[1].ir_value(),
-        ],
-        "red.global.v2.f32.add [$0], {$1, $2};",
-        "l,f,f",
-        has_side_effects=True,
-    )
-
-
-@dsl_user_op
-def atomic_add_func(rOut_epi_packed, scatter_out_offset, loc=None, ip=None):
-    if cutlass.const_expr(rOut_epi_packed.dtype == cutlass.Float32):
-        llvm.inline_asm(
-            None,
-            [
-                scatter_out_offset.iterator.llvm_ptr,
-                rOut_epi_packed.ir_value(),
-            ],
-            "red.global.add.f32 [$0], $1;",
-            "l,f",
-            has_side_effects=True,
-            loc=loc,
-            ip=ip,
-        )
-    elif cutlass.const_expr(rOut_epi_packed.dtype == cutlass.BFloat16):
-        llvm.inline_asm(
-            None,
-            [
-                scatter_out_offset.iterator.llvm_ptr,
-                llvm.bitcast(T.i16(), rOut_epi_packed.ir_value()),
-            ],
-            "red.add.noftz.bf16 [$0], $1;",
-            "l,h",
-            has_side_effects=True,
-            loc=loc,
-            ip=ip,
-        )
-
-
 class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel:
     """This class implements batched matrix multiplication (C = A x SFA x B x SFB) with support for various data types
     and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

@@ -931,6 +874,7 @@ class SharedStorage:
             smem=self.shared_storage.size_in_bytes(),
             stream=stream,
             min_blocks_per_mp=1,
+            use_pdl=TRTLLM_ENABLE_PDL,
         )
         return

@@ -1286,6 +1230,8 @@ def kernel(
         else:
             self.cta_sync_barrier.arrive_and_wait()

+        griddepcontrol_wait()
+
         #
         # Specialized Schedule warp
         #

@@ -1940,6 +1886,8 @@ def kernel(
             self.epilog_sync_barrier.arrive_and_wait()
             tmem.free(tmem_ptr)

+        griddepcontrol_launch_dependents()
+
     def epilog_tmem_copy_and_partition(
         self,
         tidx: cutlass.Int32,
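The atomic-add helpers moved to .utils back the finalize step of this kernel's epilogue: each expert's output row is reduced into its destination row in global memory with red.global adds rather than by a separate finalize kernel. An unfused PyTorch reference of that reduction, with illustrative tensor names and the (assumed) application of the routing weights during accumulation:

    import torch

    def finalize_reference(expert_out: torch.Tensor,    # [num_expanded_rows, hidden]
                           final_scales: torch.Tensor,  # [num_expanded_rows]
                           dest_token: torch.Tensor,    # [num_expanded_rows], int64
                           num_tokens: int) -> torch.Tensor:
        out = torch.zeros(num_tokens, expert_out.size(-1), dtype=torch.float32,
                          device=expert_out.device)
        # index_add_ plays the role of the kernel's atomic red.global adds.
        out.index_add_(0, dest_token, expert_out.float() * final_scales.float()[:, None])
        return out

The vectorized variants simply issue the same reduction at a wider granularity: red.global.v4.bf16x2.add.noftz covers eight bf16 values per instruction and red.global.v2.f32.add covers two fp32 values.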

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_swiglu_fusion.py

Lines changed: 14 additions & 35 deletions
@@ -35,43 +35,17 @@
 import cutlass.utils as utils
 import cutlass.utils.blackwell_helpers as sm100_utils
 import cutlass.utils.blockscaled_layout as blockscaled_utils
-from cutlass._mlir.dialects import math, nvvm
+from cutlass._mlir.dialects import math
 from cutlass.cute.nvgpu import cpasync, tcgen05
-from cutlass.cute.typing import Float32
-from cutlass.cutlass_dsl import T, dsl_user_op
-
-from .utils import is_power_of_2
-
-
-@dsl_user_op
-def fmin(
-    a: Union[float, Float32], b: Union[float, Float32], *, nan=False, loc=None, ip=None
-) -> Float32:
-    return Float32(
-        nvvm.fmin(
-            T.f32(),
-            Float32(a).ir_value(loc=loc, ip=ip),
-            Float32(b).ir_value(loc=loc, ip=ip),
-            nan=nan,
-            loc=loc,
-            ip=ip,
-        )
-    )
-
-
-def sigmoid_f32(a: Union[float, Float32], fastmath: bool = False) -> Union[float, Float32]:
-    """
-    Compute the sigmoid of the input tensor.
-    """
-    return cute.arch.rcp_approx(1.0 + cute.math.exp(-a, fastmath=fastmath))
-
-
-def silu_f32(a: Union[float, Float32], fastmath: bool = False) -> Union[float, Float32]:
-    """
-    Compute the silu of the input tensor.
-    """
-    return a * sigmoid_f32(a, fastmath=fastmath)

+from .utils import (
+    TRTLLM_ENABLE_PDL,
+    fmin,
+    griddepcontrol_launch_dependents,
+    griddepcontrol_wait,
+    is_power_of_2,
+    silu_f32,
+)

 """
 High-performance persistent blockscaled contiguous grouped dense GEMM (C = alpha * (SFA * A) * (SFB * B)) example for

@@ -749,6 +723,7 @@ class SharedStorage:
             smem=self.shared_storage.size_in_bytes(),
             stream=stream,
             min_blocks_per_mp=1,
+            use_pdl=TRTLLM_ENABLE_PDL,
         )
         return

@@ -1087,6 +1062,8 @@ def kernel(
         else:
             self.cta_sync_barrier.arrive_and_wait()

+        griddepcontrol_wait()
+
         #
         # Specialized Schedule warp
         #

@@ -1949,6 +1926,8 @@ def kernel(
             #
             c_pipeline.producer_tail()

+        griddepcontrol_launch_dependents()
+
     def epilog_tmem_copy_and_partition(
         self,
         tidx: cutlass.Int32,

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py

Lines changed: 7 additions & 1 deletion
@@ -55,7 +55,8 @@
 from cutlass.cute.nvgpu import cpasync, tcgen05

 from .custom_pipeline import PipelineTmaUmma, PipelineUmmaAsync
-from .utils import is_power_of_2
+from .utils import (TRTLLM_ENABLE_PDL, griddepcontrol_launch_dependents,
+                    griddepcontrol_wait, is_power_of_2)


 class Sm100BlockScaledPersistentDenseGemmKernel:

@@ -578,6 +579,7 @@ class SharedStorage:
             smem=self.shared_storage.size_in_bytes(),
             min_blocks_per_mp=1,
             stream=stream,
+            use_pdl=TRTLLM_ENABLE_PDL,
         )
         return

@@ -869,6 +871,8 @@ def kernel(
         cute.arch.barrier(barrier_id=self.cta_sync_bar_id,
                           number_of_threads=self.threads_per_cta)

+        griddepcontrol_wait()
+
         #
         # Specialized TMA load warp
         #

@@ -1473,6 +1477,8 @@ def kernel(
             #
             c_pipeline.producer_tail()

+        griddepcontrol_launch_dependents()
+
     def mainloop_s2t_copy_and_partition(
         self,
         sSF: cute.Tensor,
