
Commit ca4d957

[NVIDIA] Enable Programmatic Dependent Launch in Triton (#6394)
Programmatic Dependent Launch (PDL) enables kernels within the same CUDA stream to overlap while programmatically resolving inter-kernel dependencies. This allows consecutive kernels to overlap their ramp-down and ramp-up periods, efficiently hiding prologue latencies. Inter-kernel dependencies are resolved using Grid Dependency Control (GDC), which ensures that a kernel waits before reading memory written by the preceding kernel. This feature is used in libraries including [CUTLASS](https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/dependent_kernel_launch.md).

Effectively using PDL in Triton requires calling `tl.extra.cuda.gdc_wait()` to wait for the prior kernel to finish writing its results. The most straightforward approach is to execute `tl.extra.cuda.gdc_wait()` before any `tl.load`, based on the conservative assumption that the prior kernel may have been launched with PDL and can write to any memory location. When using PDL, `tl.extra.cuda.gdc_launch_dependents()` allows the current kernel to trigger the launch of the next kernel. See the [CUDA documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization) for more information.

We use this feature in tutorial 11: a simple non-persistent kernel with a conservative approach to inter-kernel dependencies, run on Blackwell. This kernel achieves up to a 15% speedup:

![pdl_performance](https://github.com/user-attachments/assets/5d0a9a3b-38f6-4ae1-9a94-7ade22099a4f)

With more advanced PDL patterns, we can achieve up to 33% performance benefits on back-to-back layers in LLMs (see [_LLM Inference Performance and Optimization on NVIDIA GB200 NVL72_](https://www.nvidia.com/en-us/on-demand/session/gtc25-s72503/) at GTC 2025 for more details).

---------

Co-authored-by: dePaul Miller <[email protected]>
Co-authored-by: peterbell10 <[email protected]>
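As an illustration of the back-to-back pattern described above, here is a minimal sketch that is not part of this commit; `producer_kernel` and `consumer_kernel` are hypothetical names, and both kernels are launched on the same stream with `launch_pdl=True`:

```python
# Illustrative sketch only; producer_kernel/consumer_kernel are hypothetical.
import torch
import triton
import triton.language as tl


@triton.jit
def producer_kernel(buf_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    tl.store(buf_ptr + offsets, offsets.to(tl.float32), mask=mask)
    # Hint that dependent kernels may begin launching; memory ordering is
    # still enforced by gdc_wait() in the consumer.
    tl.extra.cuda.gdc_launch_dependents()


@triton.jit
def consumer_kernel(buf_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    # Conservative placement: wait before the first load so the producer's
    # writes to buf are guaranteed to be visible.
    tl.extra.cuda.gdc_wait()
    x = tl.load(buf_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x * 2.0, mask=mask)


n = 1 << 20
buf = torch.empty(n, device="cuda", dtype=torch.float32)
out = torch.empty_like(buf)
grid = (triton.cdiv(n, 1024), )
producer_kernel[grid](buf, n, BLOCK_SIZE=1024, launch_pdl=True)
consumer_kernel[grid](buf, out, n, BLOCK_SIZE=1024, launch_pdl=True)
```

Placing `gdc_wait()` before the first `tl.load` is the conservative choice described above; more aggressive placements can overlap more work but require reasoning about exactly which buffers the prior kernel writes.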
1 parent b4dcd8e commit ca4d957

File tree

7 files changed: +244 −63 lines


docs/index.rst

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ Python API
 - :doc:`triton.language <python-api/triton.language>`
 - :doc:`triton.testing <python-api/triton.testing>`
 - :doc:`Triton semantics <python-api/triton-semantics>`
+- :doc:`triton.language.extra.cuda <python-api/triton.language.extra.cuda>`


 .. toctree::
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
triton.language.extra.cuda
==========================

.. currentmodule:: triton.language.extra.cuda

Programmatic Dependent Launch
-----------------------------

.. autosummary::
    :toctree: generated
    :nosignatures:

    gdc_wait
    gdc_launch_dependents
Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
"""
Programmatic Dependent Launch
=============================
This script demonstrates the use of programmatic dependent launch (PDL) on top of the vector-add example using Triton.

For the CUDA reference on programmatic dependent launch see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization.
For the PTX reference on programmatic dependent launch see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol.

.. code-block:: bash

    python 11-programmatic-dependent-launch.py
"""

import torch
import triton
import triton.language as tl


def is_cuda():
    return triton.runtime.driver.active.get_current_target().backend == "cuda"


def supports_pdl():
    return is_cuda() and torch.cuda.get_device_capability()[0] >= 9


# In this example the vector-add kernel optionally uses grid dependency control (GDC).
@triton.jit
def add_kernel(x_ptr,  #
               y_ptr,  #
               output_ptr,  #
               n_elements,  #
               BLOCK_SIZE: tl.constexpr,  #
               USE_GDC: tl.constexpr,  #
               ):
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    if USE_GDC:
        # GDC wait waits for ALL programs in the prior kernel to complete before continuing.
        # This ensures any memory operations happen before the wait in program order,
        # e.g. if the prior kernel writes to x or y the new values will be visible.
        tl.extra.cuda.gdc_wait()

    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    if USE_GDC:
        # GDC launch dependents hints the runtime system to launch dependent kernels.
        # These dependent kernels must also be launched with PDL enabled.
        # Once GDC launch has been issued by ALL programs or
        # programs have finished, the dependent grid can begin if there are enough resources.
        # Note: this by itself provides no additional memory-ordering guarantees, unlike `gdc_wait`.
        tl.extra.cuda.gdc_launch_dependents()
    output = x + y
    tl.store(output_ptr + offsets, output, mask=mask)


def add(x: torch.Tensor, y: torch.Tensor, launch_pdl: bool = True):
    output = torch.empty_like(x)
    assert x.device == y.device and output.device == x.device
    n_elements = output.numel()
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
    add_kernel[grid](
        x, y, output, n_elements, BLOCK_SIZE=1024,
        USE_GDC=launch_pdl,  # set constexpr in kernel to use grid dependency control
        launch_pdl=launch_pdl,  # launch the kernel with the PDL flag enabled
    )
    return output


def validate(n_elements):
    x = torch.rand(n_elements, device="cuda", dtype=torch.float32)
    y = torch.rand(n_elements, device="cuda", dtype=torch.float32)

    torch_result = x + y
    add_result = add(x, y)

    torch_vs_add = "✅" if torch.allclose(torch_result, add_result, atol=1.0) else "❌"
    print(f"Number of Elements={n_elements} verification naive vs: ", end="")
    print(f"add: {torch_vs_add}")


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["size"],
        x_vals=[2**i for i in range(23, 28, 1)],
        x_log=False,
        line_arg="provider",
        line_vals=["pdl-fp32", "fp32"],
        line_names=["PDL", "No PDL"],
        styles=[("red", "-"), ("blue", "-")],
        ylabel='GB/s',
        plot_name="pdl-performance",
        args={},
    ))
def benchmark(size, provider):
    x = torch.rand(size, device="cuda", dtype=torch.float32)
    y = torch.rand(size, device="cuda", dtype=torch.float32)

    quantiles = [0.5, 0.2, 0.8]

    fn = lambda: add(x, y, "pdl" in provider)

    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles, rep=100)

    gbps = lambda ms: 3 * x.numel() * x.element_size() * 1e-9 / (ms * 1e-3)
    return gbps(ms), gbps(max_ms), gbps(min_ms)


if __name__ == "__main__":

    if supports_pdl():
        validate(1024)
        benchmark.run(print_data=True, show_plots=True, save_path=".")
    else:
        print("PDL is not supported on this device")

third_party/nvidia/backend/compiler.py

Lines changed: 1 addition & 0 deletions
@@ -108,6 +108,7 @@ class CUDAOptions:
     ptx_version: int = None
     enable_fp_fusion: bool = True
     launch_cooperative_grid: bool = False
+    launch_pdl: bool = False
     supported_fp8_dtypes: Tuple[str] = ("fp8e5", "fp8e4b15")
     deprecated_fp8_dtypes: Tuple[str] = ()
     default_dot_input_precision: str = "tf32"

third_party/nvidia/backend/driver.py

Lines changed: 67 additions & 63 deletions
@@ -122,6 +122,9 @@ def ty_to_cpp(ty):
     }[ty]


+_BASE_ARGS_FORMAT = "iiiKKppOOOOO"
+
+
 def make_launcher(constants, signature):

     def _expand_signature(sig, output):
@@ -184,7 +187,7 @@ def format_of(ty):
     signature = {i: s for i, s in enumerate(expand_signature)}

     args_format = ''.join([format_of(ty) for ty in signature.values()])
-    format = "iiiKKpOOOOO" + args_format
+    format = _BASE_ARGS_FORMAT + args_format

     flat_signature = []
     for sig in signature.values():
@@ -264,67 +267,65 @@ def format_of(ty):
   return cuLaunchKernelExHandle;
 }}

-static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
+static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
   void *params[] = {{ {', '.join(params)} }};
   if (gridX*gridY*gridZ > 0) {{
-    if ((num_ctas == 1) && (0 == launch_cooperative_grid)) {{
-      CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
-    }} else if ((num_ctas == 1) && (0 != launch_cooperative_grid)) {{
-      CUlaunchAttribute launchAttr[1];
+    // 4 attributes is the maximum we can currently pass
+    CUlaunchAttribute launchAttr[4];
+    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
+    if (cuLaunchKernelExHandle == NULL) {{
+      cuLaunchKernelExHandle = getLaunchKernelExHandle();
+    }}
+    CUlaunchConfig config;
+    config.gridDimX = gridX;
+    config.gridDimY = gridY;
+    config.gridDimZ = gridZ;
+
+    if (num_ctas != 1) {{
+      config.gridDimX *= clusterDimX;
+      config.gridDimY *= clusterDimY;
+      config.gridDimZ *= clusterDimZ;
+    }}
+
+    config.blockDimX = 32 * num_warps;
+    config.blockDimY = 1;
+    config.blockDimZ = 1;
+    config.sharedMemBytes = shared_memory;
+    config.hStream = stream;
+    config.attrs = launchAttr;
+    int num_attrs = 0;
+
+    if (launch_pdl != 0) {{
+      CUlaunchAttribute pdlAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1}};
+      launchAttr[num_attrs] = pdlAttr;
+      ++num_attrs;
+    }}
+
+    if (launch_cooperative_grid != 0) {{
       CUlaunchAttribute coopAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1}};
-      launchAttr[0] = coopAttr;
-
-      CUlaunchConfig config;
-      config.gridDimX = gridX;
-      config.gridDimY = gridY;
-      config.gridDimZ = gridZ;
-      config.blockDimX = 32 * num_warps;
-      config.blockDimY = 1;
-      config.blockDimZ = 1;
-      config.sharedMemBytes = shared_memory;
-      config.hStream = stream;
-      config.attrs = launchAttr;
-      config.numAttrs = 1;
-
-      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
-      if (cuLaunchKernelExHandle == NULL) {{
-        cuLaunchKernelExHandle = getLaunchKernelExHandle();
-      }}
-      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
-
-    }} else {{
-      CUlaunchAttribute launchAttr[3];
-      launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
-      launchAttr[0].value.clusterDim.x = clusterDimX;
-      launchAttr[0].value.clusterDim.y = clusterDimY;
-      launchAttr[0].value.clusterDim.z = clusterDimZ;
-      launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
-      launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
-
-      unsigned numAttrs = 2;
-      if (0 != launch_cooperative_grid) {{
-        CUlaunchAttribute coopAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1}};
-        launchAttr[2] = coopAttr;
-        numAttrs = 3;
-      }}
-
-      CUlaunchConfig config;
-      config.gridDimX = gridX * clusterDimX;
-      config.gridDimY = gridY * clusterDimY;
-      config.gridDimZ = gridZ * clusterDimZ;
-      config.blockDimX = 32 * num_warps;
-      config.blockDimY = 1;
-      config.blockDimZ = 1;
-      config.sharedMemBytes = shared_memory;
-      config.hStream = stream;
-      config.attrs = launchAttr;
-      config.numAttrs = numAttrs;
-      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
-      if (cuLaunchKernelExHandle == NULL) {{
-        cuLaunchKernelExHandle = getLaunchKernelExHandle();
-      }}
-      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
+      launchAttr[num_attrs] = coopAttr;
+      ++num_attrs;
+    }}
+
+    if (num_ctas != 1) {{
+      CUlaunchAttribute clusterAttr = {{}};
+      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+      clusterAttr.value.clusterDim.x = clusterDimX;
+      clusterAttr.value.clusterDim.y = clusterDimY;
+      clusterAttr.value.clusterDim.z = clusterDimZ;
+      launchAttr[num_attrs] = clusterAttr;
+      ++num_attrs;
+
+      CUlaunchAttribute clusterSchedulingAttr = {{}};
+      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
+      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
+      launchAttr[num_attrs] = clusterSchedulingAttr;
+      ++num_attrs;
     }}
+
+    config.numAttrs = num_attrs;
+
+    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
   }}
 }}
@@ -444,14 +445,15 @@ def format_of(ty):
   uint64_t _stream;
   uint64_t _function;
   int launch_cooperative_grid;
+  int launch_pdl;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  PyObject *global_scratch_obj = NULL;
  {newline.join([f"{_extracted_type(ty)} _arg{i};" for i, ty in signature.items()])}
  if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ,
-                      &_stream, &_function, &launch_cooperative_grid, &global_scratch_obj,
+                      &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj,
                       &kernel_metadata, &launch_metadata,
                       &launch_enter_hook, &launch_exit_hook{args_list})) {{
    return NULL;
@@ -485,7 +487,7 @@ def format_of(ty):
  {newline.join(ptr_decls)}
  {newline.join(tma_decls)}
  Py_BEGIN_ALLOW_THREADS;
-  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
+  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {{
    return NULL;
@@ -584,8 +586,8 @@ def wrap_handle_tensordesc(launcher, tensordesc_meta):
         return launcher

     def inner(*args):
-        meta_args = args[:11]
-        raw_kernel_args = args[11:]
+        meta_args = args[:len(_BASE_ARGS_FORMAT)]
+        raw_kernel_args = args[len(_BASE_ARGS_FORMAT):]
         tensordesc_idx = 0
         final_args = []
         for i, arg in enumerate(raw_kernel_args):
@@ -619,6 +621,7 @@ def __init__(self, src, metadata):
         self.global_scratch_size = metadata.global_scratch_size
         self.global_scratch_align = metadata.global_scratch_align
         self.launch_cooperative_grid = metadata.launch_cooperative_grid
+        self.launch_pdl = metadata.launch_pdl

     def __call__(self, gridX, gridY, gridZ, stream, function, *args):
         if self.global_scratch_size > 0:
@@ -627,7 +630,8 @@ def __call__(self, gridX, gridY, gridZ, stream, function, *args):
             global_scratch = _allocation._allocator(alloc_size, self.global_scratch_align, stream)
         else:
             global_scratch = None
-        self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
+        self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,
+                    global_scratch, *args)


 class CudaDriver(GPUDriver):

third_party/nvidia/language/cuda/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -1,6 +1,7 @@
 from . import libdevice

 from .utils import (globaltimer, num_threads, num_warps, smid, convert_custom_float8_sm70, convert_custom_float8_sm80)
+from .gdc import (gdc_launch_dependents, gdc_wait)

 __all__ = [
     "libdevice",
@@ -10,4 +11,6 @@
     "smid",
     "convert_custom_float8_sm70",
     "convert_custom_float8_sm80",
+    "gdc_launch_dependents",
+    "gdc_wait",
 ]

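With these exports, the helpers are reachable both as direct imports from `triton.language.extra.cuda` and under the `tl.extra.cuda` namespace used in the tutorial. A minimal in-kernel sketch, not from this commit and using a hypothetical `copy_kernel`:

```python
# Illustrative only; copy_kernel is a hypothetical example kernel.
import triton
import triton.language as tl


@triton.jit
def copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    tl.extra.cuda.gdc_wait()  # wait for the prior PDL-launched kernel's writes
    x = tl.load(src_ptr + offsets, mask=mask)
    tl.extra.cuda.gdc_launch_dependents()  # let the next PDL kernel start launching
    tl.store(dst_ptr + offsets, x, mask=mask)
```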