intel
diff --git a/‎MANIFEST.in‎
Lines changed: 1 addition & 3 deletions b/‎MANIFEST.in‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎bin/RegisterTritonDialects.h‎
Lines changed: 4 additions & 0 deletions b/‎bin/RegisterTritonDialects.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/index.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/index.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/python-api/triton.language.extra.cuda.rst‎
Lines changed: 14 additions & 0 deletions b/‎docs/python-api/triton.language.extra.cuda.rst‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎python/triton/_utils.py‎
Lines changed: 23 additions & 21 deletions b/‎python/triton/_utils.py‎
Lines changed: 23 additions & 21 deletions
diff --git a/‎python/tutorials/11-programmatic-dependent-launch.py‎
Lines changed: 116 additions & 0 deletions b/‎python/tutorials/11-programmatic-dependent-launch.py‎
Lines changed: 116 additions & 0 deletions
diff --git a/‎setup.py‎
Lines changed: 80 additions & 11 deletions b/‎setup.py‎
Lines changed: 80 additions & 11 deletions
@@ -5,9 +5,7 @@ graft include
 graft lib
 graft python/src
 graft python/test
-graft python/triton/backends/amd
-graft python/triton/backends/nvidia
-graft python/triton/tools/extra/cuda
+graft python/triton
 graft test
 graft third_party
 graft unittest
 
@@ -28,6 +28,7 @@
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
 
+#include "nvidia/hopper/include/Transforms/Passes.h"
 #include "nvidia/include/Dialect/NVWS/Transforms/Passes.h"
 #include "nvidia/include/NVGPUToLLVM/Passes.h"
 #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h"
@@ -109,6 +110,9 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   // NVWS passes
   mlir::registerNVWSTransformsPasses();
 
+  // NVIDIA Hopper transform passes
+  mlir::registerNVHopperTransformsPasses();
+
   registry.insert<
       mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
       mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect,
 
@@ -29,6 +29,7 @@ Python API
 - :doc:`triton.language <python-api/triton.language>`
 - :doc:`triton.testing <python-api/triton.testing>`
 - :doc:`Triton semantics <python-api/triton-semantics>`
+- :doc:`triton.language.extra.cuda <python-api/triton.language.extra.cuda>`
 
 
 .. toctree::
 
@@ -0,0 +1,14 @@
+triton.language.extra.cuda
+==========================
+
+.. currentmodule:: triton.language.extra.cuda
+
+Programmatic Dependent Launch
+-----------------------------
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    gdc_wait
+    gdc_launch_dependents
@@ -1,35 +1,37 @@
+from __future__ import annotations
+
 from functools import reduce
+from typing import Any, Callable, TYPE_CHECKING, Union
+
+if TYPE_CHECKING:
+    from .language import core
+    IterableType = Union[list[Any], tuple[Any, ...], core.tuple, core.tuple_type]
+    ObjPath = tuple[int, ...]
 
 
-def get_iterable_path(iterable, path):
-    return reduce(lambda a, idx: a[idx], path, iterable)
+def get_iterable_path(iterable: IterableType, path: ObjPath) -> Any:
+    return reduce(lambda a, idx: a[idx], path, iterable)  # type: ignore[index]
 
 
-def set_iterable_path(iterable, path, val):
+def set_iterable_path(iterable: IterableType, path: tuple[int, ...], val: Any):
+    assert len(path) != 0
     prev = iterable if len(path) == 1 else get_iterable_path(iterable, path[:-1])
-    prev[path[-1]] = val
+    prev[path[-1]] = val  # type: ignore[index]
 
 
-def find_paths_if(iterable, pred):
+def find_paths_if(iterable: Union[IterableType, Any], pred: Callable[[ObjPath, Any], bool]) -> list[ObjPath]:
     from .language import core
-    is_iterable = lambda x: isinstance(x, (list, tuple, core.tuple, core.tuple_type))
-    ret = dict()
+    is_iterable: Callable[[Any], bool] = lambda x: isinstance(x, (list, tuple, core.tuple, core.tuple_type))
+    # Use a dict as an ordered set: dict preserves insertion order, whereas set does not guarantee order
+    ret: dict[ObjPath, None] = {}
 
-    def _impl(current, path):
-        path = (path[0], ) if len(path) == 1 else tuple(path)
+    def _impl(path: tuple[int, ...], current: Any):
         if is_iterable(current):
             for idx, item in enumerate(current):
-                _impl(item, path + (idx, ))
+                _impl((*path, idx), item)
         elif pred(path, current):
-            if len(path) == 1:
-                ret[(path[0], )] = None
-            else:
-                ret[tuple(path)] = None
-
-    if is_iterable(iterable):
-        _impl(iterable, [])
-    elif pred(list(), iterable):
-        ret = {tuple(): None}
-    else:
-        ret = dict()
+            ret[path] = None
+
+    _impl((), iterable)
+
     return list(ret.keys())
@@ -0,0 +1,116 @@
+"""
+Programmatic Dependent Launch
+=============================
+This script demonstrates the use of programmatic dependent launch (PDL) on top of the vector-add example using Triton.
+
+For CUDA reference on programmatic dependent launch see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization.
+For PTX reference on programmatic dependent launch see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol.
+
+.. code-block:: bash
+    python 11-programmatic-dependent-launch.py
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+
+def is_cuda():
+    return triton.runtime.driver.active.get_current_target().backend == "cuda"
+
+
+def supports_pdl():
+    return is_cuda() and torch.cuda.get_device_capability()[0] >= 9
+
+
+# In this example, the vector-add kernel optionally uses grid dependency control (GDC), selected by USE_GDC.
+@triton.jit
+def add_kernel(x_ptr,  #
+               y_ptr,  #
+               output_ptr,  #
+               n_elements,  #
+               BLOCK_SIZE: tl.constexpr,  #
+               USE_GDC: tl.constexpr,  #
+               ):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    if USE_GDC:
+        # GDC wait waits for ALL programs in the prior kernel to complete before continuing.
+        # This ensures any memory operations happen before the wait in program order,
+        # e.g. if the prior kernel writes to x or y the new values will be visible.
+        tl.extra.cuda.gdc_wait()
+
+    x = tl.load(x_ptr + offsets, mask=mask)
+    y = tl.load(y_ptr + offsets, mask=mask)
+    if USE_GDC:
+        # GDC launch dependents hints the runtime system to launch dependent kernels.
+        # These dependent kernels must also be launched with PDL enabled.
+        # Once GDC launch has been issued by ALL programs or
+        # programs have finished, the dependent grid can begin if there are enough resources.
+        # Note: this by itself provides no additional memory-ordering guarantees, unlike `gdc_wait`
+        tl.extra.cuda.gdc_launch_dependents()
+    output = x + y
+    tl.store(output_ptr + offsets, output, mask=mask)
+
+
+def add(x: torch.Tensor, y: torch.Tensor, launch_pdl: bool = True):
+    output = torch.empty_like(x)
+    assert x.device == y.device and output.device == x.device
+    n_elements = output.numel()
+    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
+    add_kernel[grid](
+        x, y, output, n_elements, BLOCK_SIZE=1024,
+        USE_GDC=launch_pdl,  # set constexpr in kernel to use grid dependence control
+        launch_pdl=launch_pdl,  # launch kernel with the PDL flag enabled
+    )
+    return output
+
+
+def validate(n_elements):
+    x = torch.rand(n_elements, device="cuda", dtype=torch.float32)
+    y = torch.rand(n_elements, device="cuda", dtype=torch.float32)
+
+    torch_result = x + y
+    add_result = add(x, y)
+
+    torch_vs_add = "✅" if torch.allclose(torch_result, add_result, atol=1.0) else "❌"
+    print(f"Number of Elements={n_elements} verification naive vs: ", end="")
+    print(f"add: {torch_vs_add}")
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["size"],
+        x_vals=[2**i for i in range(23, 28, 1)],
+        x_log=False,
+        line_arg="provider",
+        line_vals=["pdl-fp32", "fp32"],
+        line_names=["PDL", "No PDL"],
+        styles=[("red", "-"), ("blue", "-")],
+        ylabel='GB/s',
+        plot_name="pdl-performance",
+        args={},
+    ))
+def benchmark(size, provider):
+    x = torch.rand(size, device="cuda", dtype=torch.float32)
+    y = torch.rand(size, device="cuda", dtype=torch.float32)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    fn = lambda: add(x, y, "pdl" in provider)
+
+    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles, rep=100)
+
+    gbps = lambda ms: 3 * x.numel() * x.element_size() * 1e-9 / (ms * 1e-3)
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+if __name__ == "__main__":
+
+    if supports_pdl():
+        validate(1024)
+        benchmark.run(print_data=True, show_plots=True, save_path=".")
+    else:
+        print("PDL is not supported on this device")
@@ -20,10 +20,19 @@
 from setuptools.command.build_ext import build_ext
 from setuptools.command.build_py import build_py
 from setuptools.command.develop import develop
+from setuptools.command.egg_info import egg_info
+from setuptools.command.install import install
+from setuptools.command.sdist import sdist
+
 from dataclasses import dataclass
 
 import pybind11
 
+try:
+    from setuptools.command.bdist_wheel import bdist_wheel
+except ImportError:
+    from wheel.bdist_wheel import bdist_wheel
+
 try:
     from setuptools.command.editable_wheel import editable_wheel
 except ImportError:
@@ -602,6 +611,10 @@ def get_package_dirs():
     yield ("", "python")
 
     for backend in backends:
+        # we use symlinks for external plugins
+        if backend.is_external:
+            continue
+
         yield (f"triton.backends.{backend.name}", backend.backend_dir)
 
         if backend.language_dir:
@@ -620,8 +633,33 @@ def get_package_dirs():
         yield ("triton.profiler", "third_party/proton/proton")
 
 
-def add_link_to_backends():
+def get_packages():
+    yield from find_packages(where="python")
+
+    for backend in backends:
+        yield f"triton.backends.{backend.name}"
+
+        if backend.language_dir:
+            # Install the contents of each backend's `language` directory into
+            # `triton.language.extra`.
+            for x in os.listdir(backend.language_dir):
+                yield f"triton.language.extra.{x}"
+
+        if backend.tools_dir:
+            # Install the contents of each backend's `tools` directory into
+            # `triton.tools.extra`.
+            for x in os.listdir(backend.tools_dir):
+                yield f"triton.tools.extra.{x}"
+
+    if check_env_flag("TRITON_BUILD_PROTON", "ON"):  # Default ON
+        yield "triton.profiler"
+
+
+def add_link_to_backends(external_only):
     for backend in backends:
+        if external_only and not backend.is_external:
+            continue
+
         update_symlink(backend.install_dir, backend.backend_dir)
 
         if backend.language_dir:
@@ -650,23 +688,53 @@ def add_link_to_proton():
     update_symlink(proton_install_dir, proton_dir)
 
 
-def add_links():
-    add_link_to_backends()
-    if check_env_flag("TRITON_BUILD_PROTON", "ON"):  # Default ON
+def add_links(external_only):
+    add_link_to_backends(external_only=external_only)
+    if not external_only and check_env_flag("TRITON_BUILD_PROTON", "ON"):  # Default ON
         add_link_to_proton()
 
 
+class plugin_bdist_wheel(bdist_wheel):
+
+    def run(self):
+        add_links(external_only=True)
+        super().run()
+
+
 class plugin_develop(develop):
 
     def run(self):
-        add_links()
+        add_links(external_only=False)
         super().run()
 
 
 class plugin_editable_wheel(editable_wheel):
 
     def run(self):
-        add_links()
+        add_links(external_only=False)
+        super().run()
+
+
+class plugin_egg_info(egg_info):
+
+    def run(self):
+        add_links(external_only=True)
+        super().run()
+
+
+class plugin_install(install):
+
+    def run(self):
+        add_links(external_only=True)
+        super().run()
+
+
+class plugin_sdist(sdist):
+
+    def run(self):
+        for backend in backends:
+            if backend.is_external:
+                raise RuntimeError("sdist cannot be used with TRITON_PLUGIN_DIRS")
         super().run()
 
 
@@ -708,9 +776,6 @@ def get_git_version_suffix():
 # keep it separate for easy substitution
 TRITON_VERSION = "3.3.0" + get_git_version_suffix() + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", "")
 
-package_dirs = dict(get_package_dirs())
-extra_packages = [x for x in package_dirs if x != ""]
-
 setup(
     name=os.environ.get("TRITON_WHEEL_NAME", "triton"),
     version=TRITON_VERSION,
@@ -722,17 +787,21 @@ def get_git_version_suffix():
         "setuptools>=78.1.0",
         "importlib-metadata; python_version < '3.10'",
     ],
-    packages=find_packages(where="python") + extra_packages,
-    package_dir=package_dirs,
+    packages=list(get_packages()),
+    package_dir=dict(get_package_dirs()),
     entry_points=get_entry_points(),
     include_package_data=True,
     ext_modules=[CMakeExtension("triton", "triton/_C/")],
     cmdclass={
+        "bdist_wheel": plugin_bdist_wheel,
         "build_ext": CMakeBuild,
         "build_py": CMakeBuildPy,
         "clean": CMakeClean,
         "develop": plugin_develop,
         "editable_wheel": plugin_editable_wheel,
+        "egg_info": plugin_egg_info,
+        "install": plugin_install,
+        "sdist": plugin_sdist,
     },
     zip_safe=False,
     # for PyPI