Commit 8eb3861

Merge commit '16961b79bdac1b774b42d44e52fd55a266ec2866'
2 parents d545f3d + 16961b7

20 files changed: +355, -190 lines

include/triton/Conversion/TritonGPUToLLVM/Utility.h
Lines changed: 7 additions & 0 deletions

@@ -543,6 +543,13 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
 
+[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
+    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
+    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
+    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
+    Value laneId, Value warpId,
+    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
+
 SmallVector<Value> loadSharedToDistributed(triton::gpu::LocalLoadOp localLoadOp,
                                            Type elemLlvmTy,
                                            const SharedMemoryObject &smemObj,
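
The new overload lets a caller thread in laneId/warpId values it has already materialized, instead of having the helper emit getLaneAndWarpId on every call. A minimal sketch of a hypothetical call site (regLayout, smemObj, targetInfo, and the callback body are placeholders, not code from this commit):

    // Compute lane/warp IDs once, reuse them across several transfers.
    auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
    bool ok = emitTransferBetweenRegistersAndShared(
        regLayout, sharedTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemObj,
        loc, rewriter, targetInfo, laneId, warpId,
        [&](VectorType vecTy, Value shmemAddr) {
          // emit the per-vector load or store against shmemAddr here
        });
    if (!ok)
      return failure(); // [[nodiscard]] makes ignoring the result a warning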

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h
Lines changed: 4 additions & 1 deletion

@@ -20,7 +20,7 @@ static const char *kLoopStageAttrName = "loop.stage";
 static const char *kLoopClusterAttrName = "loop.cluster";
 static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
 class CoarseSchedule;
-
+class ModuleAxisInfoAnalysis;
 //===----------------------------------------------------------------------===//
 // Hoisting Utilities
 //===----------------------------------------------------------------------===//
@@ -87,6 +87,9 @@ std::pair<Operation *, int64_t> getDefiningOpAndDistance(scf::ForOp forOp,
 int getCopyVecBytes(RankedTensorType registerTy,
                     gpu::SharedEncodingTrait sharedEnc);
 
+bool canBeConvertedToAsyncLoad(
+    triton::LoadOp loadOp, triton::ModuleAxisInfoAnalysis &axisInfoAnalysis);
+
 // Serialize the latencies of the operations in the loops into the latency
 // attribute.
 void serializeLatencies(ModuleOp module, DenseMap<Operation *, int> &opLatency);
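
ModuleAxisInfoAnalysis is forward-declared rather than included because the new canBeConvertedToAsyncLoad declaration only takes it by reference, which compiles against an incomplete type and keeps this widely-included header light. A self-contained illustration of the pattern (SomeAnalysis and useAnalysis are made-up names, not Triton APIs):

    class SomeAnalysis;                          // incomplete type: fine here
    bool useAnalysis(int op, SomeAnalysis &a);   // by-reference declaration
    // Only the .cpp that defines useAnalysis needs the complete type, just as
    // PipeliningUtility.cpp now includes triton/Analysis/AxisInfo.h.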

lib/Conversion/TritonGPUToLLVM/Utility.cpp
Lines changed: 12 additions & 1 deletion

@@ -417,6 +417,18 @@ bool emitTransferBetweenRegistersAndShared(
     std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
     Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
+  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
+  return emitTransferBetweenRegistersAndShared(
+      regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
+      target, laneId, warpId, perVectorCallback);
+}
+
+bool emitTransferBetweenRegistersAndShared(
+    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
+    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
+    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
+    Value laneId, Value warpId,
+    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
   MLIRContext *ctx = rewriter.getContext();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
 
@@ -458,7 +470,6 @@ bool emitTransferBetweenRegistersAndShared(
       maxVecElems.value_or(std::numeric_limits<int>::max()));
 
   auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
-  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
   Value blockId =
       withCTAOffset ? target.getClusterCTAId(rewriter, loc) : b.i32_val(0);
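
The old entry point is kept as a thin wrapper that computes the defaults and forwards, so existing callers are untouched. The shape of the refactor, reduced to a self-contained sketch (names are illustrative):

    int defaultExtra();                  // stands in for getLaneAndWarpId
    bool doTransfer(int x, int extra);   // new overload: explicit argument
    bool doTransfer(int x) {             // old overload: now a thin wrapper
      return doTransfer(x, defaultExtra());
    }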

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp
Lines changed: 1 addition & 25 deletions

@@ -106,34 +106,10 @@ class AssignLoadLatencies {
     return !incompatible;
   }
 
-  bool isSmallLoad(tt::LoadOp loadOp,
-                   tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
-    assert(!isLoadFromTensorPtr(loadOp) &&
-           "Block ptr should have been lowered before this pass.");
-    auto ptr = loadOp.getPtr();
-    unsigned vec = axisInfoAnalysis.getContiguity(ptr);
-    if (auto mask = loadOp.getMask())
-      vec = std::min<unsigned>(vec, axisInfoAnalysis.getMaskAlignment(mask));
-
-    auto tensorTy = dyn_cast<RankedTensorType>(ptr.getType());
-    if (!tensorTy)
-      return true;
-    auto ty = cast<tt::PointerType>(tensorTy.getElementType()).getPointeeType();
-    unsigned width = vec * ty.getIntOrFloatBitWidth();
-
-    // We do not pipeline all loads for the following reasons:
-    // 1. On nvidia GPUs, cp.async's cp-size can only be 4, 8, or 16.
-    // 2. It's likely that pipling small loads won't offer much performance
-    //    improvement and may even hurt performance by increasing register
-    //    pressure.
-    LDBG("Load " << *loadOp << " has width " << width);
-    return width < 32;
-  }
-
   bool isPipeliningBeneficial(Operation *op, Operation *finalUser,
                               tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
     if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
-      if (isSmallLoad(loadOp, axisInfoAnalysis)) {
+      if (!canBeConvertedToAsyncLoad(loadOp, axisInfoAnalysis)) {
         LDBG("Load " << *loadOp << " is too small for pipelining");
         return false;
       }
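
The replacement flips the predicate's polarity: isSmallLoad answered "should we reject?", while canBeConvertedToAsyncLoad answers "can we accept?". The scalar-pointer case flips with it (old: return true, i.e. small; new: return false, i.e. not convertible), so the call site behaves the same. A self-contained equivalence check using simplified stand-ins (plain values, not the real MLIR signatures):

    #include <cassert>

    // widthBits models vec * element bit width from the original code.
    bool isSmallLoad(unsigned widthBits, bool isTensorPtr) {
      if (!isTensorPtr)
        return true;          // old code: scalar loads counted as "small"
      return widthBits < 32;
    }
    bool canBeConvertedToAsyncLoad(unsigned widthBits, bool isTensorPtr) {
      if (!isTensorPtr)
        return false;         // new code: scalar loads are not convertible
      return widthBits >= 32;
    }
    int main() {
      for (bool tensor : {false, true})
        for (unsigned w : {8u, 16u, 31u, 32u, 128u})
          assert(isSmallLoad(w, tensor) == !canBeConvertedToAsyncLoad(w, tensor));
    }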

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp
Lines changed: 13 additions & 5 deletions

@@ -1,6 +1,7 @@
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "triton/Analysis/AxisInfo.h"
 #include "triton/Analysis/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Types.h"
@@ -441,7 +442,8 @@ bool loadRequiresAdditionalBuffer(Operation *loadOp) {
   return false;
 }
 
-scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {
+scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
+                      triton::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
   llvm::MapVector<Operation *, AsyncLoad> asyncLoads;
   llvm::MapVector<int, LoadGroupInfo> loadGroups;
   // Only visit the top level ops, we do not support pipelining conditional
@@ -457,9 +459,13 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {
     SharedEncodingTrait sharedEncoding = getSharedEncoding(&op);
     // Do not create async loads for small loads (cp.async requires at least 4
     // bytes)
+    bool canUseAsyncCp =
+        isa<tt::LoadOp>(op) &&
+        canBeConvertedToAsyncLoad(cast<tt::LoadOp>(op), axisInfoAnalysis);
     int copyVecBytes = getCopyVecBytes(
         cast<RankedTensorType>(op.getResultTypes()[0]), sharedEncoding);
-    if (copyVecBytes >= 4 || isTMALoad(&op)) {
+    canUseAsyncCp &= copyVecBytes >= 4;
+    if (canUseAsyncCp || isTMALoad(&op)) {
       if (loadRequiresAdditionalBuffer(&op)) {
         // Allocate additional buffer required by the wgmma pipelining.
         stageDiff += 1;
@@ -1008,26 +1014,28 @@ scf::ForOp lowerMMAs(scf::ForOp forOp, CoarseSchedule &schedule) {
 // LOWER LOOP
 /////////////////////////////
 
-void lowerLoop(scf::ForOp forOp) {
+void lowerLoop(scf::ForOp forOp,
+               triton::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
   CoarseSchedule schedule;
   if (failed(schedule.deSerialize(forOp))) {
     return;
   }
   scf::ForOp newForOp = lowerMMAs(forOp, schedule);
-  newForOp = lowerLoads(newForOp, schedule);
+  newForOp = lowerLoads(newForOp, schedule, axisInfoAnalysis);
   newForOp = lowerTMADescriptors(newForOp, schedule);
   schedule.serialize(newForOp);
 }
 
 } // namespace
 
 void lowerLoops(ModuleOp moduleOp) {
+  triton::ModuleAxisInfoAnalysis axisInfoAnalysis(moduleOp);
   SmallVector<scf::ForOp> loops;
   moduleOp->walk([&](scf::ForOp forOp) { loops.push_back(forOp); });
   if (loops.empty())
    return;
   for (auto forOp : loops) {
-    lowerLoop(forOp);
+    lowerLoop(forOp, axisInfoAnalysis);
   }
 }
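
Note the design choice: ModuleAxisInfoAnalysis is built once per module in lowerLoops and threaded down through lowerLoop into lowerLoads, rather than recomputed per loop. The per-load decision now combines three conditions, with TMA loads bypassing them all; reduced to plain booleans it looks roughly like this (a sketch, not the real types):

    // Sketch of the new gating in lowerLoads.
    bool shouldLowerToAsyncCopy(bool isLoadOp, bool convertibleToAsync,
                                int copyVecBytes, bool isTMALoad) {
      bool canUseAsyncCp = isLoadOp && convertibleToAsync; // contiguity/width
      canUseAsyncCp &= copyVecBytes >= 4;  // cp.async cp-size is 4, 8, or 16
      return canUseAsyncCp || isTMALoad;   // TMA loads take their own path
    }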

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp
Lines changed: 30 additions & 0 deletions

@@ -6,6 +6,7 @@
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Support/LLVM.h"
+#include "triton/Analysis/AxisInfo.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
@@ -14,8 +15,13 @@
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include <queue>
 
+#define DEBUG_TYPE "triton-loop-pipeline"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
 using namespace mlir;
 namespace tt = mlir::triton;
 namespace ttg = mlir::triton::gpu;
@@ -313,6 +319,30 @@ int mlir::triton::getCopyVecBytes(RankedTensorType registerTy,
   return vecElems * registerTy.getElementTypeBitWidth() / 8;
 }
 
+bool mlir::triton::canBeConvertedToAsyncLoad(
+    tt::LoadOp loadOp, tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
+  assert(!isLoadFromTensorPtr(loadOp) &&
+         "Block ptr should have been lowered before this pass.");
+  auto ptr = loadOp.getPtr();
+  unsigned vec = axisInfoAnalysis.getContiguity(ptr);
+  if (auto mask = loadOp.getMask())
+    vec = std::min<unsigned>(vec, axisInfoAnalysis.getMaskAlignment(mask));
+
+  auto tensorTy = dyn_cast<RankedTensorType>(ptr.getType());
+  if (!tensorTy)
+    return false;
+  auto ty = cast<tt::PointerType>(tensorTy.getElementType()).getPointeeType();
+  unsigned width = vec * ty.getIntOrFloatBitWidth();
+
+  // We do not pipeline all loads for the following reasons:
+  // 1. On nvidia GPUs, cp.async's cp-size can only be 4, 8, or 16.
+  // 2. It's likely that pipling small loads won't offer much performance
+  //    improvement and may even hurt performance by increasing register
+  //    pressure.
+  LDBG("Load " << *loadOp << " has width " << width);
+  return width >= 32;
+}
+
 void mlir::triton::serializeLatencies(ModuleOp module,
                                       DenseMap<Operation *, int> &opLatency) {
   auto helper = TritonDialect::getLoaded(module)->getLatencyAttrHelper();
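
The eligibility test multiplies the achievable contiguity (clamped by mask alignment when a mask is present) by the element bit width and requires at least 32 bits. A worked check over a few representative loads (the values are illustrative, not taken from the commit):

    #include <cstdio>

    int main() {
      // width = vec (contiguous elements) * element bit width
      struct Case { unsigned vec, bits; };
      Case loads[] = {{8, 16}, {2, 16}, {1, 32}, {1, 8}};
      for (auto [vec, bits] : loads)
        std::printf("vec=%u x %2u-bit -> width=%3u -> %s\n", vec, bits,
                    vec * bits, vec * bits >= 32 ? "async-eligible" : "too small");
      return 0;
    }
    // Prints: 8x16 -> 128 (eligible), 2x16 -> 32 (eligible),
    //         1x32 -> 32 (eligible), 1x8 -> 8 (too small).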

python/src/llvm.cc
Lines changed: 8 additions & 0 deletions

@@ -316,6 +316,14 @@ void init_triton_llvm(py::module &&m) {
   CGSCCAnalysisManager cgam;
   ModuleAnalysisManager mam;
 
+  if (arch.empty()) {
+    llvm::TargetLibraryInfoImpl TLII;
+    TLII.disableAllFunctions();
+    fam.registerPass([TLII = std::move(TLII)] {
+      return llvm::TargetLibraryAnalysis(TLII);
+    });
+  }
+
   PassInstrumentationCallbacks *instrCbPtr = nullptr;
   PassInstrumentationCallbacks passInstrCb;
   StandardInstrumentations standardInstr(mod->getContext(),
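
When no target architecture is specified, the new block registers a TargetLibraryAnalysis preloaded with disableAllFunctions(), which reports every C library routine as unavailable; optimizer passes then neither recognize nor synthesize libcalls (memcpy/memset idioms and the like). A standalone sketch of the same registration, assuming LLVM's new-pass-manager headers:

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/PassManager.h"

    // Preload a TargetLibraryInfo with all libcalls disabled and hand it to
    // the FunctionAnalysisManager, mirroring the guarded block above.
    void registerNoLibcalls(llvm::FunctionAnalysisManager &fam) {
      llvm::TargetLibraryInfoImpl TLII;
      TLII.disableAllFunctions();
      fam.registerPass(
          [TLII = std::move(TLII)] { return llvm::TargetLibraryAnalysis(TLII); });
    }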

python/test/gluon/test_frontend.py
Lines changed: 14 additions & 4 deletions

@@ -13,7 +13,7 @@
 from triton._filecheck import filecheck_test, run_parser
 import triton.language as tl
 from triton._internal_testing import is_cuda
-from triton.compiler.errors import CompilationError
+from triton.compiler.errors import CompilationError, CompileTimeAssertionFailure
 
 TARGET_PAT = re.compile('ttg.target = "[^"]*"')
 
@@ -604,10 +604,10 @@ def kernel():
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
   tt.func public @kernel() attributes {noinline = false} {
     %0 = ttg.local_alloc : () -> !ttg.memdesc<32x32xi32, #shared, #smem, mutable>
-    tt.call @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_constexpr[1]_constexpr[0]____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(constexpr_1_ ,constexpr_0_), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%0) : (!ttg.memdesc<32x32xi32, #shared, #smem, mutable>) -> ()
+    tt.call @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_1_0____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(1 ,0), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%0) : (!ttg.memdesc<32x32xi32, #shared, #smem, mutable>) -> ()
     tt.return
   }
-  tt.func private @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_constexpr[1]_constexpr[0]____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(constexpr_1_ ,constexpr_0_), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%arg0: !ttg.memdesc<32x32xi32, #shared, #smem, mutable>) attributes {noinline = false} {
+  tt.func private @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_1_0____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(1 ,0), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%arg0: !ttg.memdesc<32x32xi32, #shared, #smem, mutable>) attributes {noinline = false} {
     tt.return
   }
 }
@@ -855,7 +855,7 @@ def test_tensor_permute():
 def test_split_join():
     # CHECK: [[BLOCKED:#.*]] = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
     # CHECK: [[BLOCKED1:#.*]] = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
-    layout: ttgl.constexpr = ttgl.BlockedLayout([2], [32], [4], [0])
+    layout: ttgl.constexpr = ttgl.BlockedLayout([2], [32], [4], [0], [1], [1], [0])
     a = ttgl.full([128], 1, ttgl.int32, layout)
     b = ttgl.full([128], 2, ttgl.int32, layout)
     # CHECK: tt.join {{.*}} : tensor<128xi32, [[BLOCKED]]> -> tensor<128x2xi32, [[BLOCKED1]]>
@@ -883,6 +883,16 @@ def test_tensor_reshape():
     ttgl.static_assert(v.type.layout == expect_layout)
 
 
+@gluon.jit
+def static_assert_kernel():
+    ttgl.static_assert(False)
+
+
+def test_static_assert():
+    with pytest.raises(CompileTimeAssertionFailure):
+        run_parser(static_assert_kernel)
+
+
 @filecheck_test
 @gluon.jit
 def test_zeros():

python/triton/compiler/code_generator.py
Lines changed: 5 additions & 0 deletions

@@ -637,6 +637,8 @@ def _apply_binary_method(self, method_name, lhs, rhs):
         if _is_triton_tensor(rhs):
             reverse_method_name = re.sub(r"__(.*)__", r"__r\1__", method_name)
             return getattr(rhs, reverse_method_name)(lhs, _semantic=self.semantic)
+        if not isinstance(lhs, (constexpr, language.tuple)) and isinstance(rhs, constexpr):
+            lhs = constexpr(lhs)
         return getattr(lhs, method_name)(rhs)
 
     def visit_BinOp(self, node):
@@ -1457,9 +1459,12 @@ def ret(self, node: ast.Call):
 
     return ret
 
+from ..experimental.gluon import language as ttgl
 statically_implemented_functions: Dict[object, Callable[[ast.Call], Any]] = {
     language.core.static_assert: execute_static_assert,
     language.core.static_print: static_executor(print),
+    ttgl.static_assert: execute_static_assert,
+    ttgl.static_print: static_executor(print),
     int: static_executor(int),
     len: static_executor(len),
 }

python/triton/compiler/compiler.py
Lines changed: 1 addition & 2 deletions

@@ -10,8 +10,6 @@
 from ..runtime.cache import get_cache_manager, get_dump_manager, get_override_manager
 from ..runtime.driver import driver
 from ..tools.disasm import get_sass, get_spvdis
-# TODO: this shouldn't be here
-from .code_generator import ast_to_ttir
 from pathlib import Path
 import re
 import functools
@@ -81,6 +79,7 @@ def hash(self):
         return hashlib.sha256(key.encode("utf-8")).hexdigest()
 
     def make_ir(self, options, codegen_fns, module_map, context):
+        from .code_generator import ast_to_ttir
         return ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
                            module_map=module_map)
