Commit 65c7f47

Merge commit '4ff1fd66c2cea812226cc02aaa461e4355977ed7'
2 parents: be3b9ad + 4ff1fd6

File tree: 9 files changed, +197 -52 lines

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 7 additions & 9 deletions
@@ -6,6 +6,7 @@
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
 #include "triton/Analysis/Utility.h"
 #include "triton/Conversion/MLIRTypes.h"
 #include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h"
@@ -364,17 +365,14 @@ inline bool isKernel(FunctionOpInterface funcOp) {

 inline Value getStackPointer(RewriterBase &rewriter,
                              FunctionOpInterface funcOp) {
+  if (!isKernel(funcOp)) {
+    return funcOp.getArgument(funcOp.getNumArguments() - 1);
+  }
+
   auto mod = funcOp->getParentOfType<ModuleOp>();
-  LLVM::GlobalOp globalBase = nullptr;
-  mod.walk([&](LLVM::GlobalOp op) {
-    if (op.getSymName() == "global_smem")
-      globalBase = op;
-  });
+  auto globalBase = dyn_cast<LLVM::GlobalOp>(mod.lookupSymbol("global_smem"));
   assert(globalBase);
-  if (isKernel(funcOp))
-    return rewriter.create<LLVM::AddressOfOp>(funcOp.getLoc(), globalBase);
-  else
-    return funcOp.getArgument(funcOp.getNumArguments() - 1);
+  return rewriter.create<LLVM::AddressOfOp>(funcOp.getLoc(), globalBase);
 }

 inline Value getSharedMemoryBase(Location loc, RewriterBase &rewriter,
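
The rewrite above drops the walk over every LLVM::GlobalOp in favor of a direct symbol-table lookup of global_smem, and handles the non-kernel case with an early return. A minimal standalone sketch of the lookup pattern, not taken from this commit (the helper name and the dyn_cast_or_null guard are illustrative):

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/BuiltinOps.h"

// Illustrative only: fetch the shared-memory global by symbol name instead of
// walking all globals. dyn_cast_or_null tolerates a missing symbol, whereas
// the committed code asserts that "global_smem" exists.
static mlir::LLVM::GlobalOp lookupGlobalSmem(mlir::ModuleOp mod) {
  return llvm::dyn_cast_or_null<mlir::LLVM::GlobalOp>(
      mod.lookupSymbol("global_smem"));
}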

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 11 additions & 0 deletions
@@ -602,6 +602,12 @@ def TT_GetProgramIdOp : TT_Op<"get_program_id", [Pure]> {

   let assemblyFormat = "$axis attr-dict `:` type($result)";

+  let builders = [
+    OpBuilder<(ins "int":$axis), [{
+      build($_builder, $_state, $_builder.getI32Type(), ProgramIDDimAttr::get($_builder.getContext(), ProgramIDDim(axis)));
+    }]>
+  ];
+
   let extraClassDeclaration = [{
     int32_t getAxisAsInt() {
       return static_cast<int32_t>(getAxis());
@@ -615,6 +621,11 @@ def TT_GetNumProgramsOp : TT_Op<"get_num_programs", [Pure]> {
   let results = (outs I32:$result);

   let assemblyFormat = "$axis attr-dict `:` type($result)";
+  let builders = [
+    OpBuilder<(ins "int":$axis), [{
+      build($_builder, $_state, $_builder.getI32Type(), ProgramIDDimAttr::get($_builder.getContext(), ProgramIDDim(axis)));
+    }]>
+  ];

   let extraClassDeclaration = [{
     int32_t getAxisAsInt() {
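
These TableGen builders let C++ call sites create both ops from a plain integer axis; the generated build() overload supplies the i32 result type and the ProgramIDDimAttr. A hedged sketch of such a call site (the surrounding builder and location are assumed context, not part of this commit):

#include "mlir/IR/Builders.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

// Illustrative fragment: with the new builders the axis is passed directly,
// instead of constructing the result type and attribute by hand.
static mlir::Value emitProgramIdX(mlir::OpBuilder &builder, mlir::Location loc) {
  return builder.create<mlir::triton::GetProgramIdOp>(loc, /*axis=*/0);
}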

lib/Tools/LinearLayout.cpp

Lines changed: 1 addition & 1 deletion
@@ -681,11 +681,11 @@ LinearLayout::divideRight(const LinearLayout &divisor) const {
       std::move(newBases), std::move(newOutDims.takeVector()),
       /*requireSurjective=*/false);
   LDBG("candidate_quotient:" << candidateQuotient);
-  LDBG("*candidate_quotient * divisor=" << *candidateQuotient * divisor);
   if (!candidateQuotient.has_value()) {
     LDBG("candidate quotient failed invariant checks");
     return std::nullopt;
   }
+  LDBG("*candidate_quotient * divisor=" << *candidateQuotient * divisor);
   if (*candidateQuotient * divisor != *this) {
     LDBG("candidate quotient failed invariant checks");
     return std::nullopt;
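
The one-line move matters because the LDBG statement dereferences candidateQuotient; doing so before the has_value() check would dereference an empty std::optional, which is undefined behavior. A minimal, self-contained sketch of the safe ordering (names are hypothetical, not from this commit):

#include <iostream>
#include <optional>

// Sketch of the pattern the fix restores: check the optional before *deref.
static std::optional<int> tryQuotient(bool ok) {
  return ok ? std::optional<int>(42) : std::nullopt;
}

int main() {
  std::optional<int> q = tryQuotient(false);
  if (!q.has_value()) {
    std::cout << "no quotient; bail out before any use of *q\n";
    return 0;
  }
  std::cout << "quotient = " << *q << "\n"; // safe: checked above
  return 0;
}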

python/src/ir.cc

Lines changed: 2 additions & 8 deletions
@@ -1433,19 +1433,13 @@ void init_triton_ir(py::module &&m) {
            [](TritonOpBuilder &self, int axis) -> Value {
              if (axis < 0 || axis > 3)
                throw pybind11::index_error("program_id must be in [0,3]");
-             return self.create<GetProgramIdOp>(
-                 self.getBuilder().getI32Type(),
-                 ProgramIDDimAttr::get(self.getBuilder().getContext(),
-                                       ProgramIDDim(axis)));
+             return self.create<GetProgramIdOp>(axis);
            })
       .def("create_get_num_programs",
            [](TritonOpBuilder &self, int axis) -> Value {
              if (axis < 0 || axis > 3)
                throw pybind11::index_error("program_id must be in [0,3]");
-             return self.create<GetNumProgramsOp>(
-                 self.getBuilder().getI32Type(),
-                 ProgramIDDimAttr::get(self.getBuilder().getContext(),
-                                       ProgramIDDim(axis)));
+             return self.create<GetNumProgramsOp>(axis);
            })
       .def("create_dot",
            [](TritonOpBuilder &self, mlir::Value &a, mlir::Value &b,

python/test/unit/language/test_core.py

Lines changed: 2 additions & 1 deletion
@@ -3385,7 +3385,8 @@ def test_dot3d(B, num_warps, M, N, K, BLOCK_M, BLOCK_N, in_dtype_str, out_dtype_
     input_precision = "tf32" if is_cuda() and in_dtype_str == 'float32' else "ieee"

     if B == 8 and M == 64 and in_dtype_str == "float32" and out_dtype_str == "float32":
-        if torch.cuda.is_available() and triton.runtime.driver.active.utils.get_device_properties(
+        if not is_interpreter() and torch.cuda.is_available(
+        ) and triton.runtime.driver.active.utils.get_device_properties(
                 torch.cuda.current_device())["max_shared_mem"] < 131072:
             pytest.skip(
                 "Skipping tests with B = 8, M = 64, in_type = float32, out_type = float32 due to insufficient shared memory (less than 128 KB per SM) on this GPU."

python/triton/compiler/code_generator.py

Lines changed: 27 additions & 20 deletions
@@ -9,7 +9,7 @@
 from .. import language
 from .._C.libtriton import ir
 from ..language import constexpr, tensor, str_to_ty
-from ..language.core import _unwrap_if_constexpr, nv_tma_desc_type
+from ..language.core import _unwrap_if_constexpr, nv_tma_desc_type, _value
 from ..runtime.jit import _normalize_ty, get_jit_fn_file_line
 # ideally we wouldn't need any runtime component
 from ..runtime import JITFunction
@@ -47,6 +47,10 @@ def mangle_fn(name, arg_tys, constants):
     return ret


+def _is_triton_value(o: Any) -> bool:
+    return isinstance(o, _value)
+
+
 def _is_triton_tensor(o: Any) -> bool:
     return isinstance(o, tensor)

@@ -501,7 +505,7 @@ def visit_Assign(self, node):
         # by default, constexpr are assigned into python variable
         value = _unwrap_if_constexpr(value)
         if value is not None and \
-           not _is_triton_tensor(value) and \
+           not _is_triton_value(value) and \
            not isinstance(value, native_nontensor_types):
             value = language.semantic.to_tensor(value, self.builder)
         self.set_value(name, value)
@@ -802,6 +806,15 @@ def visit_UnaryOp(self, node):
         ast.USub: '__neg__', ast.UAdd: '__pos__', ast.Not: '__not__', ast.Invert: '__invert__'
     }

+    def _verify_loop_carried_variable(self, name, loop_val, live_val):
+        assert _is_triton_value(loop_val), f'cannot reassign constxpr {name} in the loop'
+        assert _is_triton_value(live_val), f'cannot reasign constexpr {name} in the loop'
+        assert type(loop_val) == type(live_val), f'Loop carried variable {name} changed type'
+        assert not _is_triton_tensor(loop_val) or loop_val.type == live_val.type, \
+            f'Loop-carried variable {name} has initial type {live_val.type} '\
+            f'but is re-assigned to {loop_val.type} in loop! '\
+            f'Please make sure that the type stays consistent.'
+
     def visit_While(self, node):
         with enter_sub_region(self) as sr:
             liveins, insert_block = sr
@@ -824,17 +837,14 @@ def visit_While(self, node):
             for name in loop_defs:
                 if name in liveins:
                     # We should not def new constexpr
-                    assert _is_triton_tensor(loop_defs[name]), f'cannot reassign constxpr {name} in the loop'
-                    assert _is_triton_tensor(liveins[name]), f'cannot reasign constexpr {name} in the loop'
-                    assert loop_defs[name].type == liveins[name].type, \
-                        f'Loop-carried variable {name} has initial type {liveins[name].type} '\
-                        f'but is re-assigned to {loop_defs[name].type} in loop! '\
-                        f'Please make sure that the type stays consistent.'
+                    loop_val = loop_defs[name]
+                    live_val = liveins[name]
+                    self._verify_loop_carried_variable(name, loop_val, live_val)

                     # these are loop-carried values
                     names.append(name)
-                    ret_types.append(loop_defs[name].type)
-                    init_args.append(liveins[name])
+                    ret_types.append(loop_val.type)
+                    init_args.append(live_val)

             self._set_insertion_point_and_loc(ip, last_loc)
             while_op = self.builder.create_while_op([ty.to_ir(self.builder) for ty in ret_types],
@@ -972,16 +982,13 @@ def visit_For(self, node):
             names = []
             for name in self.local_defs:
                 if name in liveins:
-                    assert _is_triton_tensor(self.local_defs[name]), f'cannot reassign constxpr {name} in the loop'
-                    assert _is_triton_tensor(liveins[name]), f'cannot reassign constxpr {name} in the loop'
-                    assert self.local_defs[name].type == liveins[name].type, \
-                        f'Loop-carried variable {name} has initial type {liveins[name].type} '\
-                        f'but is re-assigned to {self.local_defs[name].type} in loop! '\
-                        f'Please make sure that the type stays consistent.'
+                    loop_val = self.local_defs[name]
+                    live_val = liveins[name]
+                    self._verify_loop_carried_variable(name, loop_val, live_val)

                     names.append(name)
-                    init_args.append(language.semantic.to_tensor(liveins[name], self.builder))
-                    yields.append(language.semantic.to_tensor(self.local_defs[name], self.builder))
+                    init_args.append(live_val)
+                    yields.append(loop_val)

             # create ForOp
             self._set_insertion_point_and_loc(ip, last_loc)
@@ -1051,7 +1058,7 @@ def visit_Assert(self, node) -> Any:
     def call_JitFunction(self, fn: JITFunction, args, kwargs):
         args = inspect.getcallargs(fn.fn, *args, **kwargs)
         args = [args[name] for name in fn.arg_names]
-        args = [arg if _is_triton_tensor(arg) else constexpr(arg) for arg in args]
+        args = [arg if _is_triton_value(arg) else constexpr(arg) for arg in args]
         # generate function def
         attributes = {}
         constexprs = [i for i, arg in enumerate(args) if _is_constexpr(arg)]
@@ -1110,7 +1117,7 @@ def visit_Call(self, node):
         if isinstance(fn, JITFunction):
             _check_fn_args(node, fn, args)
             return self.call_JitFunction(fn, args, kws)
-        if (hasattr(fn, '__self__') and _is_triton_tensor(fn.__self__)) or language.core.is_builtin(fn):
+        if (hasattr(fn, '__self__') and _is_triton_value(fn.__self__)) or language.core.is_builtin(fn):
            extra_kwargs = {"_builder": self.builder}
            sig = inspect.signature(fn)
            if '_generator' in sig.parameters:

python/triton/language/core.py

Lines changed: 10 additions & 2 deletions
@@ -701,12 +701,20 @@ def get_int_dtype(bitwidth: int, signed: bool) -> dtype:
     raise ValueError(f'Unsupported bitwidth {bitwidth} and signedness {signed}')


+class _value:
+    """Base class of values that exist in the triton IR (i.e. not constexprs).
+    """
+
+    def __init__(self, handle):
+        self.handle = handle
+
+
 # -----------------------
 # tensor
 # -----------------------


-class tensor:
+class tensor(_value):
     """Represents an N-dimensional array of values or pointers.

     :code:`tensor` is the fundamental data structure in Triton programs. Most
@@ -729,7 +737,7 @@ class tensor:
     def __init__(self, handle, type: dtype):
         """Not called by user code."""
         # IR handle
-        self.handle = handle
+        super().__init__(handle)
         # Block shape
         self.shape = type.shape if type.is_block() else ()
         self.numel = 1
test/TritonGPU/amd/amd-canonicalize-pointers.mlir

Lines changed: 116 additions & 0 deletions
@@ -577,3 +577,119 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war
     tt.return %11 : tensor<1024xf32, #blocked>
   }
 }
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: scalar_pointers
+  tt.func public @scalar_pointers(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %0 = tt.get_program_id x : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %c10_i64 = arith.constant 10 : i64
+    %c100_i32 = arith.constant 100 : i32
+    %5 = tt.addptr %arg0, %c1_i32 : !tt.ptr<i64>, i32
+    // CHECK: arith.constant 0 : i64
+    // CHECK: arith.constant 0 : i64
+    // CHECK: %[[offset0:.*]] = arith.constant 0 : i64
+    // CHECK: %[[ptr0:.*]] = tt.addptr %arg0, %c1_i32 : !tt.ptr<i64>, i32
+    // CHECK: scf.for {{.*}} iter_args({{.*}}, %[[ptr1:.*]] = %[[ptr0]], %[[offset1:.*]] = %[[offset0]])
+    %10:1 = scf.for %arg3 = %c1_i32 to %c100_i32 step %c1_i32 iter_args(%arg4 = %5) -> (!tt.ptr<i64>) : i32 {
+      // CHECK: tt.store %[[ptr1]]
+      tt.store %arg4, %c0_i64 : !tt.ptr<i64>
+      // CHECK: tt.addptr %[[ptr1]]
+      %11 = tt.addptr %arg4, %c1_i32 : !tt.ptr<i64>, i32
+      scf.yield %11 : !tt.ptr<i64>
+    }
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: @scalar_if
+  tt.func @scalar_if(%arg0: !tt.ptr<f32>, %init : tensor<1024xf32, #blocked>, %cond : i1)->f32{
+    %0 = tt.get_program_id x : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %c10_i64 = arith.constant 10 : i64
+    %c100_i32 = arith.constant 100 : i32
+    %5 = tt.addptr %arg0, %c1_i32 : !tt.ptr<f32>, i32
+    // CHECK: %[[ptr0:.*]] = tt.addptr %arg0, %{{.*}}
+    // CHECK: scf.if {{.*}} -> ({{.*}}, !tt.ptr<f32>, i64)
+    %6 = scf.if %cond -> (!tt.ptr<f32>){
+      %true = tt.addptr %5, %c1_i32 : !tt.ptr<f32>, i32
+      // CHECK: %[[ptr1:.*]] = tt.addptr %[[ptr0]]
+      // CHECK: scf.yield {{.*}}, %[[ptr1]]
+      scf.yield %true : !tt.ptr<f32>
+    } else {
+      %false = tt.addptr %5, %c100_i32 : !tt.ptr<f32>, i32
+      // CHECK: %[[ptr2:.*]] = tt.addptr %[[ptr0]]
+      // CHECK: scf.yield {{.*}}, %[[ptr2]]
+      scf.yield %false : !tt.ptr<f32>
+    }
+    %11 = tt.load %6 : !tt.ptr<f32>
+    tt.return %11 : f32
+  }
+}
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: tt.func @scalar_while
+  tt.func @scalar_while(%arg0: !tt.ptr<f32>, %init : f32, %cond : i1)->f32{
+    %c1024_i32 = arith.constant 1024 : i32
+    %c0 = arith.constant 0: index
+    %c128 = arith.constant 128: index
+    %c1 = arith.constant 1 : index
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    // CHECK: %[[ptr0:.*]] = tt.addptr %arg0, %{{.*}}
+    // CHECK: scf.while ({{.*}}, {{.*}} = %arg2, %[[ptr1:.*]] = %[[ptr0]], {{.*}})
+    %2 = tt.addptr %arg0, %0 : !tt.ptr<f32>, i32
+    %6 = scf.while (%arg1 = %2, %arg2 = %cond) : (!tt.ptr<f32>, i1) -> (!tt.ptr<f32>) {
+      // CHECK: scf.condition({{.*}}) {{.*}}, %[[ptr1]]
+      scf.condition(%arg2) %arg1 : !tt.ptr<f32>
+    } do {
+      // CHECK: ^bb0({{.*}}: !tt.ptr<f32>, %[[ptr2:.*]]: !tt.ptr<f32>, {{.*}})
+      // CHECK: scf.yield %{{.*}}, {{.*}} %[[ptr2]], {{.*}}, {{.*}}
+    ^bb0(%arg1: !tt.ptr<f32>):
+      scf.yield %arg1, %cond : !tt.ptr<f32>, i1
+    }
+    %11 = tt.load %6 : !tt.ptr<f32>
+    tt.return %11 : f32
+  }
+}
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: tt.func @scalar_cond_branch
+  tt.func @scalar_cond_branch(%arg0 : !tt.ptr<f32>, %i1 : i1) -> f32{
+    %c1024_i32 = arith.constant 1024 : i32
+    %c0 = arith.constant 0: index
+    %c128 = arith.constant 128: index
+    %c1 = arith.constant 1 : index
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %6 = tt.addptr %arg0, %0 : !tt.ptr<f32>, i32
+    // CHECK: %[[ptr0:.*]] = tt.addptr %arg0
+    // CHECK: cf.cond_br %arg1, ^bb1(%{{.*}}, %[[ptr0]], {{.*}}), ^bb2(%{{.*}}, %arg0, {{.*}})
+    cf.cond_br %i1, ^bb1(%6 : !tt.ptr<f32>), ^bb2(%arg0 : !tt.ptr<f32>)
+    // CHECK: ^bb1({{.*}}, %[[ptr1:.*]]: !tt.ptr<f32>, {{.*}}):
+  ^bb1(%arg1 : !tt.ptr<f32>):
+    // CHECK: tt.load %[[ptr1]]
+    %out1 = tt.load %arg1 : !tt.ptr<f32>
+    tt.return %out1 : f32
+    // CHECK: ^bb2({{.*}}, %[[ptr2:.*]]: !tt.ptr<f32>, {{.*}}):
+  ^bb2(%arg2 : !tt.ptr<f32>): // 2 preds: ^bb0, ^bb1
+    // CHECK: tt.load %[[ptr2]]
+    %out2 = tt.load %arg2 : !tt.ptr<f32>
+    tt.return %out2 : f32
+  }
+}
