Commit e4d0843

Merge commit '35f1827581071a5ac3a385f8776ab1a3a784811a'
2 parents: a5393bf + 35f1827

6 files changed: 135 additions, 20 deletions

include/triton/Dialect/Triton/IR/TritonOps.td
Lines changed: 8 additions & 1 deletion

@@ -778,7 +778,8 @@ def TT_ScanReturnOp: TT_Op<"scan.return",
 def TT_ExternElementwiseOp : TT_Op<"extern_elementwise", [Elementwise,
                                                           SameOperandsAndResultEncoding,
                                                           SameVariadicOperandSize,
-                                                          DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
+                                                          DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+                                                          ConditionallySpeculatable]> {

     let description = [{
         call an external function $symbol implemented in $libpath/$libname with $args
@@ -790,6 +791,12 @@ def TT_ExternElementwiseOp : TT_Op<"extern_elementwise", [Elementwise,
     let results = (outs TT_Type:$result);

     let assemblyFormat = "operands attr-dict `:` functional-type(operands, $result)";
+
+    let extraClassDeclaration = [{
+      // Interface method for ConditionallySpeculatable.
+      Speculation::Speculatability getSpeculatability();
+    }];
+
 }

 //

lib/Analysis/Utility.cpp
Lines changed: 19 additions & 12 deletions

@@ -71,18 +71,25 @@ unsigned ReduceOpHelper::getThreadOffsetOnReductionAxis() {
   }

   unsigned threadOffset = 1;
-  if (auto sliceLayout = mlir::dyn_cast<SliceEncodingAttr>(srcLayout)) {
-    auto parentLayout = sliceLayout.getParent();
-    auto threadsPerWarp = getThreadsPerWarp(parentLayout);
-    threadOffset = threadsPerWarp[sliceLayout.getDim()];
-  } else {
-    auto threadsPerWarp = getThreadsPerWarp(srcLayout);
-    auto order = getThreadOrder(srcLayout);
-    for (unsigned i = 0; i < order.size(); i++) {
-      if (order[i] == axis)
-        break;
-      threadOffset *= threadsPerWarp[order[i]];
-    }
+  SmallVector<int> dimsRemoved;
+  while (auto sliceLayout = mlir::dyn_cast<SliceEncodingAttr>(srcLayout)) {
+    dimsRemoved.push_back(sliceLayout.getDim());
+    srcLayout = sliceLayout.getParent();
+  }
+  // In case of slice layout we want to know the axis dimension relative to the
+  // most inner parent layout. `adjustedAxis` is the matching axis dim in the
+  // parent layout.
+  int adjustedAxis = axis;
+  for (auto dim : dimsRemoved) {
+    if (dim <= adjustedAxis)
+      adjustedAxis++;
+  }
+  auto threadsPerWarp = getThreadsPerWarp(srcLayout);
+  auto order = getThreadOrder(srcLayout);
+  for (unsigned i = 0; i < order.size(); i++) {
+    if (order[i] == adjustedAxis)
+      break;
+    threadOffset *= threadsPerWarp[order[i]];
   }
   return threadOffset;
 }
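
The remapping above is easiest to see with concrete numbers. Below is a minimal, self-contained C++ sketch (not Triton code) that mirrors the same logic: dimsRemoved stands in for the dims peeled off nested SliceEncodingAttr layouts, and the threadsPerWarp/order values are made-up, not results of the real TritonGPU layout helpers.

// Minimal sketch, assuming a made-up layout; mirrors the adjusted-axis logic
// of the patched getThreadOffsetOnReductionAxis, not the real function.
#include <cassert>
#include <vector>

// Map an axis of the sliced (child) layout to the matching dimension of the
// innermost parent layout: each removed dim at or below the axis shifts it up.
int adjustAxisToParent(int axis, const std::vector<int> &dimsRemoved) {
  int adjustedAxis = axis;
  for (int dim : dimsRemoved)
    if (dim <= adjustedAxis)
      ++adjustedAxis;
  return adjustedAxis;
}

// Multiply threadsPerWarp over every dimension that precedes the (adjusted)
// reduction axis in the thread order.
unsigned threadOffsetOnAxis(int axis, const std::vector<int> &dimsRemoved,
                            const std::vector<unsigned> &threadsPerWarp,
                            const std::vector<unsigned> &order) {
  int adjustedAxis = adjustAxisToParent(axis, dimsRemoved);
  unsigned threadOffset = 1;
  for (unsigned d : order) {
    if (static_cast<int>(d) == adjustedAxis)
      break;
    threadOffset *= threadsPerWarp[d];
  }
  return threadOffset;
}

int main() {
  // Hypothetical 3-D parent layout: threadsPerWarp = [4, 8, 1], thread order
  // [2, 1, 0]; dim 1 was sliced away and the reduction runs over axis 0 of the
  // remaining 2-D view, which is still dim 0 of the parent.
  unsigned off = threadOffsetOnAxis(/*axis=*/0, /*dimsRemoved=*/{1},
                                    /*threadsPerWarp=*/{4, 8, 1},
                                    /*order=*/{2, 1, 0});
  assert(off == 8u); // dims 2 and 1 come before dim 0 in the order: 1 * 8
  return 0;
}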

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp
Lines changed: 51 additions & 5 deletions

@@ -218,6 +218,46 @@ struct ReduceOpConversion
     rewriter.replaceOp(op, results);
   }

+  // For slice layout some ids are duplicated on multiple lanes, so we need to
+  // handle the delinearization of laneId in a special way. We need to
+  // generalize this part of the logic to work on any kind of linear layout
+  // uniformely.
+  SmallVector<Value>
+  getMultiDimLaneId(ReduceOpHelper &helper, Value &laneId, Location &loc,
+                    ConversionPatternRewriter &rewriter) const {
+    auto srcLayout = helper.getSrcLayout();
+    auto srcShape = helper.getSrcShape();
+    auto order = triton::gpu::getThreadOrder(srcLayout);
+    SmallVector<Value> multiDimLaneId;
+
+    if (auto sliceLayout = mlir::dyn_cast<SliceEncodingAttr>(srcLayout)) {
+      auto parentLayout = sliceLayout.getParent();
+      SmallVector<unsigned> dims = {sliceLayout.getDim()};
+      while (auto parentSliceLayout =
+                 mlir::dyn_cast<SliceEncodingAttr>(parentLayout)) {
+        dims.push_back(parentSliceLayout.getDim());
+        parentLayout = parentSliceLayout.getParent();
+      }
+
+      auto parentThreadsPerWarps = triton::gpu::getThreadsPerWarp(parentLayout);
+      auto parentOrder = triton::gpu::getThreadOrder(parentLayout);
+      multiDimLaneId = delinearize(rewriter, loc, laneId, parentThreadsPerWarps,
+                                   parentOrder);
+      for (unsigned dim : llvm::reverse(dims)) {
+        multiDimLaneId.erase(multiDimLaneId.begin() + dim);
+      }
+    } else {
+      SmallVector<unsigned> threadsPerWarps =
+          triton::gpu::getThreadsPerWarp(srcLayout);
+      threadsPerWarps[helper.getAxis()] =
+          triton::gpu::getThreadsPerWarpWithUniqueData(
+              srcLayout, srcShape)[helper.getAxis()];
+      multiDimLaneId =
+          delinearize(rewriter, loc, laneId, threadsPerWarps, order);
+    }
+    return multiDimLaneId;
+  }
+
   SmallVector<Value>
   getMultiDimWarpId(ReduceOpHelper &helper, Value &warpId, Location &loc,
                     ConversionPatternRewriter &rewriter) const {
@@ -231,11 +271,20 @@ struct ReduceOpConversion
     // a way to properly delinearize warpId in the slice case
     if (auto sliceLayout = mlir::dyn_cast<SliceEncodingAttr>(srcLayout)) {
       auto parentLayout = sliceLayout.getParent();
+      SmallVector<unsigned> dims = {sliceLayout.getDim()};
+      while (auto parentSliceLayout =
+                 mlir::dyn_cast<SliceEncodingAttr>(parentLayout)) {
+        dims.push_back(parentSliceLayout.getDim());
+        parentLayout = parentSliceLayout.getParent();
+      }
+
       auto parentWarpsPerCTA = triton::gpu::getWarpsPerCTA(parentLayout);
       auto parentOrder = triton::gpu::getWarpOrder(parentLayout);
       multiDimWarpId =
           delinearize(rewriter, loc, warpId, parentWarpsPerCTA, parentOrder);
-      multiDimWarpId.erase(multiDimWarpId.begin() + sliceLayout.getDim());
+      for (unsigned dim : llvm::reverse(dims)) {
+        multiDimWarpId.erase(multiDimWarpId.begin() + dim);
+      }
     } else {
       SmallVector<unsigned> warpsPerCTA =
           triton::gpu::getWarpsPerCTA(srcLayout);
@@ -263,11 +312,8 @@ struct ReduceOpConversion
     unsigned axis = op.getAxis();
     auto smemShape = helper.getScratchRepShape();

-    auto threadsPerWarp =
-        triton::gpu::getThreadsPerWarpWithUniqueData(srcLayout, srcShape);
-    auto order = getThreadOrder(srcLayout);
     SmallVector<Value> multiDimLaneId =
-        delinearize(rewriter, loc, laneId, threadsPerWarp, order);
+        getMultiDimLaneId(helper, laneId, loc, rewriter);
     Value laneIdAxis = multiDimLaneId[axis];
     Value zero = i32_val(0);
     Value laneZero = icmp_eq(laneIdAxis, zero);
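
As a companion to the new getMultiDimLaneId helper, here is a minimal C++ sketch of the delinearize-then-erase step, using plain integers where the patch uses MLIR Values; delinearizeLane and the layout numbers below are hypothetical stand-ins for delinearize() and getThreadsPerWarp()/getThreadOrder().

// Minimal sketch, assuming a made-up parent layout; not Triton code.
#include <algorithm>
#include <cstdio>
#include <vector>

// Split a linear lane id into one coordinate per dimension, consuming the
// fastest-varying dimension (order[0]) first.
std::vector<int> delinearizeLane(int laneId, const std::vector<int> &shape,
                                 const std::vector<int> &order) {
  std::vector<int> multiDim(shape.size(), 0);
  for (int d : order) {
    multiDim[d] = laneId % shape[d];
    laneId /= shape[d];
  }
  return multiDim;
}

int main() {
  // Hypothetical parent layout: threadsPerWarp = [1, 8, 4], order = [2, 1, 0],
  // with dim 0 sliced away.
  std::vector<int> threadsPerWarp = {1, 8, 4};
  std::vector<int> order = {2, 1, 0};
  std::vector<int> slicedDims = {0};

  int laneId = 13;
  std::vector<int> multiDimLaneId =
      delinearizeLane(laneId, threadsPerWarp, order);

  // Drop the coordinates of the sliced dims, highest index first so earlier
  // erases do not shift the later ones (the patch iterates the collected dims
  // with llvm::reverse before erasing).
  std::sort(slicedDims.rbegin(), slicedDims.rend());
  for (int dim : slicedDims)
    multiDimLaneId.erase(multiDimLaneId.begin() + dim);

  // Lane 13 maps to coordinates [3, 1] in the remaining two dimensions.
  for (int c : multiDimLaneId)
    std::printf("%d ", c);
  std::printf("\n");
  return 0;
}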

lib/Dialect/Triton/IR/Ops.cpp
Lines changed: 6 additions & 0 deletions

@@ -1039,6 +1039,12 @@ void ExternElementwiseOp::getEffects(
                        SideEffects::DefaultResource::get());
 }

+Speculation::Speculatability ExternElementwiseOp::getSpeculatability() {
+  if (getPure())
+    return Speculation::Speculatable;
+  return Speculation::NotSpeculatable;
+}
+
 // -- ExperimentalTensormapCreateOp --
 LogicalResult ExperimentalTensormapCreateOp::verify() {
   auto rank = getBoxDim().size();
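
For context on what this hook changes downstream, the following is an illustrative analogue only (plain C++, hypothetical names, not the MLIR or Triton API): a transform asks the op whether it is safe to execute speculatively, and only a pure call answers yes, which is what lets the lowering in the MLIR test below avoid emitting a guarding llvm.cond_br.

// Illustrative analogue, assuming hypothetical ExternCall/canEmitWithoutGuard.
#include <cassert>

enum class Speculatability { Speculatable, NotSpeculatable };

struct ExternCall {
  bool pure; // stands in for the op's `pure` attribute

  // Same decision as the patch: only pure calls may run speculatively.
  Speculatability getSpeculatability() const {
    return pure ? Speculatability::Speculatable
                : Speculatability::NotSpeculatable;
  }
};

// A lowering can skip the guarding branch around a region only when the op it
// wraps is safe to execute unconditionally (reduced to a single call here).
bool canEmitWithoutGuard(const ExternCall &call) {
  return call.getSpeculatability() == Speculatability::Speculatable;
}

int main() {
  assert(canEmitWithoutGuard(ExternCall{/*pure=*/true}));
  assert(!canEmitWithoutGuard(ExternCall{/*pure=*/false}));
  return 0;
}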

python/test/unit/language/test_core.py
Lines changed: 33 additions & 2 deletions

@@ -31,6 +31,7 @@
     is_hip,
     is_hip_cdna,
     is_hip_mi200,
+    is_hip_mi300,
     is_xpu,
     get_arch,
     torch_float8_dtypes,
@@ -3414,8 +3415,8 @@ def test_scaled_dot(M, N, K, col_a, col_b, type_a, type_b, num_warps, mma, kpack
     if is_hip():
         if not is_hip_cdna():
             pytest.skip("scaled_dot only implemented for HIP CDNA")
-        if (type_a not in ["e2m1", "e5m2"]) or (type_b not in ["e2m1", "e5m2", "bf16"]):
-            pytest.skip(f"scaled_dot({type_a}, {type_b}) not yet implemented for HIP")
+        if "e4m3" in (type_a, type_b) and not is_hip_mi300():
+            pytest.skip(f"scaled_dot({type_a}, {type_b}) only implemented for MI300")
     if mma == 16 and K == 64:
         pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot")
     if is_xpu():
@@ -6072,3 +6073,33 @@ def sanitize_cumsum_kernel(Z, X, BLOCK: tl.constexpr):
     Z = torch.zeros_like(X)
     sanitize_cumsum_kernel[(1, )](Z, X, BLOCK=BLOCK)
     torch.testing.assert_close(Z, X.cumsum(0).to(torch.int32))
+
+
+# stress test slice layout usages in reductions.
+@pytest.mark.parametrize("in_shape, perm, red_dims", [
+    ((4, 32, 32, 4, 2), [2, 1, 0, 3, 4], [3, 1, 0]),
+    ((8, 2, 32, 4, 16), [4, 0, 1, 3, 2], [0, 2, 0]),
+])
+def test_chained_reductions(in_shape, perm, red_dims, device):
+
+    @triton.jit
+    def kernel(In, Out,  #
+               dim_0: tl.constexpr, dim_1: tl.constexpr, dim_2: tl.constexpr, dim_3: tl.constexpr, dim_4: tl.constexpr,
+               perm_0: tl.constexpr, perm_1: tl.constexpr, perm_2: tl.constexpr, perm_3: tl.constexpr,
+               perm_4: tl.constexpr, red_dim_0: tl.constexpr, red_dim_1: tl.constexpr, red_dim_2: tl.constexpr):
+        idx = tl.arange(0, dim_0 * dim_1 * dim_2 * dim_3 * dim_4)
+        idx = idx.reshape(dim_0, dim_1, dim_2, dim_3, dim_4)
+        vals = tl.load(In + idx)
+        vals = tl.permute(vals, [perm_0, perm_1, perm_2, perm_3, perm_4])
+        r = tl.sum(tl.sum(tl.sum(vals, red_dim_0), red_dim_1), red_dim_2)
+        st_idx = tl.arange(0, r.shape[0] * r.shape[1]).reshape(r.shape)
+        tl.store(Out + st_idx, r)
+
+    input = torch.randint(0, 1000, in_shape, device=device, dtype=torch.int32)
+    temp = torch.permute(input, perm).contiguous()
+    ref = torch.sum(torch.sum(torch.sum(temp, dim=red_dims[0]), dim=red_dims[1]), dim=red_dims[2])
+    result = torch.empty_like(ref)
+    kernel[(1, )](input, result, input.shape[0], input.shape[1], input.shape[2], input.shape[3], input.shape[4],
+                  perm[0], perm[1], perm[2], perm[3], perm[4], red_dims[0], red_dims[1], red_dims[2])
+
+    assert torch.all(ref == result)

test/Conversion/tritongpu_to_llvm.mlir
Lines changed: 18 additions & 0 deletions

@@ -1855,3 +1855,21 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 #loc3 = loc("inner_call":29:28)
 #loc4 = loc(callsite(#loc3 at #loc1))
 #loc5 = loc(callsite(#loc4 at #loc2))
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @log1pf_scan(%39: tensor<32x16xf32, #blocked>) attributes {noinline = false} {
+    // CHECK: log1pf_scan
+    // non-speculatable ops will introduce a cond_br; extern_elementwise with pure = true should be considered speculatable.
+    // CHECK-NOT: llvm.cond_br
+    %40 = "tt.scan"(%39) <{axis = 1 : i32, reverse = false}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %43 = tt.extern_elementwise %arg5 {libname = "", libpath = "", pure = true, symbol = "__nv_log1pf"} : (f32) -> f32
+      %44 = arith.addf %43, %43 : f32
+      tt.scan.return %44 : f32
+    }) : (tensor<32x16xf32, #blocked>) -> tensor<32x16xf32, #blocked>
+    tt.return
+  }
+}
