
Commit be627a3

Merge commit 'd997364bd617ba91911ecd73070f57f291611203'
2 parents: 3b58b25 + d997364

13 files changed: +335 additions, −148 deletions


.github/workflows/integration-tests.yml

Lines changed: 1 addition & 2 deletions
@@ -10,8 +10,7 @@ name: Integration Tests
 on:
   workflow_dispatch:
   pull_request:
-    # You can name your branch dev-foo to get CI runs.
-    branches: [main, 'dev-**']
+    branches-ignore: ['llvm-**']
   merge_group:
     branches: [main, 'dev-**']
     types: [checks_requested]

.github/workflows/integration-tests.yml.in

Lines changed: 1 addition & 2 deletions
@@ -9,8 +9,7 @@ name: Integration Tests
 on:
   workflow_dispatch:
   pull_request:
-    # You can name your branch dev-foo to get CI runs.
-    branches: [main, 'dev-**']
+    branches-ignore: ['llvm-**']
   merge_group:
     branches: [main, 'dev-**']
     types: [checks_requested]

include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h

Lines changed: 5 additions & 4 deletions
@@ -88,10 +88,11 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
       // encoding not available
       return resultVals;
     Attribute baseEncoding = encoding;
-    if (isa<AMDMfmaEncodingAttr>(baseEncoding))
-      // TODO: this logic seems incorrect for mfma layout. Skip for now.
-      // We saw mismatches for some flash-attention tests on AMD backend.
-      // Note that this logic works for sliced layout whose parent is
+    if (isa<AMDMfmaEncodingAttr>(baseEncoding) ||
+        isa<AMDWmmaEncodingAttr>(baseEncoding))
+      // TODO: this logic seems incorrect for mfma and wmma layout. Skip for
+      // now. We saw mismatches for some flash-attention and dot tests on AMD
+      // backend. Note that this logic works for sliced layout whose parent is
       // mfma layout. Therefore, this is not combined with the following check.
       return resultVals;
     while (auto sliced = dyn_cast<SliceEncodingAttr>(baseEncoding))

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 7 additions & 0 deletions
@@ -250,6 +250,13 @@ chooseStMatrixLayout(MLIRContext *ctx, RankedTensorType tensorTy,
                      ArrayRef<unsigned> repShape,
                      ArrayRef<unsigned> paddedRepShape,
                      ArrayRef<unsigned> order, int swizzleByteSize);
+
+// FIXME
+// Exposing to use it in LinearLayoutConversionsTest.cpp
+// Remove it once we fully activate the DotOperand conversion via LLs
+class DotOperandEncodingAttr;
+LinearLayout ampereDotToLinearLayout(ArrayRef<int64_t> shape,
+                                     DotOperandEncodingAttr dot);
 } // namespace mlir::triton::gpu

 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 6 additions & 10 deletions
@@ -1044,16 +1044,12 @@ SmallVector<unsigned> DotOperandEncodingAttr::getCTASplitNum() const {
   return res;
 }
 SmallVector<unsigned> DotOperandEncodingAttr::getWarpsPerCTA() const {
-  auto parentLayout = getParent();
-  assert(parentLayout && "DotOperandEncodingAttr must have a parent");
-  if (auto distributedLayout =
-          mlir::dyn_cast<DistributedEncodingTrait>(parentLayout)) {
-    return distributedLayout.getWarpsPerCTA();
-  } else {
-    llvm::report_fatal_error(
-        "DotOperandEncodingAttr non-DistributedEncodingAttr parent not "
-        "supported yet");
-  }
+  auto distributedLayout = mlir::cast<DistributedEncodingTrait>(getParent());
+  auto warps = distributedLayout.getWarpsPerCTA();
+  auto rank = warps.size();
+  auto kDim = getOpIdx() == 0 ? rank - 1 : rank - 2;
+  warps[kDim] = 1;
+  return warps;
 }
 SmallVector<unsigned> DotOperandEncodingAttr::getWarpOrder() const {
   return ::getWarpOrder(*this);
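
The rewritten getWarpsPerCTA takes the parent layout's warps-per-CTA and collapses the k dimension of the dot operand to 1, since warps are not split along k. A minimal Python sketch of that index arithmetic (hypothetical helper, not Triton's API):

def dot_operand_warps_per_cta(parent_warps, op_idx):
    """parent_warps: warps-per-CTA of the parent (e.g. MMA) layout."""
    warps = list(parent_warps)
    rank = len(warps)
    # operand A (op_idx == 0) has k as its last dim, operand B as the second-to-last
    k_dim = rank - 1 if op_idx == 0 else rank - 2
    warps[k_dim] = 1
    return warps

assert dot_operand_warps_per_cta([4, 2], op_idx=0) == [4, 1]
assert dot_operand_warps_per_cta([4, 2], op_idx=1) == [1, 2]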

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 69 additions & 2 deletions
@@ -5,6 +5,7 @@
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
+#include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
 #include "llvm/ADT/DenseMap.h"
@@ -822,16 +823,82 @@ SliceEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   return ret;
 }
 
+LinearLayout ampereDotToLinearLayout(ArrayRef<int64_t> shape,
+                                     DotOperandEncodingAttr dot) {
+  // TODO,BE. Implement ampereMMA in terms of this one
+  int rank = shape.size();
+  auto mma = cast<NvidiaMmaEncodingAttr>(dot.getParent());
+  int kWidth = dot.getKWidth();
+  bool isA = dot.getOpIdx() == 0;
+
+  assert(mma.isAmpere());
+  assert((rank == 2 && mma.getInstrShape() == ArrayRef<unsigned>({16, 8})) ||
+         (rank == 3 && mma.getInstrShape() == ArrayRef<unsigned>({1, 16, 8})));
+
+  MLIRContext *ctx = mma.getContext();
+  SmallVector<StringAttr> dimNames = standardOutDimNames(ctx, rank);
+
+  // Implement A. For B transpose in the end
+  std::vector<std::vector<int32_t>> registers;
+  std::vector<std::vector<int32_t>> lanes;
+  int32_t i = 1;
+  // kWidth contiguous elements
+  while (i < kWidth) {
+    registers.push_back({i, 0});
+    i *= 2;
+  }
+  // 4 threads per chunk
+  for (int j = 0; j < 2; j++) {
+    lanes.push_back({i, 0});
+    i *= 2;
+  }
+  // 8 threads going down
+  lanes.push_back({0, 1});
+  lanes.push_back({0, 2});
+  lanes.push_back({0, 4});
+  // 2 tiles in column-major order
+  // Just one if it's the B operand
+  if (isA) {
+    registers.push_back({0, 8});
+  }
+  registers.push_back({i, 0});
+
+  if (!isA) {
+    for (auto &r : registers) {
+      std::swap(r[0], r[1]);
+    }
+    for (auto &l : lanes) {
+      std::swap(l[0], l[1]);
+    }
+  }
+
+  LinearLayout ctaLayout(
+      {{S("register"), registers}, {S("lane"), lanes}},
+      llvm::to_vector(llvm::reverse(ArrayRef(dimNames).take_back(2))));
+
+  auto order = dot.getCTAOrder();
+  assert(order[0] == 1 && order[1] == 0);
+  ctaLayout *= identityND(S("warp"), dot.getWarpsPerCTA(), order, dimNames);
+
+  return combineCtaCgaWithShape(ctaLayout, mma.getCTALayout(), shape);
+}
+
 std::optional<LinearLayout>
 DotOperandEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
-
   if (auto mfmaLayout = llvm::dyn_cast<AMDMfmaEncodingAttr>(getParent())) {
     return dotOperandMfmaToLinearLayout(*this, shape);
   }
+
+  // TODO Activate in a follow-up PR
+  // else if (auto mma = mlir::dyn_cast<NvidiaMmaEncodingAttr>(getParent())) {
+  //   if (mma.isAmpere()) {
+  //     return ampereDotToLinearLayout(shape, *this);
+  //   }
+  //}
+
   if (auto dpasLayout = llvm::dyn_cast<intel::DpasEncodingAttr>(getParent())) {
     return dotOperandDpasToLinearLayout(*this, shape);
   }
-
   return std::nullopt;
 }
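
ampereDotToLinearLayout builds the register/lane basis vectors for the A operand and derives B by swapping the two output dimensions at the end. A small Python mirror of that basis construction, purely to illustrate which bases come out (hypothetical helper; the real code returns a LinearLayout, not raw lists):

def ampere_dot_bases(k_width, is_a=True):
    registers, lanes = [], []
    i = 1
    while i < k_width:                   # kWidth contiguous elements per thread
        registers.append([i, 0])
        i *= 2
    for _ in range(2):                   # 4 threads per chunk along k
        lanes.append([i, 0])
        i *= 2
    lanes += [[0, 1], [0, 2], [0, 4]]    # 8 threads going down the other dimension
    if is_a:                             # second 16-row tile exists only for A
        registers.append([0, 8])
    registers.append([i, 0])             # second tile along k
    if not is_a:                         # B is the transposed picture
        registers = [[b, a] for a, b in registers]
        lanes = [[b, a] for a, b in lanes]
    return registers, lanes

# kWidth = 2 (e.g. an fp16 A operand of an m16n8k16 MMA tile):
print(ampere_dot_bases(2, is_a=True))
# -> ([[1, 0], [0, 8], [8, 0]], [[2, 0], [4, 0], [0, 1], [0, 2], [0, 4]])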

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipelineExpander.cpp

Lines changed: 35 additions & 35 deletions
@@ -285,19 +285,6 @@ LogicalResult LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) {
   Location loc = forOp.getLoc();
   SmallVector<Value> predicates(maxStage);
   for (int64_t i = 0; i < maxStage; i++) {
-    if (dynamicLoop) {
-      Type t = ub.getType();
-      // pred = ub > lb + (i * step)
-      Value iv = rewriter.create<arith::AddIOp>(
-          loc, lb,
-          rewriter.create<arith::MulIOp>(
-              loc, step,
-              rewriter.create<arith::ConstantOp>(
-                  loc, rewriter.getIntegerAttr(t, i))));
-      predicates[i] = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::slt, iv, ub);
-    }
-
     // special handling for induction variable as the increment is implicit.
     // iv = lb + i * step
     Type t = lb.getType();
@@ -308,6 +295,13 @@ LogicalResult LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) {
             rewriter.create<arith::ConstantOp>(loc,
                                                rewriter.getIntegerAttr(t, i))));
     setValueMapping(forOp.getInductionVar(), iv, i);
+
+    if (dynamicLoop) {
+      // pred = ub > lb + (i * step)
+      predicates[i] = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::slt, iv, ub);
+    }
+
     for (Operation *op : opOrder) {
       if (stages[op] > i)
         continue;
@@ -655,50 +649,56 @@ LogicalResult
 LoopPipelinerInternal::emitEpilogue(RewriterBase &rewriter,
                                     llvm::SmallVector<Value> &returnValues) {
   Location loc = forOp.getLoc();
+  Type t = lb.getType();
   // Emit different versions of the induction variable. They will be
   // removed by dead code if not used.
 
-  // range_diff = ub - lb
-  // total_iterations = (range_diff + step + (step < 0 ? 1 : -1)) / step
-  Type t = lb.getType();
-  Value zero =
-      rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, 0));
-  Value one =
-      rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, 1));
-  Value minusOne =
-      rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, -1));
+  auto createConst = [&](int v) {
+    return rewriter.create<arith::ConstantOp>(loc,
+                                              rewriter.getIntegerAttr(t, v));
+  };
+
+  // total_iterations = cdiv(range_diff, step);
+  // - range_diff = ub - lb
+  // - total_iterations = (range_diff + step + (step < 0 ? 1 : -1)) / step
+  Value zero = createConst(0);
+  Value one = createConst(1);
   Value stepLessZero = rewriter.create<arith::CmpIOp>(
       loc, arith::CmpIPredicate::slt, step, zero);
   Value stepDecr =
-      rewriter.create<arith::SelectOp>(loc, stepLessZero, one, minusOne);
+      rewriter.create<arith::SelectOp>(loc, stepLessZero, one, createConst(-1));
 
   Value rangeDiff = rewriter.create<arith::SubIOp>(loc, ub, lb);
   Value rangeIncrStep = rewriter.create<arith::AddIOp>(loc, rangeDiff, step);
   Value rangeDecr =
       rewriter.create<arith::AddIOp>(loc, rangeIncrStep, stepDecr);
   Value totalIterations = rewriter.create<arith::DivSIOp>(loc, rangeDecr, step);
 
+  // If total_iters < max_stage, start the epilogue at zero to match the
+  // ramp-up in the prologue.
+  // start_iter = max(0, total_iters - max_stage)
+  Value iterI = rewriter.create<arith::SubIOp>(loc, totalIterations,
                                                createConst(maxStage));
+  iterI = rewriter.create<arith::MaxSIOp>(loc, zero, iterI);
+
   // Capture predicates for dynamic loops.
   SmallVector<Value> predicates(maxStage + 1);
 
-  for (int64_t i = 0; i < maxStage; i++) {
-    // iterI = total_iters - 1 - i
-    // May go negative...
-    Value minusI =
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, -i));
-    Value iterI = rewriter.create<arith::AddIOp>(
-        loc, rewriter.create<arith::AddIOp>(loc, totalIterations, minusOne),
-        minusI);
+  for (int64_t i = 1; i <= maxStage; i++) {
     // newLastIter = lb + step * iterI
     Value newlastIter = rewriter.create<arith::AddIOp>(
        loc, lb, rewriter.create<arith::MulIOp>(loc, step, iterI));
 
-    setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
+    setValueMapping(forOp.getInductionVar(), newlastIter, i);
+
+    // increment to next iterI
+    iterI = rewriter.create<arith::AddIOp>(loc, iterI, one);
 
     if (dynamicLoop) {
-      // pred = iterI >= 0
-      predicates[i + 1] = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::sge, iterI, zero);
+      // Disable stages when `i` is greater than total_iters.
+      // pred = total_iters >= i
+      predicates[i] = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::sge, totalIterations, createConst(i));
     }
   }
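
The rewritten epilogue computes total_iterations = cdiv(ub - lb, step), starts draining at max(0, total_iterations - maxStage), and guards stage i with pred = total_iterations >= i, so loops shorter than the pipeline depth no longer replay stages that never ran in the prologue. A rough Python sketch of that bookkeeping (not the MLIR builder code; a positive step and ub >= lb are assumed so Python's // matches signed truncation):

def epilogue_schedule(lb, ub, step, max_stage):
    """Return (iv, stage, enabled) triples the epilogue would emit."""
    total_iterations = (ub - lb + step - 1) // step   # cdiv(range_diff, step)
    # If total_iters < max_stage, start at zero to match the prologue ramp-up.
    iter_i = max(0, total_iterations - max_stage)
    out = []
    for i in range(1, max_stage + 1):
        new_last_iter = lb + step * iter_i            # induction value for stage i
        enabled = total_iterations >= i               # pred = total_iters >= i
        out.append((new_last_iter, i, enabled))
        iter_i += 1
    return out

# Two epilogue stages of a 3-iteration loop: both stages run.
print(epilogue_schedule(lb=0, ub=6, step=2, max_stage=2))   # [(2, 1, True), (4, 2, True)]
# A 1-iteration loop with 2 pipeline stages: the second epilogue stage is disabled.
print(epilogue_schedule(lb=0, ub=2, step=2, max_stage=2))   # [(0, 1, True), (2, 2, False)]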

python/test/unit/language/test_pipeliner.py

Lines changed: 29 additions & 0 deletions
@@ -180,3 +180,32 @@ def test_pipeline_vecadd(device):
     assert ttgir.count(f"num = {NUM_STAGES} : i32") != 0, "num_stages not match"
     # 3. check alloc
     assert ttgir.count("triton_gpu.local_alloc") == 2, "alloc number not match"
+
+
+@pytest.mark.parametrize("ROW_COUNT", [0, 1, 2, 3])
+@pytest.mark.parametrize("NUM_STAGES", [1, 2, 3, 4, 5])
+def test_pipeline_epilogue(ROW_COUNT, NUM_STAGES, device):
+
+    @triton.jit
+    def kernel_up(output_ptr, input_ptr, input_row_stride, output_row_stride, n_rows, n_cols, BLOCK_SIZE: tl.constexpr,
+                  NUM_STAGES: tl.constexpr):
+        row_step = tl.num_programs(0)
+        col_offsets = tl.arange(0, BLOCK_SIZE)
+        mask = col_offsets < n_cols
+        for row_idx in tl.range(0, n_rows, row_step, num_stages=NUM_STAGES):
+            row_start_ptr = input_ptr + row_idx * input_row_stride
+            input_ptrs = row_start_ptr + col_offsets
+            val = tl.load(input_ptrs, mask=mask, other=-float('inf'))
+            val += 1.0
+            output_row_start_ptr = output_ptr + row_idx * output_row_stride
+            output_ptrs = output_row_start_ptr + col_offsets
+            tl.store(output_ptrs, val, mask=mask)
+
+    width = ROW_COUNT
+    depth = 78
+    x = torch.zeros(width, depth, device='cuda')
+    y0 = torch.rand_like(x)
+    n_rows, n_cols = x.shape
+    BLOCK_SIZE = triton.next_power_of_2(n_cols)
+    kernel_up[(1, )](y0, x, x.stride(0), y0.stride(0), n_rows, n_cols, BLOCK_SIZE, NUM_STAGES)
+    assert (y0 == torch.ones_like(x)).all()

python/triton/language/_utils.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+from typing import List
+
+TRITON_MAX_TENSOR_NUMEL = 1048576
+
+
+def is_power_of_two(x):
+    return (x & (x - 1)) == 0
+
+
+def validate_block_shape(shape: List[int]):
+    numel = 1
+    for i, d in enumerate(shape):
+        if not isinstance(d, int):
+            raise TypeError(f"Shape element {i} must have type `constexpr[int]`, got `constexpr[{type(d)}]")
+        if not is_power_of_two(d):
+            raise ValueError(f"Shape element {i} must be a power of 2")
+        numel *= d
+
+    if numel > TRITON_MAX_TENSOR_NUMEL:
+        raise ValueError(f"numel ({numel}) exceeds triton maximum tensor numel ({TRITON_MAX_TENSOR_NUMEL})")
+    return numel
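
The new helper centralizes the block-shape checks that core.py previously did inline. A quick usage sketch (values are arbitrary; the import path follows the new file's location):

from triton.language._utils import validate_block_shape

assert validate_block_shape([64, 128]) == 64 * 128   # returns numel when the shape is valid
try:
    validate_block_shape([64, 96])                    # 96 is not a power of 2
except ValueError as e:
    print(e)                                          # "Shape element 1 must be a power of 2"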

python/triton/language/core.py

Lines changed: 12 additions & 23 deletions
@@ -13,11 +13,10 @@
 
 from .._C.libtriton import ir
 from . import semantic
+from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape
 
 T = TypeVar('T')
 
-TRITON_MAX_TENSOR_NUMEL = 1048576
-
 TRITON_BUILTIN = "__triton_builtin__"
 
 PropagateNan = ir.PROPAGATE_NAN
@@ -612,18 +611,11 @@ def __init__(self, element_ty: dtype, shape: List):
         # while tensor's shape is a list of constexpr.
 
         # shape can be empty ([]) when an input is a 0D tensor.
-        if not shape:
+        self.shape = _unwrap_shape(shape)
+        if not self.shape:
             raise TypeError('0d block_type is forbidden')
-        if isinstance(shape[0], constexpr):
-            shape = [s.value for s in shape]
-
-        self.shape = shape
-        self.numel = 1
-        for s in self.shape:
-            self.numel *= s
-        if self.numel > TRITON_MAX_TENSOR_NUMEL:
-            raise ValueError(f"numel ({self.numel}) exceeds triton maximum tensor numel ({TRITON_MAX_TENSOR_NUMEL})")
 
+        self.numel = validate_block_shape(self.shape)
         self.name = f'<{self.shape}, {self.element_ty}>'
 
     def to_ir(self, builder: ir.builder) -> ir.block_type:
@@ -1208,18 +1200,15 @@ def arange(start, end, _builder=None):
     """
 
 
-def _shape_check_impl(shape):
+def _unwrap_shape(shape):
     shape = _constexpr_to_value(shape)
-    for i, d in enumerate(shape):
-        if isinstance(d, int):
-            d = constexpr(d)
-        if not isinstance(d, constexpr):
-            raise TypeError(f"Shape element {i} must have type `constexpr`")
-        if not isinstance(d.value, int):
-            raise TypeError(f"Shape element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]")
-        if d.value & (d.value - 1) != 0:
-            raise ValueError(f"Shape element {i} must be a power of 2")
-    return [_constexpr_to_value(x) for x in shape]
+    return [_constexpr_to_value(s) for s in shape]
+
+
+def _shape_check_impl(shape):
+    shape = _unwrap_shape(shape)
+    validate_block_shape(shape)
+    return shape
 
 
 @builtin
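
After the refactor, block_type unwraps constexpr dimensions via _unwrap_shape and delegates the numel and power-of-two checks to validate_block_shape, so the builtin-argument path (_shape_check_impl) and the type constructor share one validation routine. A sketch of the observable behaviour (assuming block_type remains exported from triton.language; values are arbitrary):

import triton.language as tl

t = tl.block_type(tl.float32, [64, 64])
print(t.shape, t.numel)                        # [64, 64] 4096
try:
    tl.block_type(tl.float32, [2048, 2048])    # 2048*2048 exceeds TRITON_MAX_TENSOR_NUMEL (1048576)
except ValueError as e:
    print(e)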
