
Commit 7a156e7

Merge commit '0f1e09e308fa71544dd833f768305425c9f2c383'
2 parents: af5a09a + 0f1e09e

57 files changed (+5621 additions, -604 deletions)


.github/workflows/integration-tests-amd.yml

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ jobs:
  echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
  fi
  pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
- pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
+ pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice_concat_op.py
  TRITON_ALWAYS_COMPILE=1 pytest --capture=tee-sys -rfs third_party/amd/python/test/test_scalarize_packed_fops.py
  cd python/test/unit
  pytest --capture=tee-sys -rfs -n 12 language runtime \

Makefile

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ test-unit: all
  $(PYTEST) -s -n 8 python/triton_kernels/tests/
  TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
  # Run attention separately to avoid out of gpu memory
- TRITON_PRINT_AUTOTUNING=1 $(PYTEST) -vs python/tutorials/06-fused-attention.py
+ $(PYTEST) -vs python/tutorials/06-fused-attention.py
  TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
  $(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 17 additions & 5 deletions
@@ -208,14 +208,26 @@ def TTG_MemDescSubviewOp : TTG_Op<"memdesc_subview", [Pure, MemDescViewTrait]> {

  let description = [{
  This operation returns a new descriptor representing a subview of the buffer.
- It doesn't affect the underlying memory. The subview can be rank-reduced.
+ It doesn't affect the underlying memory.

  For example, suppose that
  - the input shape is 2x4x16xf16,
- - the output shape is 4x4xf16, and
- - offsets = [1, 0, 4].
-
- Then in Python syntax, the subview covers input[1][0:4][4:8].
+ - the output shape is 4x16xf16, and
+ - offsets = [1, 0, 0].
+
+ Then in Python syntax, the subview covers input[1].
+
+ Just one dimension may be split (at most one non-zero offset).
+
+ When the input shape and the output shape have different rank:
+ Or the output shape is a tensor of 1D tensor of 1 element:
+ - The rank of the output must be 1D smaller than the input.
+ - We assume the input is split along the 0th dimension.
+ - The offset along the 0th dimension may be a runtime value.
+ When the input and the output have the same rank:
+ - The offset must be a compile-time constant
+ - Larger or equal to the tile of the tensor (or zero)
+ - That does not split the input along the swizzling pattern (if any)
  }];
  let arguments = (
  ins TTG_MemDescType:$src, Variadic<I32>:$offsets);
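
The documented example is easiest to see with a host-side analogy. Below is a minimal NumPy sketch (illustration only, not the ttg.memdesc_subview op or Triton API): a rank-reducing subview of a 2x4x16 buffer at offsets [1, 0, 0] covers input[1] and only changes the descriptor, never the underlying memory.

```python
import numpy as np

# Host-side analogy of the documented example: a 2x4x16 buffer viewed at
# offsets [1, 0, 0], producing a rank-reduced 4x16 result.
buf = np.zeros((2, 4, 16), dtype=np.float16)

offsets = (1, 0, 0)
subview = buf[offsets[0]]              # "input[1]" from the description

assert subview.shape == (4, 16)
assert np.shares_memory(subview, buf)  # same memory, only the descriptor changes
```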

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 30 additions & 1 deletion
@@ -3,6 +3,7 @@
  #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
  #include "triton/Dialect/TritonGPU/IR/Attributes.h"
  #include "triton/Dialect/TritonGPU/IR/Types.h"
+ #include "triton/Tools/LayoutUtils.h"

  using namespace mlir;
  using namespace mlir::triton;
@@ -421,6 +422,7 @@ struct MemDescSubviewOpConversion
  matchAndRewrite(triton::gpu::MemDescSubviewOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
+   auto *ctx = op->getContext();
    auto b = TritonLLVMOpBuilder(loc, rewriter);
    auto srcTy = op.getSrc().getType();
    auto destTy = op.getResult().getType();
@@ -433,15 +435,42 @@
                                 llvmElemTy, rewriter);
    auto smemStrides = smemObj.getStrides(srcTy, loc, rewriter);
    SmallVector<Value> opOffsetVals = op.getOffsets();
+   // We assume we always create a subview of the last dimensions
    SmallVector<Value> opSmemStrides(smemStrides.end() - opOffsetVals.size(),
                                     smemStrides.end());
+   // Compute total offset
    SmallVector<Value> offsetVals;
    auto destRank = op.getResult().getType().getRank();
    auto rankReduced = srcTy.getRank() - destRank;
    for (int i = rankReduced; i < opOffsetVals.size(); i++) {
      offsetVals.push_back(b.add(opOffsetVals[i], smemObj.getOffsets()[i]));
    }
-   Value offset = dot(rewriter, loc, opOffsetVals, opSmemStrides);
+
+   Value offset;
+   if (rankReduced || (destTy.getRank() == 1 && destTy.getDimSize(0) == 1)) {
+     // We are splitting the pipelining dimension which may not be a power of 2
+     // so we can't use LinearLayouts
+     offset = dot(rewriter, loc, opOffsetVals, opSmemStrides);
+   } else {
+     auto dimNames = standardOutDimNames(ctx, opOffsetVals.size());
+     SmallVector<std::pair<StringAttr, Value>> logicalOffsets;
+     // This assumes the subviews are additive, in the sense that we can
+     // compute the offset of one and an add it to the offset of the previous
+     // one we computed. We check for this in the verifier.
+     for (int i = 0; i < rankReduced; i++) {
+       logicalOffsets.push_back({dimNames[i], b.i32_val(0)});
+     }
+     for (int i = rankReduced; i < opOffsetVals.size(); i++) {
+       logicalOffsets.push_back({dimNames[i], offsetVals[i - rankReduced]});
+     }
+     // The order gives us the honest-to-goodness layout rank
+     auto srcAllocShape =
+         srcTy.getAllocShape().take_back(getOrder(srcTy).size());
+     auto llInv = toLinearLayout(srcAllocShape, srcTy.getEncoding()).invert();
+     offset =
+         applyLinearLayout(loc, rewriter, llInv, logicalOffsets)[0].second;
+   }
+
    auto base = smemObj.getBase();
    auto elemPtrTy = base.getType();
    smemObj = SharedMemoryObject(b.gep(elemPtrTy, llvmElemTy, base, offset),
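
In the rank-reducing (and 1-element) case the lowering keeps the old behaviour: the offset is a plain dot product of the subview offsets with the shared-memory strides, since the pipelining dimension may not be a power of two. The same-rank case instead goes through the inverted linear layout. A minimal sketch of the dot-product arithmetic, assuming a row-major 2x4x16 buffer (plain Python, not the actual lowering):

```python
# Element offset of a subview as offsets . strides, mirroring the
# dot(rewriter, loc, opOffsetVals, opSmemStrides) fallback above.
def linear_offset(offsets, strides):
    return sum(o * s for o, s in zip(offsets, strides))

shape = (2, 4, 16)
strides = (4 * 16, 16, 1)                        # row-major strides for `shape`
assert linear_offset((1, 0, 0), strides) == 64   # start of input[1]
```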

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 99 additions & 6 deletions
@@ -10,6 +10,7 @@
  #include "triton/Dialect/TritonGPU/IR/Types.h"
  #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
  #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+ #include "triton/Tools/LayoutUtils.h"
  #include "llvm/Support/Casting.h"
  #include "llvm/Support/LogicalResult.h"

@@ -602,15 +603,107 @@ LogicalResult MemDescSubviewOp::verify() {
            "offsets other than the first one must be constant zeros");
      }
    }
+   return success();
  }

- // TODO(jlebar): Currently we generate illegal encodings, so we can't add a
- // verifier for them. In particular, we use the same encoding for the src and
- // dst of a subview op, when the subview removes a dimension. That generates
- // an illegal shared encoding (because the size of `order` doesn't match the
- // rank of the tensor), but it's not checked anywhere, and we believe the
- // resulting code ultimately works.
+ assert(isa<SharedEncodingTrait>(srcEnc));

+ // corner case: 1D -> 1D into a 1 element tensor (we don't have 0D tensors)
+ if (srcTy.getRank() == 1 && dstTy.getRank() == 1 &&
+     dstTy.getDimSize(0) == 1) {
+   return success();
+ }
+
+ // There are two cases:
+ // 1. The subview is rank-reducing
+ //    - We split along the first dimension. It can be with non-constant offsets
+ if (srcTy.getRank() != dstTy.getRank()) {
+   if (srcTy.getRank() - dstTy.getRank() != 1) {
+     return emitError(
+         "only nD -> (n-1)D rank-reducing subviews are supported");
+   }
+   for (auto offset : getOffsets().take_back(dstTy.getRank())) {
+     if (auto constOp = offset.getDefiningOp<arith::ConstantOp>()) {
+       if (auto offsetInt = dyn_cast<IntegerAttr>(constOp.getValue())) {
+         if (offsetInt.getInt() != 0) {
+           return emitError("only first offset can be non-zero for a "
+                            "rank-reducing subview");
+         }
+       } else {
+         return emitError(
+             "only integer constant values are allowed for the split");
+       }
+     } else {
+       return emitError("only constant values are allowed outside the front "
+                        "dimension in a rank-reducing subview");
+     }
+   }
+   return success();
+ }
+ assert(srcTy.getRank() == dstTy.getRank());
+ // 2. The src is non-rank-reducing
+ //    - We split along at most one dim, but just with constant values
+ //    - The values where the split happens must not be within the swizzling
+ //      pattern
+ // Check which dimension we are splitting along
+ int dim = -1;
+ for (int i = 0; i < srcTy.getRank(); i++) {
+   if (srcTy.getDimSize(i) != dstTy.getDimSize(i)) {
+     if (dim != -1) {
+       return emitError(
+           "We don't allow subviews that split along multiple dimensions");
+     }
+     dim = i;
+   }
+ }
+ SmallVector<int64_t> offsets;
+ for (auto offset : getOffsets()) {
+   if (auto constOp = offset.getDefiningOp<arith::ConstantOp>()) {
+     if (auto offsetInt = dyn_cast<IntegerAttr>(constOp.getValue())) {
+       offsets.push_back(offsetInt.getInt());
+     } else {
+       return emitError(
+           "only integer constant values are allowed for the split");
+     }
+   } else {
+     return emitError("only constant values are allowed for the split");
+   }
+ }
+ // Identity subview
+ if (dim == -1) {
+   return success();
+ }
+
+ for (auto [i, offset] : llvm::enumerate(offsets)) {
+   if (i != dim) {
+     if (offset != 0) {
+       return emitError("A non zero offset found in a dimension that is "
+                        "not being split");
+     }
+   } else {
+     if (offset & (dstTy.getDimSize(dim) - 1)) {
+       return emitError("The split offset may not touch the tile");
+     }
+   }
+ }
+ auto ctx = getContext();
+ // The order gives us the honest-to-goodness layout rank
+ auto srcAllocShape = srcTy.getAllocShape().take_back(getOrder(srcTy).size());
+ auto llInv =
+     triton::gpu::toLinearLayout(srcAllocShape, srcTy.getEncoding()).invert();
+ auto kDim = mlir::StringAttr::get(ctx, "dim" + llvm::Twine(dim));
+ llvm::SmallVector<std::pair<mlir::StringAttr, int32_t>> namedOffsets;
+ for (auto d : standardOutDimNames(ctx, srcTy.getRank())) {
+   namedOffsets.push_back({d, 0});
+ }
+ for (int dimSize = dstTy.getDimSize(dim); dimSize < srcTy.getDimSize(dim);
+      dimSize *= 2) {
+   namedOffsets[dim] = {kDim, dimSize};
+   if (!llvm::isPowerOf2_32(llInv.apply(namedOffsets)[0].second)) {
+     return emitError(
+         "We don't support splitting along the swizzling pattern");
+   }
+ }
  return success();
 }
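
The last loop in the new verifier walks the power-of-two offsets between the destination extent and the source extent along the split dimension and requires each of them to land on a power-of-two linear offset under the inverted layout; otherwise the split would cut through the swizzle pattern. A toy Python model of that check, where `linear_offset_of` is a hypothetical stand-in for applying `llInv` to an offset along the split dimension:

```python
# Toy model of the swizzle check above (not the MLIR verifier itself).
def split_respects_swizzle(dst_dim, src_dim, linear_offset_of):
    size = dst_dim
    while size < src_dim:
        off = linear_offset_of(size)
        if off <= 0 or off & (off - 1):      # must be a power of two
            return False
        size *= 2
    return True

# For an unswizzled row-major 16x8 buffer, splitting dim 0 into 8-row tiles is
# fine: offset 8 along dim 0 maps to linear offset 64, a power of two.
assert split_respects_swizzle(8, 16, lambda rows: rows * 8)
```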

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 11 additions & 4 deletions
@@ -188,10 +188,12 @@ getSharedMemoryMMAOperand(Value v, mlir::PatternRewriter &rewriter, int opIdx,
  if (newOrder != order && op) {
    op->emitWarning("Warning: Forcing a different order [")
        << newOrder[0] << ", " << newOrder[1]
-       << "] on SMEM than the register order for the opreand " << opIdx
+       << "] on SMEM than the register order for the operand " << opIdx
        << ". Registers will be transposed before SMEM store and the pipelined "
           "load for this operand will be disabled, so poor performance is "
-          "expected.";
+          "expected. Recommendation: consider transposing the operand in "
+          "global "
+          "memory to remove the need to transpose the tensor in registers.";
  }

  Attribute SharedMemorySpace =
@@ -391,9 +393,14 @@ class BlockedToMMA : public mlir::OpRewritePattern<DotOp> {
      int bitwidth = getElementTypeOrSelf(a).getIntOrFloatBitWidth();
      a = getDotOperand(a, 0, bitwidth);
    } else {
-     a = getSharedMemoryMMAOperand(a, rewriter, 0, allowTranspose);
+     a = getSharedMemoryMMAOperand(a, rewriter, 0, allowTranspose,
+                                   /*isMMAv5Fp4Padded=*/false,
+                                   /*forceTranspose=*/false, dotOp);
    }
-   b = getSharedMemoryMMAOperand(b, rewriter, 1, allowTranspose);
+   b = getSharedMemoryMMAOperand(b, rewriter, 1, allowTranspose,
+                                 /*isMMAv5Fp4Padded=*/false,
+                                 /*forceTranspose=*/false, dotOp);
+
    newDot = rewriter.create<triton::nvidia_gpu::WarpGroupDotOp>(
        dotOp.getLoc(), newRetType, a, b, newAcc, nullptr,
        dotOp.getInputPrecision(), dotOp.getMaxNumImpreciseAcc(), false);

lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp

Lines changed: 6 additions & 34 deletions
@@ -15,8 +15,6 @@
  #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
  #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
  #include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
- #include "triton/Tools/LayoutUtils.h"
- #include "triton/Tools/LinearLayout.h"
  #include "llvm/ADT/MapVector.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SetVector.h"
@@ -287,50 +285,24 @@ SmallVector<Value> splitLhs(OpBuilder &builder,
 SmallVector<Value> splitRhs(OpBuilder &builder,
                             TypedValue<ttg::MemDescType> rhs, int64_t newK) {
   auto loc = rhs.getLoc();
-  auto *ctx = builder.getContext();
   auto type = rhs.getType();
   auto rank = type.getRank();
   auto kDim = rank - 2;
   auto nSplits = type.getShape()[kDim] / newK;
-  // offset -> matrix
-  auto ll = ttg::toLinearLayout(type.getShape(), type.getEncoding());
-  auto llInv = ll.invert();
-
-  // Split into
-  auto kOffset = StringAttr::get(ctx, "offset");
-  assert(llInv.getOutDimSize(kOffset) == product(type.getShape()));
-  auto dimNames = tt::standardOutDimNames(ctx, rank);
-  SmallVector<std::pair<StringAttr, int32_t>> newOutDims;
-  for (auto d : getOrder(type)) {
-    newOutDims.push_back({dimNames[d], type.getShape()[d]});
-  }
-  // Split into shmem shape and invert
-  llInv = llInv.reshapeOuts(newOutDims);
-  llInv = llInv.transposeOuts(dimNames);
-  auto toOffsets = [&](const SmallVector<std::pair<StringAttr, int32_t>>
-                           &shape) {
-    return llvm::to_vector(
-        llvm::map_range(llvm::make_second_range(shape), [&](int32_t v) {
-          return builder.create<arith::ConstantIntOp>(loc, v, 32).getResult();
-        }));
-  };
-  // New Shape
   auto shape = llvm::to_vector(type.getShape());
   shape[kDim] = newK;
+  SmallVector<Value> offsetsVal;
+  for (int i = 0; i < rank; i++) {
+    offsetsVal.push_back(builder.create<arith::ConstantIntOp>(loc, 0, 32));
+  }
   auto newType = ttg::MemDescType::get(
       shape, type.getElementType(), type.getEncoding(), type.getMemorySpace(),
       /*isMutable=*/false, type.getAllocShape());
   SmallVector<Value> ret;
-  SmallVector<std::pair<StringAttr, int32_t>> logicalOffsets;
-  for (int i = 0; i < rank; i++) {
-    logicalOffsets.push_back({StringAttr::get(ctx, "dim" + Twine(i)), 0});
-  }
   for (int i = 0; i < nSplits; i++) {
-    logicalOffsets[kDim].second = i * newK;
-    auto shmemOffsets = toOffsets(llInv.apply(logicalOffsets));
-
+    offsetsVal[kDim] = builder.create<arith::ConstantIntOp>(loc, i * newK, 32);
    Value newSmem = builder.create<triton::gpu::MemDescSubviewOp>(
-        loc, newType, rhs, shmemOffsets);
+        loc, newType, rhs, offsetsVal);
    ret.push_back(newSmem);
  }
  return ret;
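
With this change splitRhs no longer applies the inverted linear layout itself: it simply emits logical offsets that are zero everywhere except i * newK along the K dimension and lets MemDescSubviewOp (with its new verifier and lowering) resolve the shared-memory address. A small Python sketch of the offsets it now produces, assuming a 2-D [K, N] operand; the helper name is illustrative, not part of the Triton codebase:

```python
# Logical offsets for splitting the K dimension into K/newK subviews.
def split_k_offsets(shape, new_k):
    rank = len(shape)
    k_dim = rank - 2                      # K dim, as in `kDim = rank - 2` above
    n_splits = shape[k_dim] // new_k
    return [[i * new_k if d == k_dim else 0 for d in range(rank)]
            for i in range(n_splits)]

# Splitting K = 64 into chunks of 16 yields four subviews at K offsets 0..48;
# MemDescSubviewOp turns each logical offset into the shared-memory address.
assert split_k_offsets((64, 128), new_k=16) == [[0, 0], [16, 0], [32, 0], [48, 0]]
```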

python/test/unit/language/test_matmul.py

Lines changed: 2 additions & 1 deletion
@@ -546,7 +546,8 @@ def flatten_scale(scale):
      print(f"SWP failed for M = {M}, N = {N}")


- @pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(128, 128, 64), (128, 64, 128), (64, 128, 32), (128, 256, 32)])
+ @pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(128, 128, 64), (128, 64, 128), (64, 128, 32), (128, 256, 32),
+                          (256, 64, 32)])
  @pytest.mark.parametrize("a_trans", [False, True])
  @pytest.mark.parametrize("dtype_src_str", ["float32", "float16", "float8e5"])
  @pytest.mark.skipif(is_hip() or (is_cuda() and torch.cuda.get_device_capability()[0] != 10),
