Commit b1e46f8

Fix 3d dot layout to llvm

1 parent 6a2c836

File tree

8 files changed: +195 -152 lines changed

third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp

Lines changed: 31 additions & 25 deletions

@@ -202,39 +202,45 @@ SmallVector<unsigned> DpasEncodingAttr::getCTAsPerCGA() const {
 
 SmallVector<int64_t>
 DpasEncodingAttr::getDPASRepetitions(ArrayRef<int64_t> shape, int opIdx) const {
+  // Always return a 3D shape repetitions for the ease of value handling, same
+  // to mma.
   auto warpsPerCTA = getWarpsPerCTA();
   int rank = shape.size();
-  SmallVector<int64_t> res(rank);
+  SmallVector<int64_t> rep(3, 1);
   if (opIdx == 0) {
     auto shapePerWarp = getShapeA();
-    if (rank == 3)
-      res[0] =
-          std::max<int64_t>(1, shape[0] / (shapePerWarp[0] * warpsPerCTA[0]));
-    res[rank - 2] = std::max<int64_t>(
-        1, shape[rank - 2] / (shapePerWarp[rank - 2] * warpsPerCTA[rank - 2]));
-    res[rank - 1] =
-        std::max<int64_t>(1, shape[rank - 1] / shapePerWarp[rank - 1]);
+    int64_t numRepBatch =
+        rank == 3 ? std::max<int64_t>(1, shape[0] /
+                                             (shapePerWarp[0] * warpsPerCTA[0]))
+                  : 1;
+    return {numRepBatch,
+            std::max<int64_t>(1, shape[rank - 2] / (shapePerWarp[rank - 2] *
+                                                    warpsPerCTA[rank - 2])),
+            std::max<int64_t>(1, shape[rank - 1] / shapePerWarp[rank - 1])};
   } else if (opIdx == 1) {
     auto shapePerWarp = getShapeB();
-    if (rank == 3)
-      res[0] =
-          std::max<int64_t>(1, shape[0] / (shapePerWarp[0] * warpsPerCTA[0]));
-    res[rank - 2] =
-        std::max<int64_t>(1, shape[rank - 2] / shapePerWarp[rank - 2]);
-    res[rank - 1] = std::max<int64_t>(
-        1, shape[rank - 1] / (shapePerWarp[rank - 1] * warpsPerCTA[rank - 1]));
+    int64_t numRepBatch =
+        rank == 3 ? std::max<int64_t>(1, shape[0] /
+                                             (shapePerWarp[0] * warpsPerCTA[0]))
+                  : 1;
+    return {numRepBatch,
+            std::max<int64_t>(1, shape[rank - 2] / shapePerWarp[rank - 2]),
+            std::max<int64_t>(1, shape[rank - 1] / (shapePerWarp[rank - 1] *
+                                                    warpsPerCTA[rank - 1]))};
  } else {
     assert(opIdx == 2 && "Unexpected operand id (valid ids are 0, 1 or 2)");
     auto shapePerWarp = getShapeC();
-    if (rank == 3)
-      res[0] =
-          std::max<int64_t>(1, shape[0] / (shapePerWarp[0] * warpsPerCTA[0]));
-    res[rank - 2] = std::max<int64_t>(
-        1, shape[rank - 2] / (shapePerWarp[rank - 2] * warpsPerCTA[rank - 2]));
-    res[rank - 1] = std::max<int64_t>(
-        1, shape[rank - 1] / (shapePerWarp[rank - 1] * warpsPerCTA[rank - 1]));
+    int64_t numRepBatch =
+        rank == 3 ? std::max<int64_t>(1, shape[0] /
+                                             (shapePerWarp[0] * warpsPerCTA[0]))
+                  : 1;
+    return {numRepBatch,
+            std::max<int64_t>(1, shape[rank - 2] / (shapePerWarp[rank - 2] *
+                                                    warpsPerCTA[rank - 2])),
+            std::max<int64_t>(1, shape[rank - 1] / (shapePerWarp[rank - 1] *
+                                                    warpsPerCTA[rank - 1]))};
   }
-  return res;
+  return rep;
 }
 
 unsigned DpasEncodingAttr::getTotalElemsPerThreadForOperands(

@@ -364,8 +370,8 @@ SmallVector<unsigned> DpasEncodingAttr::getElemsPerThreadForOperands(
   SmallVector<unsigned> elemsPerThread(rank);
   if (rank == 3)
     elemsPerThread[0] = repetitions[0];
-  elemsPerThread[rank - 2] = sizePerThread[rank - 2] * repetitions[rank - 2];
-  elemsPerThread[rank - 1] = sizePerThread[rank - 1] * repetitions[rank - 1];
+  elemsPerThread[rank - 2] = sizePerThread[rank - 2] * repetitions[1];
+  elemsPerThread[rank - 1] = sizePerThread[rank - 1] * repetitions[2];
 
   return elemsPerThread;
 };
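Taken together, the two hunks fix the contract of getDPASRepetitions: it now always returns three repetition counts in {batch, outer, inner} order, so downstream code can index with fixed positions 0/1/2 instead of rank-relative ones. Below is a minimal standalone sketch of the operand-A path; the tile and warp sizes (shapePerWarp = {1, 8, 16}, warpsPerCTA = {2, 4, 1}) are made-up illustrative values, not numbers from this commit.

// repetitions_sketch.cpp -- illustrative only; mirrors the opIdx == 0 branch.
#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

std::array<int64_t, 3> repetitionsForA(const std::vector<int64_t> &shape,
                                       const std::vector<int64_t> &shapePerWarp,
                                       const std::vector<int64_t> &warpsPerCTA) {
  int rank = shape.size();
  // Batch repetitions only exist for rank-3 (batched) dots.
  int64_t numRepBatch =
      rank == 3
          ? std::max<int64_t>(1, shape[0] / (shapePerWarp[0] * warpsPerCTA[0]))
          : 1;
  return {numRepBatch,
          std::max<int64_t>(1, shape[rank - 2] / (shapePerWarp[rank - 2] *
                                                  warpsPerCTA[rank - 2])),
          std::max<int64_t>(1, shape[rank - 1] / shapePerWarp[rank - 1])};
}

int main() {
  // Hypothetical batched operand A of shape [batch=8, M=64, K=64].
  auto rep = repetitionsForA({8, 64, 64}, {1, 8, 16}, {2, 4, 1});
  std::cout << rep[0] << ' ' << rep[1] << ' ' << rep[2] << '\n'; // 4 2 4
}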

third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp

Lines changed: 17 additions & 11 deletions

@@ -10,6 +10,7 @@
 #include "triton/Tools/StrUtil.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"

@@ -565,6 +566,7 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
       DPASLaneBasesC(repeatCount, executionSize, threadsPerWarp);
   tileLayout = LinearLayout({{kRegister, regBasesC}, {kLane, laneBasesC}},
                             ArrayRef(outDimNames).take_back(2));
+  // llvm::to_vector(llvm::reverse(ArrayRef(outDimNames).take_back(2))));
   // std::cout << (tileLayout.toString()) << std::endl;
   // The per-inst layout is repeated at each repCluster.
   // Hence, multiply with the identity layouts starting from the

@@ -575,30 +577,34 @@
                                            outDimNames[KDim]);
     tileLayout *= LinearLayout::identity1D(repCluster[nonKDim], kRegister,
                                            outDimNames[nonKDim]);
-    // std::cout << (tileLayout.toString()) << std::endl;
+    std::cout << (tileLayout.toString()) << std::endl;
 
     // // The identical layout is repeated among warps
     tileLayout *=
         LinearLayout::identity1D(warpsPerCTA[KDim], kWarp, outDimNames[KDim]);
     tileLayout *= LinearLayout::identity1D(warpsPerCTA[nonKDim], kWarp,
                                            outDimNames[nonKDim]);
-    // tileLayout *=
-    //     LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
+    if (rank == 3)
+      tileLayout *=
+          LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
+    auto order =
+        llvm::to_vector(llvm::reverse(triton::gpu::getWarpOrder(layout)));
+    std::cout << "order: " << order[1] << ", " << order[0] << std::endl;
     // tileLayout *= identityND(kWarp, warpsPerCTA,
-    //     llvm::to_vector(llvm::reverse(triton::gpu::getWarpOrder(layout))),
+    //     llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank))),
     //     outDimNames);
-    // std::cout << (tileLayout.toString()) << std::endl;
+    std::cout << (tileLayout.toString()) << std::endl;
   }
 
   // Lastly, the layout repeats to match the shape.
   // Operand A/B repeats through the K-dimension first then repeats
   // through the non-K dimension.
-  SmallVector<int64_t> numReps = dpas.getDPASRepetitions(shape, opIdx);
-  tileLayout *=
-      LinearLayout::identity1D(numReps[KDim], kRegister, outDimNames[KDim]);
-  tileLayout *= LinearLayout::identity1D(numReps[nonKDim], kRegister,
-                                         outDimNames[nonKDim]);
-  // std::cout << (tileLayout.toString()) << std::endl;
+  // SmallVector<int64_t> numReps = dpas.getDPASRepetitions(shape, opIdx);
+  // tileLayout *=
+  //     LinearLayout::identity1D(numReps[KDim], kRegister, outDimNames[KDim]);
+  // tileLayout *= LinearLayout::identity1D(numReps[nonKDim], kRegister,
+  //                                        outDimNames[nonKDim]);
+  // // std::cout << (tileLayout.toString()) << std::endl;
 
   return combineCtaCgaWithShape(std::move(tileLayout),
                                 CTALayoutAttr::getDefault(ctx, rank), shape);
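The rank-3 branch above leans on the multiplicative structure of linear layouts: each LinearLayout::identity1D(n, inDim, outDim) factor tiles the existing layout n times along outDim, so the batch dimension only needs one extra factor for warpsPerCTA[0]. The following is a toy model of just the size bookkeeping, not the real LinearLayout class (which tracks power-of-two basis vectors); all names and sizes are invented for illustration.

// layout_tiling_sketch.cpp -- toy model of identity1D composition.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

struct ToySizes {
  std::map<std::string, int64_t> in;  // register / lane / warp sizes
  std::map<std::string, int64_t> out; // dim0 / dim1 / dim2 sizes

  // Models tileLayout *= identity1D(n, inDim, outDim): the layout is
  // repeated n times along outDim by growing inDim by a factor of n.
  void timesIdentity1D(int64_t n, const std::string &inDim,
                       const std::string &outDim) {
    in.try_emplace(inDim, 1).first->second *= n;
    out.try_emplace(outDim, 1).first->second *= n;
  }
};

int main() {
  ToySizes c; // hypothetical DPAS C tile: 8x16 elements per warp
  c.timesIdentity1D(8, "register", "dim1");
  c.timesIdentity1D(16, "lane", "dim2");
  // Warp tiling over the matrix dims, then -- the change in this hunk --
  // over the leading batch dim when rank == 3.
  c.timesIdentity1D(4, "warp", "dim1");
  c.timesIdentity1D(1, "warp", "dim2");
  c.timesIdentity1D(2, "warp", "dim0");
  std::cout << "warps: " << c.in["warp"] << '\n'; // warps: 8 (4 * 1 * 2)
}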

third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 34 additions & 27 deletions

@@ -313,7 +313,8 @@ struct ConvertLayoutOpConversion
     return success();
   }
 
-  using ValueTable = std::map<std::pair<unsigned, unsigned>, Value>;
+  // using ValueTable = std::map<std::pair<unsigned, unsigned>, Value>;
+  using ValueTable = std::map<std::array<unsigned, 3>, Value>;
 
   ValueTable getValuesFromDpasLayoutStruct(Location loc,
                                            ConversionPatternRewriter &rewriter,
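Switching the ValueTable key from std::pair to std::array<unsigned, 3> needs no custom comparator: std::array has lexicographic relational operators, so the map also iterates batch-major, matching the consuming loops below. A standalone sketch (the string values merely stand in for LLVM values):

#include <array>
#include <iostream>
#include <map>
#include <string>

int main() {
  // std::array compares lexicographically, so {b, outer, inner} keys
  // order batch-major, then outer, then inner.
  std::map<std::array<unsigned, 3>, std::string> vals;
  vals[{1, 0, 0}] = "batch1";
  vals[{0, 1, 0}] = "outer1";
  vals[{0, 0, 1}] = "inner1";
  for (const auto &[key, v] : vals) // prints inner1, outer1, batch1
    std::cout << key[0] << key[1] << key[2] << ": " << v << '\n';
}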
@@ -338,17 +339,20 @@ struct ConvertLayoutOpConversion
 
     int offset = 0;
     ValueTable result;
-    for (int i = 0; i < repetitions[outerDim]; ++i) {
-      for (int j = 0; j < repetitions[innerDim]; ++j) {
-        for (int repOuter = 0; repOuter < repCluster[outerDim]; ++repOuter) {
-          for (int repInner = 0; repInner < repCluster[innerDim]; ++repInner) {
-            Value matVal = rewriter.create<LLVM::UndefOp>(loc, dotOpTy);
-            for (int k = 0; k < numElemsPerOperand; ++k) {
-              matVal =
-                  insert_element(dotOpTy, matVal, elems[offset++], i32_val(k));
+    for (unsigned b = 0; b < repetitions[0]; ++b) {
+      for (int i = 0; i < repetitions[1]; ++i) {
+        for (int j = 0; j < repetitions[2]; ++j) {
+          for (int repOuter = 0; repOuter < repCluster[outerDim]; ++repOuter) {
+            for (int repInner = 0; repInner < repCluster[innerDim];
+                 ++repInner) {
+              Value matVal = rewriter.create<LLVM::UndefOp>(loc, dotOpTy);
+              for (int k = 0; k < numElemsPerOperand; ++k) {
+                matVal = insert_element(dotOpTy, matVal, elems[offset++],
+                                        i32_val(k));
+              }
+              result[{b, i * repCluster[outerDim] + repOuter,
+                      j * repCluster[innerDim] + repInner}] = matVal;
             }
-            result[{i * repCluster[outerDim] + repOuter,
-                    j * repCluster[innerDim] + repInner}] = matVal;
           }
         }
       }

@@ -367,35 +371,38 @@
         dpasLayout.getDPASRepetitions(dstType.getShape(), opIdx);
     ArrayRef<unsigned> repCluster = dpasLayout.getRepCluster();
     size_t rank = repCluster.size();
+    unsigned repBatch = repetitions[0];
     unsigned repOuter = 0u;
     unsigned repInner = 0u;
     unsigned repClusterOuter = 0u;
     if (opIdx == 0) {
       // operand A
-      repOuter = repetitions[rank - 2];
-      repInner = repetitions[rank - 1];
+      repOuter = repetitions[1];
+      repInner = repetitions[2];
       repClusterOuter = repCluster[rank - 2];
     } else {
       // operand B
-      repOuter = repetitions[rank - 1];
-      repInner = repetitions[rank - 2];
+      repOuter = repetitions[2];
+      repInner = repetitions[1];
       repClusterOuter = repCluster[rank - 1];
     }
 
     // TODO: Operands B requires extra steps to combine [8, 16] to [16, 16].
     SmallVector<Value> elems;
-    for (int m = 0; m < repOuter; ++m) {
-      for (int k = 0; k < repInner; ++k) {
-        for (int repOuterIdx = 0; repOuterIdx < repClusterOuter;
-             ++repOuterIdx) {
-          unsigned offsetM = m * repClusterOuter + repOuterIdx;
-          unsigned offsetN = k;
-          Value matVal = vals.at({offsetM, offsetN});
-          VectorType vecType = cast<mlir::VectorType>(matVal.getType());
-          Type valTy = vecType.getElementType();
-          for (int i = 0; i < vecType.getNumElements(); ++i) {
-            Value val = extract_element(valTy, matVal, i32_val(i));
-            elems.push_back(val);
+    for (unsigned b = 0; b < repBatch; ++b) {
+      for (int m = 0; m < repOuter; ++m) {
+        for (int k = 0; k < repInner; ++k) {
+          for (int repOuterIdx = 0; repOuterIdx < repClusterOuter;
+               ++repOuterIdx) {
+            unsigned offsetM = m * repClusterOuter + repOuterIdx;
+            unsigned offsetN = k;
+            Value matVal = vals.at({b, offsetM, offsetN});
+            VectorType vecType = cast<mlir::VectorType>(matVal.getType());
+            Type valTy = vecType.getElementType();
+            for (int i = 0; i < vecType.getNumElements(); ++i) {
+              Value val = extract_element(valTy, matVal, i32_val(i));
+              elems.push_back(val);
+            }
           }
         }
       }
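The loops above define the packed register order: batch-major, then repetition indices, then rep-cluster indices, with each map key encoding {batch, composed outer, composed inner}. This sketch enumerates the flat offsets for invented counts (repetitions = {2, 2, 2}, repCluster = {2, 2}, one scalar per operand) to show the correspondence:

#include <array>
#include <iostream>
#include <map>

int main() {
  // Hypothetical counts, chosen only to make the arithmetic visible.
  const unsigned repetitions[3] = {2, 2, 2}; // {batch, outer, inner}
  const unsigned repCluster[2] = {2, 2};
  std::map<std::array<unsigned, 3>, unsigned> result;
  unsigned offset = 0;
  for (unsigned b = 0; b < repetitions[0]; ++b)
    for (unsigned i = 0; i < repetitions[1]; ++i)
      for (unsigned j = 0; j < repetitions[2]; ++j)
        for (unsigned ro = 0; ro < repCluster[0]; ++ro)
          for (unsigned ri = 0; ri < repCluster[1]; ++ri)
            result[{b, i * repCluster[0] + ro, j * repCluster[1] + ri}] =
                offset++;
  // Each batch covers 2*2*2*2 = 16 packed values, so {1, 0, 0} -> 16.
  std::cout << result.at({1, 0, 0}) << '\n'; // 16
}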

third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandDPAS.cpp

Lines changed: 42 additions & 32 deletions

@@ -3,7 +3,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "llvm/Support/ErrorHandling.h"
 
-using ValueTable = std::map<std::pair<int, int>, Value>;
+using ValueTable = std::map<std::array<int, 3>, Value>;
 using mlir::triton::gpu::getShapePerCTA;
 using mlir::triton::gpu::SharedEncodingAttr;
 using mlir::triton::gpu::intel::DpasEncodingAttr;

@@ -44,9 +44,9 @@ template <unsigned opIdx> class DpasMatmulLoader {
   SmallVector<Value> computeLdsMatOffs(Value warpOff, Value lane,
                                        Value cSwizzleOffset);
   // Load the matrix value.
-  Value loadMatrix(int repOuter, int repInner, const ArrayRef<Value> ptrs,
-                   LLVM::LLVMStructType structTy, Type smemTy,
-                   Value cSwizzleOffset) const;
+  Value loadMatrix(int repBatch, int repOuter, int repInner,
+                   const ArrayRef<Value> ptrs, LLVM::LLVMStructType structTy,
+                   Type smemTy, Value cSwizzleOffset) const;
 
 private:
   unsigned getThreadsPerWarp() const {

@@ -57,6 +57,7 @@ template <unsigned opIdx> class DpasMatmulLoader {
   MemDescType descTy;
 
   SmallVector<Value> smemStrides;
+  Value repBatchDimStride;
   Value repNonKDimStride;
   Value repKDimStride;
 

@@ -176,19 +177,19 @@ DpasMatmulLoader<opIdx>::computeLdsMatOffs(Value warpId, Value laneId,
 }
 
 template <unsigned opIdx>
-Value DpasMatmulLoader<opIdx>::loadMatrix(int repOuter, int repInner,
-                                          const ArrayRef<Value> ptrs,
-                                          LLVM::LLVMStructType structTy,
-                                          Type smemTy,
-                                          Value cSwizzleOffset) const {
+Value DpasMatmulLoader<opIdx>::loadMatrix(
+    int repBatch, int repOuter, int repInner, const ArrayRef<Value> ptrs,
+    LLVM::LLVMStructType structTy, Type smemTy, Value cSwizzleOffset) const {
   Type elemTy = structTy.getBody()[0];
   assert(
      llvm::any_of(structTy.getBody(), [&](Type ty) { return ty == elemTy; }) &&
      "The struct should have the same element types.");
 
+  Value offsetBatch = mul(i32_val(repBatch), repBatchDimStride);
   Value offsetOuter = mul(i32_val(repOuter), repNonKDimStride);
   Value offsetInner = mul(i32_val(repInner), repKDimStride);
   Value offset = add(offsetOuter, offsetInner);
+  offset = add(offset, offsetBatch);
 
   Value llvmStruct = rewriter.create<LLVM::UndefOp>(loc, structTy);
   size_t elemNum = structTy.getBody().size();
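The new batch term in loadMatrix is ordinary strided addressing, one stride per repetition dimension; with invented element-count strides, the arithmetic it emits as LLVM IR reduces to:

#include <cstdint>
#include <iostream>

// Sketch of the offset loadMatrix now computes: batch term added on top
// of the existing outer/inner terms. Stride values are hypothetical.
int32_t matrixOffset(int32_t repBatch, int32_t repOuter, int32_t repInner,
                     int32_t batchStride, int32_t nonKStride,
                     int32_t kStride) {
  int32_t offsetBatch = repBatch * batchStride;
  int32_t offsetOuter = repOuter * nonKStride;
  int32_t offsetInner = repInner * kStride;
  return offsetOuter + offsetInner + offsetBatch;
}

int main() {
  // e.g. a 64x32 batch slice, 8-row outer tiles, 16-element inner tiles.
  std::cout << matrixOffset(1, 2, 3, 64 * 32, 8 * 32, 16) << '\n'; // 2608
}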
@@ -203,18 +204,20 @@ Value DpasMatmulLoader<opIdx>::loadMatrix(int repOuter, int repInner,
 }
 
 Value composeValuesToDotOperandLayoutStruct(
-    const ValueTable &vals, int n0, int n1,
+    const ValueTable &vals, int batch, int n0, int n1,
     const LLVMTypeConverter *typeConverter, Location loc,
     ConversionPatternRewriter &rewriter) {
   std::vector<Value> elems;
-  for (int m = 0; m < n0; ++m) {
-    for (int k = 0; k < n1; ++k) {
-      Value matVal = vals.at({m, k});
-      auto matType = cast<LLVM::LLVMStructType>(matVal.getType());
-      Type valTy = matType.getBody()[0];
-      for (int i = 0; i < matType.getBody().size(); ++i) {
-        auto val = extract_val(valTy, matVal, i);
-        elems.push_back(val);
+  for (int b = 0; b < batch; ++b) {
+    for (int m = 0; m < n0; ++m) {
+      for (int k = 0; k < n1; ++k) {
+        Value matVal = vals.at({b, m, k});
+        auto matType = cast<LLVM::LLVMStructType>(matVal.getType());
+        Type valTy = matType.getBody()[0];
+        for (int i = 0; i < matType.getBody().size(); ++i) {
+          auto val = extract_val(valTy, matVal, i);
+          elems.push_back(val);
+        }
       }
     }
   }

@@ -245,7 +248,7 @@ Type getSharedMemTy(Type argType) {
 }
 
 template <unsigned opIdx>
-std::function<void(int, int)>
+std::function<void(int, int, int)>
 getLoadMatrixFn(MemDescType descTy, const SharedMemoryObject &smemObj,
                 DpasEncodingAttr dpasLayout, unsigned warpsPerTile,
                 SmallVector<unsigned> instrShape, Value warpId,

@@ -261,7 +264,8 @@ getLoadMatrixFn(MemDescType descTy, const SharedMemoryObject &smemObj,
   ArrayRef<unsigned> order = sharedLayout.getOrder();
 
   // (a, b) is the coordinate.
-  auto load = [=, &rewriter, &smemObj, &instrShape, &vals](int a, int b) {
+  auto load = [=, &rewriter, &smemObj, &instrShape, &vals](int batch, int outer,
+                                                           int inner) {
     DpasMatmulLoader<opIdx> loader(dpasLayout, descTy, warpsPerTile,
                                    smemObj.strides, instrShape, rewriter,
                                    typeConverter, loc);

@@ -289,7 +293,8 @@ getLoadMatrixFn(MemDescType descTy, const SharedMemoryObject &smemObj,
                      SmallVector<Type>(totalElem / threadsPerWarp,
                                        typeConverter->convertType(eltTy)));
 
-    vals[{a, b}] = loader.loadMatrix(a, b, ptrs, matTy, smemTy, cSwizzleOffset);
+    vals[{batch, outer, inner}] = loader.loadMatrix(
+        batch, outer, inner, ptrs, matTy, smemTy, cSwizzleOffset);
   };
 
   return load;

@@ -325,27 +330,32 @@ Value loadOperand(ConversionPatternRewriter &rewriter, Location loc,
       LLVM::delinearize(rewriter, loc, warpId, warpsPerCTA, order);
 
   // FIXME: Using opIdx as the dimIdx will be incorrect in 3D case.
-  unsigned ceilRes = mlir::ceil<unsigned>(shapePerCTA[opIdx], shape[opIdx]);
-  Value outerWarpDim = urem(multiDimWarpId[opIdx], i32_val(ceilRes));
-  unsigned warpsPerTile = std::min<unsigned>(warpsPerCTA[opIdx], ceilRes);
+  unsigned rank = shape.size();
+  unsigned dimOuter = opIdx ? (rank - 1) : (rank - 2);
+  unsigned ceilRes =
+      mlir::ceil<unsigned>(shapePerCTA[dimOuter], shape[dimOuter]);
+  Value outerWarpDim = urem(multiDimWarpId[dimOuter], i32_val(ceilRes));
+  unsigned warpsPerTile = std::min<unsigned>(warpsPerCTA[dimOuter], ceilRes);
 
   // Get the function to use to load the operand.
   ValueTable vals;
-  std::function<void(int, int)> loadFn = getLoadMatrixFn<opIdx>(
+  std::function<void(int, int, int)> loadFn = getLoadMatrixFn<opIdx>(
      descTy, smemObj, dpasLayout, warpsPerTile, std::move(shape), warpId,
      outerWarpDim, laneId, vals, typeConverter, rewriter, loc);
 
   // Load the operand.
-  int64_t numRepOuter = numReps[opIdx];
-  int64_t numRepK = numReps[(opIdx == 0) ? 1 : 0];
+  int64_t numRepBatch = numReps[0];
+  int64_t numRepOuter = numReps[opIdx ? 2 : 1];
+  int64_t numRepK = numReps[opIdx ? 1 : 2];
 
-  for (int m = 0; m < numRepOuter; ++m)
-    for (int k = 0; k < numRepK; ++k)
-      loadFn(m, k);
+  for (int b = 0; b < numRepBatch; ++b)
+    for (int m = 0; m < numRepOuter; ++m)
+      for (int k = 0; k < numRepK; ++k)
+        loadFn(b, m, k);
 
   // Format the values into an LLVM::Struct.
-  return composeValuesToDotOperandLayoutStruct(vals, numRepOuter, numRepK,
-                                               typeConverter, loc, rewriter);
+  return composeValuesToDotOperandLayoutStruct(
+      vals, numRepBatch, numRepOuter, numRepK, typeConverter, loc, rewriter);
 }
 
 } // namespace
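Because getDPASRepetitions now always returns {batch, M-reps, K-reps} for operand A and {batch, K-reps, N-reps} for operand B, the loop-bound selection in loadOperand reduces to a fixed index swap. A sketch with invented repetition counts:

#include <cstdint>
#include <iostream>

// How loadOperand picks its loop bounds from the 3D repetitions after
// this commit: operand A (opIdx 0) takes outer from index 1 and K from
// index 2; operand B (opIdx 1) takes outer from index 2 and K from index 1.
void printLoopBounds(unsigned opIdx, const int64_t numReps[3]) {
  int64_t numRepBatch = numReps[0];
  int64_t numRepOuter = numReps[opIdx ? 2 : 1];
  int64_t numRepK = numReps[opIdx ? 1 : 2];
  std::cout << "opIdx " << opIdx << ": batch=" << numRepBatch
            << " outer=" << numRepOuter << " K=" << numRepK << '\n';
}

int main() {
  const int64_t repsA[3] = {4, 2, 8}; // hypothetical A reps: {batch, M, K}
  const int64_t repsB[3] = {4, 8, 2}; // hypothetical B reps: {batch, K, N}
  printLoopBounds(0, repsA); // opIdx 0: batch=4 outer=2 K=8
  printLoopBounds(1, repsB); // opIdx 1: batch=4 outer=2 K=8
}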
