intel
diff --git a/‎build_tools/patches/0008-amend-xegpu-transpose_bit_width-and-qualified-type-f.patch
Lines changed: 32 additions & 9 deletions b/‎build_tools/patches/0008-amend-xegpu-transpose_bit_width-and-qualified-type-f.patch
Lines changed: 32 additions & 9 deletions
diff --git a/‎lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
Lines changed: 4 additions & 1 deletion b/‎lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
Lines changed: 4 additions & 1 deletion
diff --git a/‎lib/Dialect/XeTile/Transforms/Blocking.cpp
Lines changed: 6 additions & 16 deletions b/‎lib/Dialect/XeTile/Transforms/Blocking.cpp
Lines changed: 6 additions & 16 deletions
diff --git a/‎lib/Utils/XeArch.cpp
Lines changed: 6 additions & 8 deletions b/‎lib/Utils/XeArch.cpp
Lines changed: 6 additions & 8 deletions
@@ -1,14 +1,14 @@
-From 1ee69567682e0f653d17d8eaaa3f54ec40201b44 Mon Sep 17 00:00:00 2001
+From 49cf7d3645dece35c0e5a4d48d2a00c801218656 Mon Sep 17 00:00:00 2001
 From: Chao Chen <[email protected]>
-Date: Thu, 2 May 2024 14:53:44 +0000
-Subject: [PATCH 1/2] amend xegpu: transpose_bit_width and qualified type for
- atomic_amw
+Date: Fri, 10 May 2024 14:36:04 +0000
+Subject: [PATCH] amend xegpu definition:  - add transpose_bit_width for load nd
+  - fix type print for atomic_rmw  - relax dpas verifier to accept 2D operand
 
 ---
- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  6 +++---
- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp         | 10 ++++++++++
- mlir/test/Dialect/XeGPU/XeGPUOps.mlir          |  2 +-
- 3 files changed, 14 insertions(+), 4 deletions(-)
+ .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  6 +++---
+ mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 19 ++++++++++++++-----
+ mlir/test/Dialect/XeGPU/XeGPUOps.mlir         |  2 +-
+ 3 files changed, 18 insertions(+), 9 deletions(-)
 
 diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
 index e477d9a0ca3f..5f95be1c87df 100644
@@ -42,7 +42,7 @@ index e477d9a0ca3f..5f95be1c87df 100644
  }
 
 diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
-index 22959224d56c..e550de6a97cd 100644
+index 22959224d56c..858afbd6d8aa 100644
 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
 +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -219,6 +219,16 @@ LogicalResult LoadNdOp::verify() {
@@ -62,6 +62,29 @@ index 22959224d56c..e550de6a97cd 100644
    if (array_len > 1) {
      auto it = tdescShape.begin();
      tdescShape.insert(it, array_len);
+@@ -413,9 +423,8 @@ LogicalResult DpasOp::verify() {
+   int64_t lhsRank = getLhsType().getRank();
+   int64_t rhsRank = getRhsType().getRank();
+
+-  if (lhsRank != rhsRank || lhsRank != 3)
+-    return emitOpError(
+-        "lhs and rhs rank does not match for dpas op, or their rank is not 3.");
++  if (lhsRank != rhsRank)
++    return emitOpError("lhs and rhs rank does not match for dpas op.");
+
+   if (getAcc() && getAccType() != getResultType())
+     return emitOpError("Accumulator and Result for dpas op should have the "
+@@ -423,8 +432,8 @@ LogicalResult DpasOp::verify() {
+
+   auto lhsShape = getLhsType().getShape();
+   auto rhsShape = getRhsType().getShape();
+-  if (lhsShape[1] != rhsShape[0] || lhsShape[2] != rhsShape[2])
+-    return emitOpError("K-dimension or vnni-factor mismatch.");
++  if (lhsShape[1] != rhsShape[0])
++    return emitOpError("K-dimension mismatch.");
+
+   return success();
+ }
 diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
 index 00d32d2a2ee9..ad037d3fbefd 100644
 --- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
 
@@ -145,6 +145,7 @@ lowerUnpackOrPack(XeGPUOneToNPatterRewriter &rewriter, mlir::Operation *op,
                   llvm::ArrayRef<int64_t> inGrids,
                   llvm::ArrayRef<int64_t> outGrids, bool isVnniFormat = false,
                   bool isForDPASB = false) {
+
   // handle based on the dim0, and save results into intermediates
   llvm::SmallVector<mlir::Value> intermediates;
   if (inBlkSizes[0] == outBlkSizes[0]) { // do nothing
@@ -269,9 +270,11 @@ class SgTileUnpackOpPattern
     // specific attention needed for vectors in vnni format,
     // which is applied to load for dpas.
     auto loadOp = op.getInVec().getDefiningOp<xetile::LoadTileOp>();
+    auto elemTy = op.getInVec().getType().getElementType();
     bool isDpasA = loadOp && isForDPASA(loadOp);
     bool isDpasB = loadOp && isForDPASB(loadOp);
-    bool isVnniFormat = isDpasA || isDpasB;
+    bool isVnniFormat = (isDpasA || isDpasB) && elemTy.isIntOrFloat() &&
+                        elemTy.getIntOrFloatBitWidth() < 32;
 
     llvm::ArrayRef<int64_t> outGrids;
     mlir::DenseI64ArrayAttr outBlkSizes;
 
@@ -197,23 +197,13 @@ static llvm::SmallVector<unsigned int>
 getMMASize(mlir::Type elemTy, const int APrecision, const int BPrecision,
            const int CPrecision, const int DPrecision,
            std::shared_ptr<XeuArchInterface> uArchInterface) {
-  assert(elemTy.isIntOrFloat());
-  auto bits = elemTy.getIntOrFloatBitWidth();
-  imex::DPASConfig dpasParams;
-  llvm::SmallVector<unsigned int> result;
-  switch (bits) {
-  case 16:
-    dpasParams = uArchInterface->getDPASConfig(APrecision, BPrecision,
-                                               CPrecision, DPrecision);
-    result = llvm::SmallVector<unsigned int>(
-        {dpasParams.m, dpasParams.k, dpasParams.n});
-    break;
-  default:
-    result = llvm::SmallVector<unsigned int>({8, 8, 8});
-    break;
-  }
-  return result;
+  assert(elemTy.isIntOrFloat() && "unsupported element type.");
+  auto dpasParams = uArchInterface->getDPASConfig(APrecision, BPrecision,
+                                                  CPrecision, DPrecision);
+  return llvm::SmallVector<unsigned int>(
+      {dpasParams.m, dpasParams.k, dpasParams.n});
 }
+
 // it blocks a constant dense value if it is used by XeTile operators,
 // e.g, tile_mma and store_tile. It currently extends a 2D vector into
 // 4D vector with the last 2 dim corresponding to block size.
 
@@ -218,13 +218,10 @@ mlir::LogicalResult XeuArchInterface::isLegalDpasOp(mlir::Operation *op) {
       return op->emitOpError() << "Unsupported dpas config";
     }
 
-    if ((lhsRank != rhsRank) || (lhsRank != 3)) {
-      return op->emitOpError()
-             << "lhs and rhs rank does not match for dpas op, or "
-             << "their rank is not 3. "
-             << "\n"
-             << "lhsRank: " << lhsRank << "\n"
-             << "rhsRank:" << rhsRank;
+    if (lhsRank != rhsRank) {
+      return op->emitOpError() << "lhs and rhs rank does not match for dpas op "
+                               << "(lhsRank: " << lhsRank << ", "
+                               << "rhsRank:" << rhsRank << ").\n";
     }
 
     DPASConfig dpasParams =
@@ -241,7 +238,8 @@ mlir::LogicalResult XeuArchInterface::isLegalDpasOp(mlir::Operation *op) {
              << " dpas config: mxnxk = " << M << "x" << N << "x" << K;
     }
 
-    unsigned int BNumElements = rhsShape[0] * rhsShape[1] * rhsShape[2];
+    unsigned int BNumElements = std::accumulate(
+        rhsShape.begin(), rhsShape.end(), 1, std::multiplies<unsigned>());
     // Execution size for matrix B should match dpas params
     if (BNumElements != K * N) {
       return op->emitOpError()