
Commit 5c1c908

save work
1 parent b3e6dc5 commit 5c1c908


5 files changed: +51 -39 lines changed


mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h

Lines changed: 8 additions & 0 deletions
@@ -34,4 +34,12 @@ class TensorDescType;
 #define GET_OP_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h.inc>

+namespace mlir {
+namespace xegpu {
+FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
+                                               LayoutAttr layout);
+FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
+} // namespace xegpu
+} // namespace mlir
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPU_H
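A minimal caller's-eye sketch of the new entry point (not part of this commit; the wrapper name is hypothetical, and only the declarations above are assumed):

#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/BuiltinTypes.h"

using namespace mlir;

// Query the per-lane (SIMT) fragment type for a tensor descriptor. Returns a
// null type when the descriptor carries no subgroup-level LayoutAttr, since
// getDistributedVectorType reports failure in that case.
static VectorType getFragmentTypeOrNull(xegpu::TensorDescType tdescTy) {
  FailureOr<VectorType> distTy = xegpu::getDistributedVectorType(tdescTy);
  if (failed(distTy))
    return {};
  return distTy.value();
}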

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td

Lines changed: 0 additions & 5 deletions
@@ -189,11 +189,6 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
         return scatter_attr.getChunkSize().getInt();
       return 1;
     }
-
-    // This returns a vector type that represents the fragment of data owned by
-    // a work item in SIMT mode if this tensor descriptor is used in a XeGPU
-    // load/store operation.
-    FailureOr<VectorType> getDistributedVectorType();
   }];

   let hasCustomAssemblyFormat = true;

mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp

Lines changed: 21 additions & 7 deletions
@@ -8,6 +8,7 @@

 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include <numeric>
@@ -336,16 +337,17 @@ LogicalResult TensorDescType::verify(
 //    * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
 // Distributed vector is a 1D vector with shape:
 //    [fragment_size]
-FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
-  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
+FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy) {
+  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
   // It only works for subgroup level layout, which only has lane_layout
   // and lane_data, and is to distribute a SIMD code into SIMT code.
   if (!layout || !layout.isSgLayout())
     return failure();

   SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
   SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
-  auto tdescShape = getShape();
+  auto tdescShape = tdescTy.getShape();
+  auto elementType = tdescTy.getElementType();

   // compute sgSize by multiply elements of laneLayout
   // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
@@ -354,14 +356,14 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
                                 std::multiplies<int64_t>());

   // Case 1: regular loads/stores
-  auto scatterAttr = getEncodingAsScatterTensorDescAttr();
+  auto scatterAttr = tdescTy.getEncodingAsScatterTensorDescAttr();
   if (scatterAttr) {
     auto chunkSize = scatterAttr.getChunkSize().getInt();
     // Verify if the first dimension of the tensor descriptor shape is
     // distributable.
     assert(tdescShape[0] == laneLayout[0] &&
            "tensor descriptor shape is not distributable");
-    return VectorType::get({chunkSize}, getElementType());
+    return VectorType::get({chunkSize}, elementType);
   }

   // Case 2: block loads/stores
@@ -374,9 +376,21 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
     tensorSize *= tdescDim;
   }
   // tensorSize must be adjusted for array_length.
-  tensorSize *= getArrayLength();
+  tensorSize *= tdescTy.getArrayLength();

-  return VectorType::get({tensorSize / sgSize}, getElementType());
+  return VectorType::get({tensorSize / sgSize}, elementType);
+}
+
+// Helper to get the distributed vector type for a given vector type according
+// to a given LayoutAttr.
+FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
+                                               LayoutAttr layout) {
+  auto shape = originalType.getShape();
+  auto helperTdescTy = xegpu::TensorDescType::get(
+      shape, originalType.getElementType(),
+      /*array_length=*/1, /*boundary_check=*/true,
+      /*memory_space=*/xegpu::MemorySpace::Global, layout);
+  return xegpu::getDistributedVectorType(helperTdescTy);
 }

 } // namespace xegpu
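As a sanity check on the block (non-scatter) path above, consider a hypothetical 8x16 tensor_desc of f16 with array_length = 1 and a subgroup-level layout of lane_layout = [1, 16], lane_data = [1, 1]: sgSize = 1 * 16 = 16 and tensorSize = 8 * 16 * 1 = 128, so the function returns vector<8xf16> (128 / 16 elements per lane).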

mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -16,4 +16,5 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   MLIRPass
   MLIRTransforms
   MLIRGPUDialect
+  MLIRXeGPUDialect
 )

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 21 additions & 27 deletions
@@ -696,9 +696,9 @@ class LayoutAttrAssignment {
   void assignToUsers(Value v, xegpu::LayoutAttr layout);
   xegpu::LayoutAttr getLayoutAttrForValue(Value v);
   LogicalResult resolveConflicts();
-  function_ref<LayoutInfo(Value)>
-      getAnalysisResult; // Callable to get the layout of a value based on the
-                         // layout propagation analysis.
+  // Callable to get the layout of a value based on the layout propagation
+  // analysis.
+  function_ref<LayoutInfo(Value)> getAnalysisResult;
   Operation *top;
 };

@@ -851,22 +851,6 @@ FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
   return VectorType::get(distributedShape, originalType.getElementType());
 }

-/// Get the distributed vector type for a source vector type according to a
-/// xegpu::LayoutAttr.
-static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
-                                           VectorType originalType) {
-  auto shape = originalType.getShape();
-  auto distVecTyOrFailure =
-      xegpu::TensorDescType::get(shape, originalType.getElementType(),
-                                 /*array_length=*/1, /*boundary_check=*/true,
-                                 /*memory_space=*/xegpu::MemorySpace::Global,
-                                 layout)
-          .getDistributedVectorType();
-  assert(llvm::succeeded(distVecTyOrFailure) &&
-         "Failed to compute distributed vector type for the given vector type");
-  return distVecTyOrFailure.value();
-}
-
 /// Drop the layout attribute from the tensor descriptor type if layout is
 /// present.
 static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) {
@@ -1175,7 +1159,7 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
     /// supported by the store op. Type mismatch must be resolved using
     /// appropriate cast op.
     auto storeNdDistributedValueTyOrFailure =
-        storeOp.getTensorDescType().getDistributedVectorType();
+        xegpu::getDistributedVectorType(storeOp.getTensorDescType());
     if (failed(storeNdDistributedValueTyOrFailure))
       return rewriter.notifyMatchFailure(
           storeOp, "Failed to get distributed vector type for the store op");
@@ -1263,7 +1247,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
    /// type.
    rewriter.setInsertionPointAfter(newWarpOp);
    auto loadNdDistValueTyOrFailure =
-       loadOp.getTensorDescType().getDistributedVectorType();
+       xegpu::getDistributedVectorType(loadOp.getTensorDescType());
    if (failed(loadNdDistValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          loadOp, "Failed to get distributed vector type for the load op");
@@ -1379,17 +1363,27 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);

+   FailureOr<VectorType> expectedDistLhsTyOrFailure =
+       xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
+   FailureOr<VectorType> expectedDistRhsTyOrFailure =
+       xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
+   FailureOr<VectorType> expectedDistResultTyOrFailure =
+       xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
+   if (failed(expectedDistLhsTyOrFailure) ||
+       failed(expectedDistRhsTyOrFailure) ||
+       failed(expectedDistResultTyOrFailure))
+     return rewriter.notifyMatchFailure(
+         dpasOp,
+         "Failed to get distributed vector type for the dpas operands.");
    // Create a new dpas op outside the warp op.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newDpasOperands;
    SmallVector<VectorType> newDpasOperandExpectedTypes;
+
    /// Resolve the distributed types with the original types.
-   newDpasOperandExpectedTypes.push_back(
-       getDistributedVectorType(layoutA, dpasOp.getLhsType()));
-   newDpasOperandExpectedTypes.push_back(
-       getDistributedVectorType(layoutB, dpasOp.getRhsType()));
-   auto distributedResultTy =
-       getDistributedVectorType(layoutOut, dpasOp.getResultType());
+   newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
+   newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
+   auto distributedResultTy = expectedDistResultTyOrFailure.value();
    if (dpasOp.getAcc())
      newDpasOperandExpectedTypes.push_back(distributedResultTy);
