Skip to content

Commit d628a22

Browse files
committed
Address feedback
1 parent c7124c1 commit d628a22

File tree

3 files changed

+16
-39
lines changed

3 files changed

+16
-39
lines changed

mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp

Lines changed: 12 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,9 +1225,7 @@ struct WgToSgVectorTransposeOp
12251225
LogicalResult
12261226
matchAndRewrite(vector::TransposeOp op, OneToNOpAdaptor adaptor,
12271227
ConversionPatternRewriter &rewriter) const override {
1228-
VectorType resultType = dyn_cast<VectorType>(op.getResult().getType());
1229-
if (!resultType)
1230-
return failure();
1228+
VectorType resultType = op.getResultVectorType();
12311229

12321230
ArrayRef<int64_t> wgShape = resultType.getShape();
12331231
xegpu::DistributeLayoutAttr layout =
@@ -1242,9 +1240,7 @@ struct WgToSgVectorTransposeOp
12421240

12431241
SmallVector<int64_t> sourceSgLayout =
12441242
sourceLayout.getEffectiveSgLayoutAsInt();
1245-
SmallVector<int64_t> sourceSgData = sourceLayout.getEffectiveSgDataAsInt();
12461243
SmallVector<int64_t> resultSgLayout = layout.getEffectiveSgLayoutAsInt();
1247-
SmallVector<int64_t> resultSgData = layout.getEffectiveSgDataAsInt();
12481244
DenseI32ArrayAttr sourceOrder = sourceLayout.getOrder();
12491245
DenseI32ArrayAttr resultOrder = layout.getOrder();
12501246

@@ -1253,37 +1249,20 @@ struct WgToSgVectorTransposeOp
12531249
op, "Both source and result must have order attributes");
12541250
}
12551251

1256-
SmallVector<int64_t> sourceOrderVec = llvm::to_vector(
1257-
llvm::map_range(sourceOrder.asArrayRef(),
1258-
[](int32_t idx) { return static_cast<int64_t>(idx); }));
1259-
SmallVector<int64_t> resultOrderVec = llvm::to_vector(
1260-
llvm::map_range(resultOrder.asArrayRef(),
1261-
[](int32_t idx) { return static_cast<int64_t>(idx); }));
1262-
12631252
ArrayRef<int64_t> permutation = op.getPermutation();
1264-
size_t expectedSize = permutation.size();
1265-
if (sourceSgLayout.size() != expectedSize ||
1266-
sourceSgData.size() != expectedSize ||
1267-
resultSgLayout.size() != expectedSize ||
1268-
resultSgData.size() != expectedSize ||
1269-
sourceOrderVec.size() != expectedSize ||
1270-
resultOrderVec.size() != expectedSize) {
1253+
size_t permutationSize = permutation.size();
1254+
if (sourceSgLayout.size() != permutationSize ||
1255+
resultSgLayout.size() != permutationSize) {
12711256
return rewriter.notifyMatchFailure(
1272-
op, "All layouts and permutation must have the same rank");
1257+
op, "Layouts and permutation must have the same rank");
12731258
}
12741259

1275-
// Check that sgLayout, sgData & order are properly transposed for operand
1260+
// Check that sgLayout, sgData & order are properly transposed for source
12761261
// and result
1277-
for (size_t i = 0; i < permutation.size(); ++i) {
1278-
int64_t srcDim = permutation[i];
1279-
if (resultSgLayout[i] != sourceSgLayout[srcDim] ||
1280-
resultSgData[i] != sourceSgData[srcDim] ||
1281-
resultOrderVec[i] != sourceOrderVec[srcDim]) {
1282-
return rewriter.notifyMatchFailure(
1283-
op, "Result layout is not a valid transpose of source layout "
1284-
"according to permutation");
1285-
}
1286-
}
1262+
if (!layout.isTransposeOf(sourceLayout, permutation))
1263+
return rewriter.notifyMatchFailure(
1264+
op, "Result layout is not a valid transpose of source layout "
1265+
"according to permutation");
12871266

12881267
SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
12891268
VectorType newResultType =
@@ -1292,10 +1271,8 @@ struct WgToSgVectorTransposeOp
12921271
for (auto src : adaptor.getVector()) {
12931272
auto newTranspose = vector::TransposeOp::create(
12941273
rewriter, op.getLoc(), newResultType, src, permutation);
1295-
if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
1296-
!layout.getEffectiveInstDataAsInt().empty())
1297-
xegpu::setDistributeLayoutAttr(newTranspose->getResult(0),
1298-
layout.dropSgLayoutAndData());
1274+
xegpu::setDistributeLayoutAttr(newTranspose->getResult(0),
1275+
layout.dropSgLayoutAndData());
12991276
newTransposeOps.push_back(newTranspose.getResult());
13001277
}
13011278

mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,9 @@ gpu.module @test_distribution {
121121
// CHECK-LABEL: vector_transpose
122122
gpu.func @vector_transpose(%src: memref<256x128xf32>) {
123123
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
124-
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1], order =[0, 1]>>
124+
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
125125
%load = xegpu.load_nd %tdesc[0, 0]
126-
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1], order =[0, 1]>>
126+
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
127127
-> vector<256x128xf32>
128128
// CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<32x16xf32> to vector<16x32xf32>
129129
// CHECK-NOT: vector.transpose

mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -467,9 +467,9 @@ gpu.module @test_distribution {
467467
// CHECK-LABEL: vector_transpose
468468
gpu.func @vector_transpose(%src: memref<256x32xf32>) {
469469
%tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32>
470-
-> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [1, 16], lane_data = [1, 1], order =[0, 1]>>
470+
-> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
471471
%load = xegpu.load_nd %tdesc[0, 0]
472-
: !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [1, 16], lane_data = [1, 1], order =[0, 1]>>
472+
: !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
473473
-> vector<256x32xf32>
474474
//CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<64x32xf32> to vector<32x64xf32>
475475
%trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x32xf32> to vector<32x256xf32>

0 commit comments

Comments (0)