llvm
diff --git a/‎mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp‎
Lines changed: 0 additions & 1 deletion b/‎mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp‎
Lines changed: 31 additions & 37 deletions b/‎mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp‎
Lines changed: 31 additions & 37 deletions
diff --git a/‎mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir‎
Lines changed: 0 additions & 2 deletions b/‎mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir‎
Lines changed: 2 additions & 4 deletions b/‎mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir‎
Lines changed: 9 additions & 27 deletions b/‎mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir‎
Lines changed: 9 additions & 27 deletions
@@ -288,7 +288,6 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
         llvm::reverse(llvm::seq<int64_t>(0, sgLayoutInt.size())));
   }
 
-  // Validate order
   if (order.size() != sgLayoutInt.size()) {
     return failure();
   }
 
@@ -1235,60 +1235,63 @@ struct WgToSgVectorTransposeOp
     if (!layout || !layout.isForWorkgroup())
       return failure();
 
-    // Get the source layout for validation
     xegpu::DistributeLayoutAttr sourceLayout =
         xegpu::getDistributeLayoutAttr(op.getVector());
     if (!sourceLayout || !sourceLayout.isForWorkgroup())
       return failure();
 
-    // Validate that result layout is transpose of source layout
     SmallVector<int64_t> sourceSgLayout =
         sourceLayout.getEffectiveSgLayoutAsInt();
     SmallVector<int64_t> sourceSgData = sourceLayout.getEffectiveSgDataAsInt();
     SmallVector<int64_t> resultSgLayout = layout.getEffectiveSgLayoutAsInt();
     SmallVector<int64_t> resultSgData = layout.getEffectiveSgDataAsInt();
+    DenseI32ArrayAttr sourceOrder = sourceLayout.getOrder();
+    DenseI32ArrayAttr resultOrder = layout.getOrder();
 
-    ArrayRef<int64_t> permutation = op.getPermutation();
-
-    // Check that sgLayout and sgData are properly transposed
-    if (sourceSgLayout.size() != resultSgLayout.size() ||
-        sourceSgData.size() != resultSgData.size() ||
-        sourceSgLayout.size() != permutation.size()) {
+    if (!sourceOrder || !resultOrder) {
       return rewriter.notifyMatchFailure(
-          op, "Source and result layouts must have same rank as permutation");
+          op, "Both source and result must have order attributes");
     }
 
-    // Validate sgLayout transpose
-    for (size_t i = 0; i < permutation.size(); ++i) {
-      int64_t srcDim = permutation[i];
-      if (srcDim < 0 || srcDim >= static_cast<int64_t>(sourceSgLayout.size())) {
-        return rewriter.notifyMatchFailure(op, "Invalid permutation index");
-      }
-      if (resultSgLayout[i] != sourceSgLayout[srcDim]) {
-        return rewriter.notifyMatchFailure(
-            op, "Result sgLayout is not transpose of source sgLayout according "
-                "to permutation");
-      }
+    SmallVector<int64_t> sourceOrderVec = llvm::to_vector(
+        llvm::map_range(sourceOrder.asArrayRef(),
+                        [](int32_t idx) { return static_cast<int64_t>(idx); }));
+    SmallVector<int64_t> resultOrderVec = llvm::to_vector(
+        llvm::map_range(resultOrder.asArrayRef(),
+                        [](int32_t idx) { return static_cast<int64_t>(idx); }));
+
+    ArrayRef<int64_t> permutation = op.getPermutation();
+    size_t expectedSize = permutation.size();
+    if (sourceSgLayout.size() != expectedSize ||
+        sourceSgData.size() != expectedSize ||
+        resultSgLayout.size() != expectedSize ||
+        resultSgData.size() != expectedSize ||
+        sourceOrderVec.size() != expectedSize ||
+        resultOrderVec.size() != expectedSize) {
+      return rewriter.notifyMatchFailure(
+          op, "All layouts and permutation must have the same rank");
     }
 
-    // Validate sgData transpose
+    // Check that sgLayout, sgData & order are properly transposed for operand
+    // and result
     for (size_t i = 0; i < permutation.size(); ++i) {
       int64_t srcDim = permutation[i];
-      if (resultSgData[i] != sourceSgData[srcDim]) {
+      if (resultSgLayout[i] != sourceSgLayout[srcDim] ||
+          resultSgData[i] != sourceSgData[srcDim] ||
+          resultOrderVec[i] != sourceOrderVec[srcDim]) {
         return rewriter.notifyMatchFailure(
-            op, "Result sgData is not transpose of source sgData according to "
-                "permutation");
+            op, "Result layout is not a valid transpose of source layout "
+                "according to permutation");
       }
     }
 
     SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
     VectorType newResultType =
         VectorType::get(sgShape, resultType.getElementType());
-
     SmallVector<Value> newTransposeOps;
     for (auto src : adaptor.getVector()) {
       auto newTranspose = vector::TransposeOp::create(
-          rewriter, op.getLoc(), newResultType, src, op.getPermutation());
+          rewriter, op.getLoc(), newResultType, src, permutation);
       if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
           !layout.getEffectiveInstDataAsInt().empty())
         xegpu::setDistributeLayoutAttr(newTranspose->getResult(0),
@@ -1444,7 +1447,8 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
       });
 
   target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp,
-                               vector::TransposeOp, vector::BroadcastOp>(
+                               vector::TransposeOp, vector::BroadcastOp,
+                               vector::MultiDimReductionOp>(
       [=](Operation *op) -> bool {
         // Check for either a SliceAttr or LayoutAttr on the result.
         auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
@@ -1463,16 +1467,6 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
         return isLegal(layout);
       });
 
-  target.addDynamicallyLegalOp<vector::BroadcastOp>(
-      [=](vector::BroadcastOp op) -> bool {
-        return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
-      });
-
-  target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
-      [=](vector::MultiDimReductionOp op) -> bool {
-        return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
-      });
-
   target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
       [=](xegpu::ConvertLayoutOp op) -> bool {
         return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
 
@@ -10,7 +10,6 @@ gpu.module @test {
     // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex>
     // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex>
     // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex>
-    // CHECK-DAG: %[[RESULT:.*]] = builtin.unrealized_conversion_cast %[[ADD]] : vector<32xindex> to vector<128xindex>
     %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex>
     gpu.return %step : vector<128xindex>
   }
@@ -25,7 +24,6 @@ gpu.module @test {
     // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex>
     // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex>
     // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex>
-    // CHECK-DAG: %[[RESULT:.*]] = builtin.unrealized_conversion_cast %[[ADD]] : vector<32xindex> to vector<128xindex>
     %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex>
     gpu.return %0 : vector<128xindex>
   }
 
@@ -166,14 +166,12 @@ gpu.module @test_elementwise_ops {
     %load_b = xegpu.load_nd %tdesc_b
       : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
       -> vector<24x32xf32>
-    // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
-    // CHECK-SAME-COUNT-12: : vector<2x2xf32>
+    // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
     // CHECK-NOT: arith.negf
     %negf = arith.negf %load_a
       {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<24x32xf32>
-    // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
-    // CHECK-SAME-COUNT-12: : vector<2x2xf32>
+    // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
     // CHECK-NOT: math.powf
     %powf = math.powf %load_a, %load_b
       {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
 
@@ -4,14 +4,7 @@ gpu.module @test_round_robin_assignment {
   // CHECK-LABEL: create_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
   gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) {
-      // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index
-      // CHECK: %[[C4:.*]] = arith.constant 4 : index
-      // CHECK: %[[IDX:.*]] = index.remu %[[SGID]], %[[C4]]
-      // CHECK: %[[IDY_DIV:.*]] = index.divu %[[SGID]], %[[C4]]
-      // CHECK: %[[C8:.*]] = arith.constant 8 : index
-      // CHECK: %[[IDY:.*]] = index.remu %[[IDY_DIV]], %[[C8]]
-      // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32>
-      // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
       // CHECK-NOT: xegpu.create_nd_tdesc
       %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
         -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -46,9 +39,7 @@ gpu.module @test_round_robin_assignment {
   gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {
       %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
         -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-      // CHECK-COUNT-4: xegpu.load_nd %{{.*}}
-      // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-      // CHECK-SAME-COUNT-4: -> vector<16x16xf32>
+      // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
       // CHECK-NOT: xegpu.load_nd
       %load =  xegpu.load_nd %tdesc
         : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -61,8 +52,7 @@ gpu.module @test_round_robin_assignment {
   gpu.func @store_nd(%src: memref<256x128xf32>) {
       %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
         -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-      // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}}
-      // CHECK-SAME-COUNT-4: : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
       // CHECK-NOT: xegpu.store_nd
       %load = xegpu.load_nd %tdesc
         : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -77,8 +67,7 @@ gpu.module @test_round_robin_assignment {
   gpu.func @update_nd(%src: memref<256x128xf32>){
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
       ->  !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-    // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16]
-    // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     // CHECK-NOT: xegpu.update_nd_offset
     %update = xegpu.update_nd_offset %tdesc, [0, 16]
       : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -88,13 +77,9 @@ gpu.module @test_round_robin_assignment {
   // CHECK-LABEL: dpas
   // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>)
   gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) {
-    // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16>
-    // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16>
-    // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-    // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}}
-    // CHECK-SAME-COUNT-16: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+    // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+    // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
     // CHECK-NOT: xegpu.dpas
     %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16>
       -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -115,8 +100,7 @@ gpu.module @test_round_robin_assignment {
   // CHECK-LABEL: prefetch_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
   gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) {
-    // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}}
-    // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     // CHECK-NOT: xegpu.prefetch_nd
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
       -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -133,9 +117,7 @@ gpu.module @test_round_robin_assignment {
     %load =  xegpu.load_nd %tdesc
       : !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
       -> vector<128x1xf32>
-    // CHECK-COUNT-2: vector.broadcast {{.*}}
-    // CHECK-SAME-COUNT-2: {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>}
-    // CHECK-SAME-COUNT-2: : vector<16x1xf32> to vector<16x32xf32>
+    // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32>
     // CHECK-NOT: vector.broadcast
     %broadcast = vector.broadcast %load
       {layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
Original file line number	Diff line number	Diff line change
`@@ -288,7 +288,6 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,`
`288`	`288`	`llvm::reverse(llvm::seq<int64_t>(0, sgLayoutInt.size())));`
`289`	`289`	`}`
`290`	`290`
`291`		`- // Validate order`
`292`	`291`	`if (order.size() != sgLayoutInt.size()) {`
`293`	`292`	`return failure();`
`294`	`293`	`}`