Feedback

nbpatel · nbpatel · commit a25c40deb198 · 2025-08-28T16:38:55.000Z
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -780,7 +780,7 @@ struct WgToSgLoadGatherOpWithOffset
     ArrayRef<int64_t> wgShape = resultType.getShape();
 
     xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op.getResult());
-    if (!layout || !layout.getSgLayout())
+    if (!layout || !layout.isForWorkgroup())
       return failure();
 
     SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
@@ -820,9 +820,8 @@ struct WgToSgStoreScatterOpWithOffset
     if (!valueType)
       return failure();
 
-    ArrayRef<int64_t> wgShape = valueType.getShape();
     xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op.getValue());
-    if (!layout || !layout.getSgLayout())
+    if (!layout || !layout.isForWorkgroup())
       return failure();
 
     auto chunkSizeOpt = op.getChunkSize();
@@ -833,12 +832,9 @@ struct WgToSgStoreScatterOpWithOffset
       rewriter.create<xegpu::StoreScatterOp>(
           loc, val, op.getDest(), offs, mask, chunkSizeAttr, op.getL1HintAttr(),
           op.getL2HintAttr(), op.getL3HintAttr());
-      // Update the layout_result_0 attribute to drop sg_layout and sg_data.
-      if (auto layoutAttr =
-              op->getAttrOfType<xegpu::LayoutAttr>("layout_result_0")) {
-        if (auto newLayout = layoutAttr.dropSgLayoutAndData())
-          op->setAttr("layout_result_0", newLayout);
-      }
+      // Update the layout attribute to drop sg_layout and sg_data.
+      if (auto newLayout = layout.dropSgLayoutAndData())
+        op->setAttr("layout", newLayout);
     }
     rewriter.eraseOp(op);
     return success();
@@ -1042,7 +1038,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
   target.addDynamicallyLegalOp<xegpu::StoreScatterOp>(
       [=](xegpu::StoreScatterOp op) -> bool {
         // Check if the layout attribute is present on the result.
-        auto layout = op->getAttrOfType<xegpu::LayoutAttr>("layout_result_0");
+        auto layout = op->getAttrOfType<xegpu::LayoutAttr>("layout");
         if (!layout)
           return true;
         return isLegal(layout);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -269,7 +269,8 @@ gpu.module @test_distribution {
   gpu.func @load_gather(%src : memref<?xf16>) {
     // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x4xindex>
     // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<32x4xi1>
-    // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
+    // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
+    // CHECK-SAME: : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
     %offset =  arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<0> : vector<256x16xindex>
     %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<1> : vector<256x16xi1>
     %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>, l1_hint = #xegpu.cache_hint<cached>}
@@ -283,21 +284,23 @@ gpu.module @test_distribution {
     // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16>
     // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex>
     // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
-    // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
+    // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
+    // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
     %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<25.5> : vector<256xf16>
     %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex>
     %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1>
-    xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>, l1_hint = #xegpu.cache_hint<cached>}
+    xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [32], sg_data = [8]>, l1_hint = #xegpu.cache_hint<cached>}
       : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
     gpu.return
   }
 
-  // CHECK-LABEL: @load_with_chunk_size
+  // CHECK-LABEL: @load_with_non_unit_chunk_size
   // CHECK-SAME: %[[ARG0:.*]]: memref<?xf16>
-  gpu.func @load_with_chunk_size(%src : memref<?xf16>) {
+  gpu.func @load_with_non_unit_chunk_size(%src : memref<?xf16>) {
     // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex>
     // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
-    // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint<cached>}> : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
+    // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint<cached>}>
+    // CHECK-SAME: : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
     %offset =  arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex>
     %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1>
     %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 4]>, l1_hint = #xegpu.cache_hint<cached>}