Commit 06884d0

[mlir][xegpu] Bug fix in UpdateNdOffset distribution. (#150545)
The reason is that the UpdateNdOffset source operand does not retain its layout when it is yielded by the warp op. The `warp_execute_on_lane0` op expects the TensorDesc type to remain unchanged during distribution out of its region, so we use UnrealizedConversionCasts to reconcile this mismatch outside the warp op (via `resolveDistributedTy`).
1 parent d08c297 commit 06884d0
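
To illustrate the reconciliation mechanism the message refers to, here is a minimal C++ sketch of what a resolveDistributedTy-style helper can look like: it bridges the layout-carrying type produced inside the warp region with the layout-free type expected outside by inserting a builtin.unrealized_conversion_cast tagged with resolve_simt_type_mismatch (the attribute the new test checks for). This is an illustrative sketch, not the actual in-tree implementation; the helper name is made up.

// Illustrative sketch only (not the actual resolveDistributedTy from
// XeGPUSubgroupDistribute.cpp): reconcile a value whose type still carries
// the xegpu layout with the layout-free type expected outside the warp op by
// inserting a builtin.unrealized_conversion_cast and marking it.
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/PatternMatch.h"

static mlir::Value reconcileDistributedType(mlir::Value produced,
                                            mlir::Type expectedTy,
                                            mlir::PatternRewriter &rewriter) {
  if (produced.getType() == expectedTy)
    return produced; // Nothing to reconcile.
  auto cast = mlir::UnrealizedConversionCastOp::create(
      rewriter, produced.getLoc(), expectedTy, produced);
  // Marker attribute matched by the new test case below.
  cast->setAttr("resolve_simt_type_mismatch", rewriter.getUnitAttr());
  return cast.getResult(0);
}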

File tree

2 files changed (+45, -41 lines)


mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 22 additions & 40 deletions
@@ -277,22 +277,13 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
           descOp, "the tensor descriptor lacks layout attribute");
 
     SmallVector<size_t> newRetIndices;
-    SmallVector<Value> newYieldValues;
-    SmallVector<Type> newYieldTypes;
-
-    for (Value operand : descOp->getOperands()) {
-      newYieldValues.push_back(operand);
-      newYieldTypes.push_back(operand.getType());
-    }
     rewriter.setInsertionPoint(warpOp);
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, /* new yieled values = */ newYieldValues,
-        /* new yielded types = */ newYieldTypes, newRetIndices);
+        rewriter, warpOp, /* new yieled values = */ descOp->getOperands(),
+        /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
 
-    SmallVector<Value> newDescOperands;
-    for (size_t i : newRetIndices) {
-      newDescOperands.push_back(newWarpOp.getResult(i));
-    }
+    SmallVector<Value> newDescOperands = llvm::map_to_vector(
+        newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
     rewriter.setInsertionPointAfter(newWarpOp);
     xegpu::TensorDescType distributedTensorDescTy =
         descOp.getType().dropLayouts(); // Distributed tensor descriptor type
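
Besides yielding the operands directly, this hunk replaces a declare-then-push_back loop with llvm::map_to_vector (from llvm/ADT/SmallVectorExtras.h). A small standalone illustration of that idiom, with made-up container contents:

// Standalone illustration of the llvm::map_to_vector idiom used above; the
// values here are made up for the example.
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::SmallVector<int> results = {10, 20, 30, 40};
  llvm::SmallVector<size_t> retIndices = {1, 3};

  // Build the result vector by mapping each index through a lambda, instead
  // of declaring an empty vector and push_back-ing inside a loop.
  auto picked =
      llvm::map_to_vector(retIndices, [&](size_t i) { return results[i]; });

  for (int v : picked)
    llvm::outs() << v << "\n"; // prints 20, then 40
  return 0;
}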
@@ -696,39 +687,30 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
           warpOp, "warp result is not a xegpu::UpdateNdOffset op");
     auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
     unsigned operandIdx = operand->getOperandNumber();
-    // new update op does not have layout attribute.
-    xegpu::TensorDescType newTensorDescTy =
-        updateOp.getTensorDescType().dropLayouts();
 
-    SmallVector<Value, 3> newYieldValues;
-    SmallVector<Type, 3> newYieldTypes;
-    for (Value operand : updateOp->getOperands()) {
-      newYieldValues.push_back(operand);
-      if (isa<xegpu::TensorDescType>(operand.getType())) {
-        newYieldTypes.push_back(newTensorDescTy);
-      } else {
-        newYieldTypes.push_back(operand.getType());
-      }
-    }
     SmallVector<size_t> newRetIndices;
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
+        rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
+        newRetIndices);
     rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newUpdateOperands;
-    for (size_t i : newRetIndices) {
-      // For the tensor descriptor operand, the layout attribute is dropped
-      // after distribution. Types needs to be resolved in this case.
-      if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
-        newUpdateOperands.push_back(resolveDistributedTy(
-            newWarpOp.getResult(i), newTensorDescTy, rewriter));
-      } else {
-        newUpdateOperands.push_back(newWarpOp.getResult(i));
-      }
-    }
+    // new update op does not have layout attribute.
+    xegpu::TensorDescType distributedTensorDescTy =
+        updateOp.getTensorDescType().dropLayouts();
+    SmallVector<Value> newUpdateOperands =
+        llvm::map_to_vector(newRetIndices, [&](size_t i) {
+          // For the tensor descriptor operand, the layout attribute is
+          // dropped after distribution. Types needs to be resolved in this
+          // case.
+          if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
+            return resolveDistributedTy(newWarpOp.getResult(i),
+                                        distributedTensorDescTy, rewriter);
+          }
+          return newWarpOp.getResult(i);
+        });
     // Create a new update op outside the warp op.
     auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
-        rewriter, newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
-        updateOp->getAttrs());
+        rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
+        newUpdateOperands, updateOp->getAttrs());
     xegpu::removeLayoutAttrs(newUpdateOp);
     Value distributedVal = newWarpOp.getResult(operandIdx);
     // Resolve the distributed type with the original type.
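
The trailing context line ("Resolve the distributed type with the original type.") points at the opposite-direction fix-up: the op rebuilt outside the warp op produces the layout-dropped distributed type, while existing users of the warp op's result still expect its original type, so the rebuilt value is cast back before replacing uses. A hedged sketch of that step, reusing the hypothetical reconcileDistributedType helper from the sketch above (names are assumptions, not the in-tree API):

// Hypothetical tail of such a pattern: cast the rebuilt value back to the
// type the original warp-op result had, then redirect all uses to it.
#include "mlir/IR/PatternMatch.h"

static void replaceWithReconciled(mlir::Value originalWarpResult,
                                  mlir::Value rebuiltValue,
                                  mlir::PatternRewriter &rewriter) {
  mlir::Value typed = reconcileDistributedType(
      rebuiltValue, originalWarpResult.getType(), rewriter);
  rewriter.replaceAllUsesWith(originalWarpResult, typed);
}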

mlir/test/Dialect/XeGPU/subgroup-distribute.mlir

Lines changed: 23 additions & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -xegpu-subgroup-distribute -canonicalize -cse -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xegpu-subgroup-distribute -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: gpu.func @store_nd_1d
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
@@ -265,6 +265,28 @@ gpu.module @test {
   }
 }
 
+// -----
+// Explicitly check that update_nd_offset op's source retain layout when yielded from the warp op (PR150545)
+// CHECK-LABEL: gpu.func @check_update_nd_offset_distributed_tensor_desc
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] ->
+// CHECK-SAME: (!xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[T0:.*]] = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: gpu.yield %[[T0]] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: }
+// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]] :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf32> {resolve_simt_type_mismatch}
+// CHECK: xegpu.update_nd_offset %[[T1]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32>
+gpu.module @test {
+  gpu.func @check_update_nd_offset_distributed_tensor_desc() {
+    %c32 = arith.constant 32 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf32>
+    %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
 // -----
 // CHECK-LABEL: gpu.func @prefetch_1d
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
