Commit 519d02a

add cse for cleaning up
1 parent 08ade3f commit 519d02a

File tree: 2 files changed, +8 −75 lines

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 5 additions & 71 deletions
@@ -1008,6 +1008,11 @@ struct MoveFuncBodyToWarpExecuteOnLane0
     rewriter.setInsertionPointAfter(warpOp);
     rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
     rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+    // At this point, we have moved the entire function body inside the warpOp.
+    // Now move any scalar uniform code outside of the warpOp (like GPU index
+    // ops, scalar constants, etc.). This will simplify the later lowering and
+    // avoid custom patterns for these ops.
+    vector::moveScalarUniformCode(warpOp);
     return success();
   }
 };
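The intended effect of the new vector::moveScalarUniformCode call is sketched below; the surrounding IR shapes and names are illustrative assumptions, not output copied from the pass:

```
// Before hoisting: the whole original function body, including uniform
// scalars such as constants and GPU index ops, lives inside the warp op.
gpu.warp_execute_on_lane_0(%laneid)[16] {
  %c8 = arith.constant 8 : index
  %bid = gpu.block_id x
  %off = arith.muli %bid, %c8 : index
  ...
  gpu.yield
}

// After hoisting: the uniform scalar code sits outside the warp op, so the
// later distribution patterns never need to handle these ops specially.
%c8 = arith.constant 8 : index
%bid = gpu.block_id x
%off = arith.muli %bid, %c8 : index
gpu.warp_execute_on_lane_0(%laneid)[16] {
  ...
  gpu.yield
}
```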
@@ -1412,63 +1417,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
-/// Generic pattern for sinking a GPU index operations feeding into yield op
-/// of an enclosing `gpu.warp_execute_on_lane_0` region. The original index op
-/// becomes dead and an equivalent copy of the index op is created outside the
-/// warp op.
-/// Example:
-/// ```
-///   %r = gpu.warp_execute_on_lane_0(%laneid) -> (index) {
-///     ...
-///     %index = gpu.block_id x : index
-///     gpu.yield %index
-///   }
-///   ...
-/// ```
-/// To
-/// ```
-///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (index) {
-///     ...
-///     %dead = gpu.block_id x : index
-///     gpu.yield %dead
-///   }
-///   %0 = gpu.block_id x : index
-///   ...
-/// ```
-template <typename IndexOp>
-struct GpuIndexOpDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand = getWarpResult(subgroupOp, llvm::IsaPred<IndexOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(subgroupOp,
-                                         "warp result is not a gpu index op");
-    Operation *indexOp = operand->get().getDefiningOp<IndexOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-    SmallVector<Value, 3> newYieldValues;
-    SmallVector<Type, 3> newYieldTypes;
-    for (Value operand : indexOp->getOperands()) {
-      newYieldValues.push_back(operand);
-      newYieldTypes.push_back(operand.getType());
-    }
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newIndexOperands;
-    for (size_t i : newRetIndices) {
-      newIndexOperands.push_back(newWarpOp.getResult(i));
-    }
-    auto newIndexOp = rewriter.create<IndexOp>(
-        newWarpOp.getLoc(), newIndexOperands,
-        removeTemporaryLayoutAttributes(indexOp->getAttrs()));
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    rewriter.replaceAllUsesWith(distributedVal, newIndexOp);
-    return success();
-  }
-};
-
 } // namespace
 
 namespace {
@@ -1488,20 +1436,6 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
     RewritePatternSet &patterns) {
   patterns.add<CreateNdDescDistribution, StoreNdDistribution,
                LoadNdDistribution, DpasDistribution>(patterns.getContext());
-  // TODO: Is this the right place to add these patterns?
-  patterns.add<GpuIndexOpDistribution<gpu::BlockIdOp>,
-               GpuIndexOpDistribution<gpu::BlockDimOp>,
-               GpuIndexOpDistribution<gpu::SubgroupIdOp>,
-               GpuIndexOpDistribution<gpu::SubgroupSizeOp>,
-               GpuIndexOpDistribution<gpu::NumSubgroupsOp>,
-               GpuIndexOpDistribution<gpu::ClusterDimOp>,
-               GpuIndexOpDistribution<gpu::ClusterDimBlocksOp>,
-               GpuIndexOpDistribution<gpu::ClusterIdOp>,
-               GpuIndexOpDistribution<gpu::ClusterBlockIdOp>,
-               GpuIndexOpDistribution<gpu::GridDimOp>,
-               GpuIndexOpDistribution<gpu::ThreadIdOp>,
-               GpuIndexOpDistribution<gpu::LaneIdOp>,
-               GpuIndexOpDistribution<gpu::GlobalIdOp>>(patterns.getContext());
 }
 
 void XeGPUSubgroupDistributePass::runOnOperation() {

mlir/test/Dialect/XeGPU/subgroup-distribution.mlir

Lines changed: 3 additions & 4 deletions
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -xegpu-subgroup-distribute -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xegpu-subgroup-distribute -cse -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: gpu.func @store_nd_1d
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
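The -cse run added above folds pure operations that are identical to an earlier one. A minimal, hypothetical illustration of the kind of duplicate this cleans up after hoisting (ops and names chosen for illustration only):

```
// Before CSE: two identical, side-effect-free index ops.
%c8 = arith.constant 8 : index
%b0 = gpu.block_id x
%x  = arith.muli %b0, %c8 : index
%b1 = gpu.block_id x
%y  = arith.addi %b1, %c8 : index

// After CSE: the duplicate gpu.block_id is removed and both users share %b0.
%c8 = arith.constant 8 : index
%b0 = gpu.block_id x
%x  = arith.muli %b0, %c8 : index
%y  = arith.addi %b0, %c8 : index
```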
@@ -164,9 +164,9 @@ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64,
 // -----
 // CHECK-LABEL: gpu.func @gemm_loop
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
+// CHECK: %[[BLOCK_ID_X:.*]] = gpu.block_id x
 // CHECK: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
 // CHECK: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
-// CHECK: %[[BLOCK_ID_X:.*]] = gpu.block_id x
 // CHECK: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
 // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%[[X_COORD]], %[[Y_COORD]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
@@ -181,9 +181,8 @@ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64,
 // CHECK: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
 // CHECK: scf.yield %[[T16]] : vector<8x1xf32>
 // CHECK: }
-// CHECK: %[[T8:.*]] = xegpu.create_nd_tdesc %[[ARG2]]{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
-// CHECK: xegpu.store_nd %[[T9]], %[[T8]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
 gpu.module @test {
 gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
 %c0 = arith.constant 0 : index
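Read together, the updated CHECK lines say that CSE also deduplicates the C-matrix descriptor in this test: the second xegpu.create_nd_tdesc (previously captured as T8) disappears and the store reuses T2, created before the loop. A sketch of that effect, with value names mirroring the FileCheck captures and the accumulation loop elided:

```
// Before -cse: the descriptor for %arg2 is materialized twice.
%t2 = xegpu.create_nd_tdesc %arg2[%x_coord, %y_coord] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
%t3 = xegpu.load_nd %t2 : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
...
%t8 = xegpu.create_nd_tdesc %arg2[%x_coord, %y_coord] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %t9, %t8 : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>

// After -cse: the redundant descriptor is gone and the store uses %t2 directly.
xegpu.store_nd %t9, %t2 : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
```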
