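[mlir][xegpu] Use explicit types instead of `auto` and drop redundant `template` keywords in subgroup distribution patterns; remove `test_` prefix from test function names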

charithaintc · charithaintc · commit d5d2713d1370 · 2025-04-30T22:17:10.000Z
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1475,11 +1475,12 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
           subgroupOp, "warp result is not a xegpu::UpdateNdOffset op");
     auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
     unsigned operandIdx = operand->getOperandNumber();
-    auto newTensorDescTy = dropLayouts(updateOp.getTensorDescType());
+    xegpu::TensorDescType newTensorDescTy =
+        dropLayouts(updateOp.getTensorDescType());
 
     SmallVector<Value, 3> newYieldValues;
     SmallVector<Type, 3> newYieldTypes;
-    for (auto operand : updateOp->getOperands()) {
+    for (Value operand : updateOp->getOperands()) {
       newYieldValues.push_back(operand);
       if (isa<xegpu::TensorDescType>(operand.getType())) {
         newYieldTypes.push_back(newTensorDescTy);
@@ -1492,7 +1493,7 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
         rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
     rewriter.setInsertionPointAfter(newWarpOp);
     SmallVector<Value> newUpdateOperands;
-    for (auto i : newRetIndices) {
+    for (size_t i : newRetIndices) {
       if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
         newUpdateOperands.push_back(resolveDistributedTy(
             newWarpOp.getResult(i), newTensorDescTy, rewriter));
@@ -1519,7 +1520,7 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
     auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
     if (!prefetchOp)
       return failure();
-    auto layout = prefetchOp.getTensorDescType().getLayoutAttr();
+    xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
     if (!layout)
       return rewriter.notifyMatchFailure(
           prefetchOp, "the source tensor descriptor lacks layout attribute");
@@ -1530,7 +1531,8 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
         rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
 
-    auto newTensorDescTy = dropLayouts(prefetchOp.getTensorDescType());
+    xegpu::TensorDescType newTensorDescTy =
+        dropLayouts(prefetchOp.getTensorDescType());
     rewriter.setInsertionPointAfter(newWarpOp);
     SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
         newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
@@ -1570,12 +1572,12 @@ struct GpuIndexOpDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
                                 PatternRewriter &rewriter) const override {
-    auto operand = getWarpResult(subgroupOp, llvm::IsaPred<IndexOp>);
+    OpOperand *operand = getWarpResult(subgroupOp, llvm::IsaPred<IndexOp>);
     if (!operand)
       return rewriter.notifyMatchFailure(subgroupOp,
                                          "warp result is not a gpu index op");
-    auto indexOp = operand->template get().template getDefiningOp<IndexOp>();
-    unsigned operandIdx = operand->template getOperandNumber();
+    auto indexOp = operand->get().getDefiningOp<IndexOp>();
+    unsigned operandIdx = operand->getOperandNumber();
     SmallVector<Value, 3> newYieldValues;
     SmallVector<Type, 3> newYieldTypes;
     for (auto operand : indexOp->template getOperands()) {
@@ -1587,7 +1589,7 @@ struct GpuIndexOpDistribution final : public gpu::WarpDistributionPattern {
         rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
     rewriter.setInsertionPointAfter(newWarpOp);
     SmallVector<Value> newIndexOperands;
-    for (auto i : newRetIndices) {
+    for (size_t i : newRetIndices) {
       newIndexOperands.push_back(newWarpOp.getResult(i));
     }
     auto newIndexOp = rewriter.create<IndexOp>(
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -162,14 +162,14 @@ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64,
 }
 
 // -----
-// CHECK-LABEL: gpu.func @test_update_nd_offset_1d(
+// CHECK-LABEL: gpu.func @update_nd_offset_1d(
 // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
 // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
 // CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32>
 // CHECK: xegpu.store_nd %[[CST]], %[[T1]]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
 gpu.module @test {
-gpu.func @test_update_nd_offset_1d(%arg0: memref<256xf32>){
+gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>){
   %c0 = arith.constant 0 : index
   %c32 = arith.constant 32 : index
   %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
@@ -181,14 +181,14 @@ gpu.func @test_update_nd_offset_1d(%arg0: memref<256xf32>){
 }
 
 // -----
-// CHECK-LABEL: gpu.func @test_update_nd_offset_2d
+// CHECK-LABEL: gpu.func @update_nd_offset_2d
 // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
 // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
 // CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
 // CHECK: xegpu.store_nd %[[CST]], %[[T1]]  : vector<16xf32>, !xegpu.tensor_desc<16x16xf32>
 gpu.module @test {
-gpu.func @test_update_nd_offset_2d(%arg0: memref<256x256xf32>){
+gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
   %c0 = arith.constant 0 : index
   %c32 = arith.constant 32 : index
   %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32>
@@ -200,12 +200,12 @@ gpu.func @test_update_nd_offset_2d(%arg0: memref<256x256xf32>){
 }
 
 // -----
-// CHECK-LABEL: gpu.func @test_prefetch_2d
+// CHECK-LABEL: gpu.func @prefetch_2d
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
 // CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
 gpu.module @test {
-gpu.func @test_prefetch_2d(%arg0: memref<256x256xf16>){
+gpu.func @prefetch_2d(%arg0: memref<256x256xf16>){
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
   xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
@@ -214,12 +214,12 @@ gpu.func @test_prefetch_2d(%arg0: memref<256x256xf16>){
 }
 
 // -----
-// CHECK-LABEL: gpu.func @test_prefetch_1d
+// CHECK-LABEL: gpu.func @prefetch_1d
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
 // CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
 gpu.module @test {
-gpu.func @test_prefetch_1d(%arg0: memref<256xf16>){
+gpu.func @prefetch_1d(%arg0: memref<256xf16>){
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
   xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
@@ -229,7 +229,7 @@ gpu.func @test_prefetch_1d(%arg0: memref<256xf16>){
 
 
 // -----
-// CHECK-LABEL: gpu.func @test_gemm_loop
+// CHECK-LABEL: gpu.func @gemm_loop
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
 // CHECK: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
 // CHECK: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
@@ -252,7 +252,7 @@ gpu.func @test_prefetch_1d(%arg0: memref<256xf16>){
 // CHECK: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
 // CHECK: xegpu.store_nd %[[T9]], %[[T8]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
 gpu.module @test {
-gpu.func @test_gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
+gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
   %c0 = arith.constant 0 : index
   %c16 = arith.constant 16 : index
   %c8 = arith.constant 8 : index