diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 7c019e7d25bf2..8b5e950733a22 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -341,13 +341,18 @@ struct WarpOpToScfIfPattern : public WarpDistributionPattern {
 /// Return the distributed vector type based on the original type and the
 /// distribution map. The map is expected to have a dimension equal to the
 /// original type rank and should be a projection where the results are the
-/// distributed dimensions. The number of results should be equal to the number
+/// distributed dimensions. If the number of results is zero, there is no
+/// distribution (i.e., the original type is returned).
+/// Otherwise, the number of results should be equal to the number
 /// of warp sizes which is currently limited to 1.
 /// Example: For a vector<16x32x64> distributed with a map(d0, d1, d2) -> (d1)
 /// and a warp size of 16 would distribute the second dimension (associated to
 /// d1) and return vector<16x2x64>
 static VectorType getDistributedType(VectorType originalType, AffineMap map,
                                      int64_t warpSize) {
+  // If the map has zero results, return the original type.
+  if (map.getNumResults() == 0)
+    return originalType;
   SmallVector<int64_t> targetShape(originalType.getShape());
   for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
     unsigned position = map.getDimPosition(i);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 26770b3c003ea..bd48332f5ec7b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1505,14 +1505,19 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
       return AffineMap::get(val.getContext());
     // Get the layout of the vector type.
     xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
-    // If no layout is specified, assume the inner most dimension is distributed
-    // for now.
+    // If no layout is specified, that means no distribution.
     if (!layout)
-      return AffineMap::getMultiDimMapWithTargets(
-          vecRank, {static_cast<unsigned>(vecRank - 1)}, val.getContext());
+      return AffineMap::getMultiDimMapWithTargets(vecRank, {},
+                                                  val.getContext());
+    // Expecting vector and layout rank to match.
+    assert(layout.getRank() == vecRank &&
+           "Expecting vector and layout rank to match");
+    // A dimension is distributed only if layout suggests there are
+    // multiple lanes assigned for this dimension and the shape can be evenly
+    // distributed to those lanes.
     SmallVector<unsigned> distributedDims;
     for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
-      if (v > 1)
+      if (v > 1 && vecType.getShape()[i] % v == 0)
         distributedDims.push_back(i);
     }
     return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 0e1365aa64171..27a3dc373c739 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -214,3 +214,54 @@ gpu.module @xevm_module{
   }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result(
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index,
+// CHECK-SAME:   !xegpu.tensor_desc<16x16xf32, #xegpu.layout>,
+// CHECK-SAME:   memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
+// CHECK:        gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32>
+// CHECK:      }
+// CHECK: %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args
+// CHECK-SAME:   (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) {
+// CHECK:        %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
+// CHECK-SAME:     args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
+// CHECK:          gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32>
+// CHECK:        }
+// CHECK:        scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32>
+// CHECK:      }
+gpu.module @xevm_module{
+  gpu.func @warp_scf_for_unused_uniform_for_result(%arg0: index,
+    %arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout>,
+    %arg2: memref<16x16xf32>) {
+    %c128 = arith.constant 128 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %ini = "some_def"() {layout_result_0 = #xegpu.layout}
+      : () -> (vector<16x1xf32>)
+    %ini2 = "some_def"() {layout_result_0 = #xegpu.layout}
+      : () -> (vector<16x16xf32>)
+    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) {
+      %1 = "some_def"(%arg5)
+        {
+          layout_operand_0 = #xegpu.layout,
+          layout_result_0 = #xegpu.layout
+        }
+        : (vector<16x1xf32>) -> (vector<16x1xf32>)
+      %acc = "some_def"(%arg4, %1)
+        {
+          layout_operand_0 = #xegpu.layout,
+          layout_operand_1 = #xegpu.layout,
+          layout_result_0 = #xegpu.layout
+        }
+        : (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>)
+      scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32>
+    }
+    {
+      layout_result_0 = #xegpu.layout
+    }
+    xegpu.store_nd %3#0, %arg1[%c0, %c0]
+      : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout>
+    gpu.return
+  }
+}
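
For reference, below is a minimal standalone C++ sketch of the behavior the VectorDistribute.cpp hunk
documents: an empty set of distributed dimensions now means "no distribution" (the original shape is
returned unchanged); otherwise the single distributed dimension is divided by the warp size. The names
getDistributedShape and Shape, and the plain std::vector types, are made up for illustration, and the
model only handles the evenly divisible case; it is not the MLIR implementation itself.

#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in for a vector type's shape; the real code works on VectorType and AffineMap.
using Shape = std::vector<int64_t>;

Shape getDistributedShape(const Shape &originalShape,
                          const std::vector<unsigned> &distributedDims,
                          int64_t warpSize) {
  // Zero distributed dimensions: the value is uniform across the warp, so the
  // original shape is returned. This mirrors the new early return.
  if (distributedDims.empty())
    return originalShape;
  // The upstream comment notes the number of distributed dims is currently limited to 1.
  assert(distributedDims.size() == 1 && "only one distributed dim supported");
  Shape targetShape = originalShape;
  for (unsigned dim : distributedDims) {
    assert(targetShape[dim] % warpSize == 0 && "model only handles even splits");
    targetShape[dim] /= warpSize;
  }
  return targetShape;
}

int main() {
  // vector<16x32x64> distributed along d1 with warp size 16 -> vector<16x2x64>,
  // matching the example in the doc comment.
  Shape distributed = getDistributedShape({16, 32, 64}, {1}, 16);
  assert((distributed == Shape{16, 2, 64}));
  // No distributed dimensions -> the original shape comes back untouched.
  Shape uniform = getDistributedShape({16, 32, 64}, {}, 16);
  assert((uniform == Shape{16, 32, 64}));
  return 0;
}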
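
Similarly, a small sketch of the new rule in XeGPUSubgroupDistribute.cpp for choosing which dimensions
are distributed: a dimension qualifies only when the lane layout assigns more than one lane to it and
the vector shape divides evenly by that lane count; an empty result means the value is treated as
uniform (no distribution). The function name getDistributedDims and the std::vector-based signature
are hypothetical; the real code consults layout.getEffectiveLaneLayoutAsInt() and builds an AffineMap.

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<unsigned>
getDistributedDims(const std::vector<int64_t> &vectorShape,
                   const std::vector<int64_t> &laneLayout) {
  // Mirrors the new assert: vector rank and layout rank must match.
  assert(vectorShape.size() == laneLayout.size() &&
         "expecting vector and layout rank to match");
  std::vector<unsigned> distributedDims;
  for (unsigned i = 0, e = laneLayout.size(); i < e; ++i) {
    int64_t lanes = laneLayout[i];
    // Distribute a dimension only if more than one lane is assigned to it and
    // its size divides evenly across those lanes (v > 1 && shape[i] % v == 0).
    if (lanes > 1 && vectorShape[i] % lanes == 0)
      distributedDims.push_back(i);
  }
  return distributedDims;
}

int main() {
  // A 16x16 vector with lane layout [1, 16]: only d1 is distributed.
  assert((getDistributedDims({16, 16}, {1, 16}) == std::vector<unsigned>{1}));
  // A 16x1 vector with lane layout [1, 16]: 1 % 16 != 0, so no dimension is
  // distributed and the value is treated as uniform per lane.
  assert((getDistributedDims({16, 1}, {1, 16}).empty()));
  return 0;
}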