Skip to content

Commit e48ec72

Browse files
charithaintcaokblast
authored and committed
[mlir][vector][xegpu] Accept uniform values in getDistributedType (llvm#163887)
Uniform values should not be distributed during vector distribution. Example would be a reduction result where reduction happens across lanes. However, current `getDistributedType` does not accept a zero result affine map (i.e. no distributed dims) when describing the distributed dimensions. This result in null type being returned and crashing the vector distribution in some cases. An example case would be a `scf.for` op (about to be distributed) in which one of the for result is a uniform value and it does not have a user outside the warp op. This necessitates querying the `getDistributedType` to figure our the distributed type of this value.
1 parent 24b7107 commit e48ec72

File tree

3 files changed

+67
-6
lines changed

3 files changed

+67
-6
lines changed

mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,13 +341,18 @@ struct WarpOpToScfIfPattern : public WarpDistributionPattern {
341341
/// Return the distributed vector type based on the original type and the
342342
/// distribution map. The map is expected to have a dimension equal to the
343343
/// original type rank and should be a projection where the results are the
344-
/// distributed dimensions. The number of results should be equal to the number
344+
/// distributed dimensions. If the number of results is zero there is no
345+
/// distribution (i.e. original type is returned).
346+
/// Otherwise, the number of results should be equal to the number
345347
/// of warp sizes which is currently limited to 1.
346348
/// Example: For a vector<16x32x64> distributed with a map(d0, d1, d2) -> (d1)
347349
/// and a warp size of 16 would distribute the second dimension (associated to
348350
/// d1) and return vector<16x2x64>
349351
static VectorType getDistributedType(VectorType originalType, AffineMap map,
350352
int64_t warpSize) {
353+
// If the map has zero results, return the original type.
354+
if (map.getNumResults() == 0)
355+
return originalType;
351356
SmallVector<int64_t> targetShape(originalType.getShape());
352357
for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
353358
unsigned position = map.getDimPosition(i);

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1505,14 +1505,19 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
15051505
return AffineMap::get(val.getContext());
15061506
// Get the layout of the vector type.
15071507
xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
1508-
// If no layout is specified, assume the inner most dimension is distributed
1509-
// for now.
1508+
// If no layout is specified, that means no distribution.
15101509
if (!layout)
1511-
return AffineMap::getMultiDimMapWithTargets(
1512-
vecRank, {static_cast<unsigned int>(vecRank - 1)}, val.getContext());
1510+
return AffineMap::getMultiDimMapWithTargets(vecRank, {},
1511+
val.getContext());
1512+
// Expecting vector and layout rank to match.
1513+
assert(layout.getRank() == vecRank &&
1514+
"Expecting vector and layout rank to match");
1515+
// A dimension is distributed only if layout suggests there are
1516+
// multiple lanes assigned for this dimension and the shape can be evenly
1517+
// distributed to those lanes.
15131518
SmallVector<unsigned int> distributedDims;
15141519
for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
1515-
if (v > 1)
1520+
if (v > 1 && vecType.getShape()[i] % v == 0)
15161521
distributedDims.push_back(i);
15171522
}
15181523
return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,

mlir/test/Dialect/XeGPU/subgroup-distribute.mlir

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,54 @@ gpu.module @xevm_module{
214214

215215
}
216216
}
217+
218+
// -----
219+
// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result(
220+
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index,
221+
// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
222+
// CHECK-SAME: memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
223+
// CHECK: gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32>
224+
// CHECK: }
225+
// CHECK: %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args
226+
// CHECK-SAME: (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) {
227+
// CHECK: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
228+
// CHECK-SAME: args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
229+
// CHECK: gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32>
230+
// CHECK: }
231+
// CHECK: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32>
232+
// CHECK: }
233+
gpu.module @xevm_module{
234+
gpu.func @warp_scf_for_unused_uniform_for_result(%arg0: index,
235+
%arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
236+
%arg2: memref<16x16xf32>) {
237+
%c128 = arith.constant 128 : index
238+
%c1 = arith.constant 1 : index
239+
%c0 = arith.constant 0 : index
240+
%ini = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
241+
: () -> (vector<16x1xf32>)
242+
%ini2 = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
243+
: () -> (vector<16x16xf32>)
244+
%3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) {
245+
%1 = "some_def"(%arg5)
246+
{
247+
layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
248+
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
249+
}
250+
: (vector<16x1xf32>) -> (vector<16x1xf32>)
251+
%acc = "some_def"(%arg4, %1)
252+
{
253+
layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
254+
layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
255+
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
256+
}
257+
: (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>)
258+
scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32>
259+
}
260+
{
261+
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
262+
}
263+
xegpu.store_nd %3#0, %arg1[%c0, %c0]
264+
: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
265+
gpu.return
266+
}
267+
}

0 commit comments

Comments
 (0)