Skip to content

Commit 3ff8593

Browse files
committed
save work
1 parent 08c115d commit 3ff8593

File tree

3 files changed

+63
-2
lines changed

3 files changed

+63
-2
lines changed

mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,13 +341,18 @@ struct WarpOpToScfIfPattern : public WarpDistributionPattern {
341341
/// Return the distributed vector type based on the original type and the
342342
/// distribution map. The map is expected to have a dimension equal to the
343343
/// original type rank and should be a projection where the results are the
344-
/// distributed dimensions. The number of results should be equal to the number
344+
/// distributed dimensions. If the number of results is zero there is no
345+
/// distribution (i.e. original type is returned).
346+
/// Otherwise, the number of results should be equal to the number
345347
/// of warp sizes which is currently limited to 1.
346348
/// Example: For a vector<16x32x64> distributed with a map(d0, d1, d2) -> (d1)
347349
/// and a warp size of 16 would distribute the second dimension (associated to
348350
/// d1) and return vector<16x2x64>
349351
static VectorType getDistributedType(VectorType originalType, AffineMap map,
350352
int64_t warpSize) {
353+
// If the map has zero results, return the original type.
354+
if (map.getNumResults() == 0)
355+
return originalType;
351356
SmallVector<int64_t> targetShape(originalType.getShape());
352357
for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
353358
unsigned position = map.getDimPosition(i);

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1510,9 +1510,14 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
15101510
if (!layout)
15111511
return AffineMap::getMultiDimMapWithTargets(
15121512
vecRank, {static_cast<unsigned int>(vecRank - 1)}, val.getContext());
1513+
// Expecting vector and layout rank to match.
1514+
assert(layout.getRank() == vecRank &&
1515+
"Expecting vector and layout rank to match");
1516+
// A dimension is distributed if its layout value is > 1 and the dimension
1517+
// size is evenly divisible by the layout value.
15131518
SmallVector<unsigned int> distributedDims;
15141519
for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
1515-
if (v > 1)
1520+
if (v > 1 && vecType.getShape()[i] % v == 0)
15161521
distributedDims.push_back(i);
15171522
}
15181523
return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,

mlir/test/Dialect/XeGPU/subgroup-distribute.mlir

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,54 @@ gpu.module @xevm_module{
214214

215215
}
216216
}
217+
218+
// -----
219+
// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result(
220+
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index,
221+
// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
222+
// CHECK-SAME: memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
223+
// CHECK: gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32>
224+
// CHECK: }
225+
// CHECK: %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args
226+
// CHECK-SAME: (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) {
227+
// CHECK: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
228+
// CHECK-SAME: args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
229+
// CHECK: gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32>
230+
// CHECK: }
231+
// CHECK: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32>
232+
// CHECK: }
233+
gpu.module @xevm_module{
234+
gpu.func @warp_scf_for_unused_uniform_for_result(%arg0: index,
235+
%arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
236+
%arg2: memref<16x16xf32>) {
237+
%c128 = arith.constant 128 : index
238+
%c1 = arith.constant 1 : index
239+
%c0 = arith.constant 0 : index
240+
%ini = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
241+
: () -> (vector<16x1xf32>)
242+
%ini2 = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
243+
: () -> (vector<16x16xf32>)
244+
%3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) {
245+
%1 = "some_def"(%arg5)
246+
{
247+
layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
248+
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
249+
}
250+
: (vector<16x1xf32>) -> (vector<16x1xf32>)
251+
%acc = "some_def"(%arg4, %1)
252+
{
253+
layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
254+
layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
255+
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
256+
}
257+
: (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>)
258+
scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32>
259+
}
260+
{
261+
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
262+
}
263+
xegpu.store_nd %3#0, %arg1[%c0, %c0]
264+
: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
265+
gpu.return
266+
}
267+
}

0 commit comments

Comments
 (0)