-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[mlir][vector][xegpu] Accept uniform values in getDistributedType
#163887
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1510,9 +1510,14 @@ void XeGPUSubgroupDistributePass::runOnOperation() { | |
if (!layout) | ||
return AffineMap::getMultiDimMapWithTargets( | ||
vecRank, {static_cast<unsigned int>(vecRank - 1)}, val.getContext()); | ||
// Expecting vector and layout rank to match. | ||
assert(layout.getRank() == vecRank && | ||
"Expecting vector and layout rank to match"); | ||
// A dimension is distributed if its layout value is > 1 and the dimension | ||
|
||
// size is evenly divisible by the layout value. | ||
SmallVector<unsigned int> distributedDims; | ||
for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) { | ||
if (v > 1) | ||
if (v > 1 && vecType.getShape()[i] % v == 0) | ||
distributedDims.push_back(i); | ||
} | ||
return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -214,3 +214,54 @@ gpu.module @xevm_module{ | |
|
||
} | ||
} | ||
|
||
// ----- | ||
// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result( | ||
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index, | ||
// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, | ||
// CHECK-SAME: memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) { | ||
// CHECK: gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32> | ||
// CHECK: } | ||
// CHECK: %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args | ||
// CHECK-SAME: (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) { | ||
// CHECK: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] | ||
// CHECK-SAME: args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) { | ||
// CHECK: gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32> | ||
// CHECK: } | ||
// CHECK: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32> | ||
// CHECK: } | ||
gpu.module @xevm_module{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can a simple test be used to motivate the change? For example, form a vector of 2 scalars and extract a scalar back — what would the current distribution do for the vector of 2? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would not trigger the bug. If the vector of 2 is extracted, then it means it has a user. This bug triggers when, before sinking a region op, we don't know the distributed type of all the operands of that region op. It won't ever be triggered for a non-region op that has at least one user. |
||
gpu.func @warp_scf_for_unused_uniform_for_result(%arg0: index, | ||
%arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, | ||
%arg2: memref<16x16xf32>) { | ||
%c128 = arith.constant 128 : index | ||
%c1 = arith.constant 1 : index | ||
%c0 = arith.constant 0 : index | ||
%ini = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} | ||
: () -> (vector<16x1xf32>) | ||
%ini2 = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} | ||
: () -> (vector<16x16xf32>) | ||
%3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) { | ||
%1 = "some_def"(%arg5) | ||
{ | ||
layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, | ||
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> | ||
} | ||
: (vector<16x1xf32>) -> (vector<16x1xf32>) | ||
%acc = "some_def"(%arg4, %1) | ||
{ | ||
layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, | ||
layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, | ||
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> | ||
} | ||
: (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>) | ||
scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32> | ||
} | ||
{ | ||
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> | ||
} | ||
xegpu.store_nd %3#0, %arg1[%c0, %c0] | ||
: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> | ||
gpu.return | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If there is no layout assigned, the value should not be distributed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed.