Skip to content

Commit f345426

Browse files
[MLIR][AMDGPU] Support gpu::ShuffleMode::DOWN lowering in ROCDL
1 parent f8b4460 commit f345426

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,10 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
157157
// TODO: Use ds_swizzle for XOR when step/offsets are constants for better
158158
// perf.
159159
switch (op.getMode()) {
160+
case gpu::ShuffleMode::DOWN:
161+
dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
162+
adaptor.getOffset());
163+
break;
160164
case gpu::ShuffleMode::XOR:
161165
dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
162166
adaptor.getOffset());

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -600,7 +600,7 @@ gpu.module @test_module {
600600

601601
gpu.module @test_module {
602602
// CHECK-LABEL: func @gpu_shuffle()
603-
func.func @gpu_shuffle() -> (f32, f32) {
603+
func.func @gpu_shuffle() -> (f32, f32, f32) {
604604
// CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
605605
%arg0 = arith.constant 1.0 : f32
606606
// CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32
@@ -634,7 +634,21 @@ gpu.module @test_module {
634634
// CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
635635
// CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
636636
%shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
637-
func.return %shfl, %shfli : f32, f32
637+
// CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
638+
// CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
639+
// CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32
640+
// CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
641+
// CHECK: %[[#WARP_OR_ZERO:]] = llvm.and %[[#ADD]], %[[#NEG_WIDTH]] : i32
642+
// CHECK: %[[#DOWN:]] = llvm.add %[[#LANE_ID]], %{{.*}} : i32
643+
// CHECK: %[[#CMP:]] = llvm.icmp "slt" %[[#DOWN]], %[[#WARP_OR_ZERO]] : i32
644+
// CHECK: %[[#DST_LANE:]] = llvm.select %[[#CMP]], %[[#DOWN]], %{{.*}} : i1, i32
645+
// CHECK: %[[#TWO:]] = llvm.mlir.constant(2 : i32) : i32
646+
// CHECK: %[[#ALIGNED_DST_LANE:]] = llvm.shl %[[#DST_LANE]], %[[#TWO]] : i32
647+
// CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
648+
// CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
649+
// CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
650+
%shfld, %predd = gpu.shuffle down %arg0, %arg1, %arg2 : f32
651+
func.return %shfl, %shfli, %shfld : f32, f32, f32
638652
}
639653
}
640654

0 commit comments

Comments
 (0)