-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Use s_cmovk_i32 instead of s_cselect_b32 when applicable #135232
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Ryan Buchner (bababuck) Changes: Notes:
That without my changes gets lowered to: But with the changes and hint gets lowered to: and eventually: I'm not entirely sure how to efficiently create a test like this. Any ideas would be appreciated. Addresses #129984. @jayfoad Full diff: https://github.com/llvm/llvm-project/pull/135232.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 73343e1c80f33..4ec0ed5836fae 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -883,6 +883,50 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
}
}
+ // Try to use S_MOVK_I32
+ if (MI.getOpcode() == AMDGPU::S_CSELECT_B32) {
+ const MachineOperand *Dest = &MI.getOperand(0);
+ MachineOperand *Src0 = &MI.getOperand(1);
+ MachineOperand *Src1 = &MI.getOperand(2);
+ // Must be exactly one Immediate
+ if (!(Src0->isReg() ^ Src1->isReg()))
+ continue;
+
+ bool swapped = false;
+ // Don't actually swap the MachineOperands yet
+ // Could do it now, but don't want to since will modify generated
+ // program even in cases where we don't insert a S_CMOVK_I32
+ if (!Src0->isReg() && Src1->isReg()) {
+ swapped = true;
+ std::swap(Src0, Src1);
+ }
+
+ if (!(Src1->isImm() && isKImmOperand(*Src1)))
+ continue;
+
+ // Hint that the source and destination register should be allocated
+ // as the same register so that we can shrink to S_CMOVK_I32 on the
+ // post-allocation SIShrinkInstructions pass.
+ if (Dest->getReg().isVirtual()) {
+ MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
+ MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
+ continue;
+ }
+
+ if (Src0->getReg() != Dest->getReg())
+ continue;
+
+ // Actually swap the operands in the MachineInst now that we know we
+ // are going through with the shrink
+ if (swapped) {
+ if (!TII->commuteInstruction(MI, false, 1, 2))
+ continue;
+ }
+
+ MI.setDesc(TII->get(AMDGPU::S_CMOVK_I32));
+ MI.removeOperand(1);
+ }
+
// Try to use S_ADDK_I32 and S_MULK_I32.
if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
MI.getOpcode() == AMDGPU::S_MUL_I32) {
diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index 2c2855c860ebb..27ab9b88e23bd 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -58,10 +58,10 @@ entry:
; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
; GFX7 v_cmp_ne_u32
-; GFX7: s_cselect_b32
+; GFX7: s_cmovk_i32
; GFX8: s_cmp_lg_u32
; GFX8-NOT: v_cmp_ne_u32
-; GFX8: s_cselect_b32
+; GFX8: s_cmovk_i32
define amdgpu_kernel void @null_32bit_lds_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds) nounwind {
%cmp = icmp ne ptr addrspace(3) %lds, null
%x = select i1 %cmp, i32 123, i32 456
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 7fdc012d4f1b5..40ff19404e5f1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -396,7 +396,7 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s0, 0
; GCN-NEXT: s_movk_i32 s0, 0x80
-; GCN-NEXT: s_cselect_b32 s0, s0, 0x83
+; GCN-NEXT: s_cmovk_i32 s0, 0x83
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_short v[0:1], v0
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index f851548ec5f21..7487e7d8c1180 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -242,7 +242,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; VI-SAFE-GISEL-NEXT: s_cmovk_i32 s2, 0x7c00
; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; VI-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
@@ -367,7 +367,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX10-SAFE-GISEL-NEXT: s_cmovk_i32 s2, 0x7c00
; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
@@ -502,7 +502,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-SAFE-GISEL-NEXT: s_cmovk_i32 s2, 0x7c00
; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 0c8dbe865a872..f75982caab7f5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -1371,7 +1371,7 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: s_cmp_eq_u32 s4, 0
; GFX678-NEXT: s_movk_i32 s34, 0xa5
-; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa50
+; GFX678-NEXT: s_cmovk_i32 s34, 0xa50
; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
@@ -1380,7 +1380,7 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_movk_i32 s34, 0xa5
-; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa50
+; GFX9-NEXT: s_cmovk_i32 s34, 0xa50
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1389,7 +1389,7 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_movk_i32 s34, 0xa5
-; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa50
+; GFX10-NEXT: s_cmovk_i32 s34, 0xa50
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1398,7 +1398,7 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_movk_i32 s0, 0xa5
-; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa50
+; GFX11-NEXT: s_cmovk_i32 s0, 0xa50
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
@@ -1413,7 +1413,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: s_cmp_eq_u32 s4, 0
; GFX678-NEXT: s_movk_i32 s34, 0xa50
-; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa5
+; GFX678-NEXT: s_cmovk_i32 s34, 0xa5
; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
@@ -1422,7 +1422,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_movk_i32 s34, 0xa50
-; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa5
+; GFX9-NEXT: s_cmovk_i32 s34, 0xa5
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1431,7 +1431,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_movk_i32 s34, 0xa50
-; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa5
+; GFX10-NEXT: s_cmovk_i32 s34, 0xa5
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1440,7 +1440,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_movk_i32 s0, 0xa50
-; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa5
+; GFX11-NEXT: s_cmovk_i32 s0, 0xa5
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
|
1fc8f32 to
0516e48
Compare
|
I updated this to the most recent
I split up commits 2 and 3 to demonstrate the impact of the hinting change, namely in the new test I added. @jayfoad @arsenm would it be possible for this to get assigned to the appropriate people? |
| @@ -0,0 +1,35 @@ | |||
| # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | |||
| # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-post-ra-bundler -o - %s | FileCheck -check-prefix=GCN %s | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-post-ra-bundler -o - %s | FileCheck -check-prefix=GCN %s | |
| # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -start-before=si-shrink-instructions -stop-before=si-post-ra-bundler -o - %s | FileCheck -check-prefix=GCN %s |
| $scc = IMPLICIT_DEF | ||
| %1:sgpr_32 = S_CSELECT_B32 %0, 31744, implicit $scc | ||
| S_ENDPGM 0, implicit %1 | ||
| ... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also the b64 versions, and tests with exotic operand types
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I expanded the tests to include b64. For operand types, an assertion fired when I tried to use a floating point operand, but I did add a case where the second operand is a register and one where the second operand is an immediate but not a K-Immediate.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't use FP immediate, only ever use isImm
| std::swap(Src0, Src1); | ||
| } | ||
|
|
||
| if (!(Src1->isImm() && isKImmOperand(*Src1))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
demorgan condition, also should work with non-register operand
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
demorgan condition
Changed.
also should work with non-register operand
A little confused by this. My understanding is that the transformation requires RD = S_CSELECT_B32 SRC0, SRC1 where RD == SRC0 and SRC1 is a K-Immediate. It's a little unclear to me which operand you are suggesting could be a non-register operand. If SRC0 is an immediate, then using S_CMOVK_I32 would require a 2-instruction sequence:
S_MOV_B32 $sgpr0, 0x1234
S_CMOVK $sgpr0, 0x5678, implicit $scc
as opposed to:
S_CSELECT $sgpr0, 0x1234, 0x5678, implicit $scc
| } | ||
| } | ||
|
|
||
| // Try to use S_MOVK_I32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment doesn't match
| if (!(Src0->isReg() ^ Src1->isReg())) | ||
| continue; | ||
|
|
||
| bool swapped = false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Capitalize
|
Made changes for the above comments. I left a couple responses to the comments above as there was some wording I didn't quite understand. |
jayfoad
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can't just commute the select since that changes the meaning of the instruction! You would have to find the cmp instruction that defines scc and invert the condition, which is probably not worth doing just to save a few bytes in the encoding.
| MachineOperand *Src0 = &MI.getOperand(1); | ||
| MachineOperand *Src1 = &MI.getOperand(2); | ||
| // Must be exactly one Immediate | ||
| if (!(Src0->isReg() ^ Src1->isReg())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (!(Src0->isReg() ^ Src1->isReg())) | |
| if (Src0->isReg() == Src1->isReg()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't consider that. We would also have to commute any other instructions dependent on the compare.
I removed the commute logic for now, I can open a separate MR later to address the commutability. There is one test case in the lit tests that no longer is optimized after removing the commute from this MR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The commutability should probably have been addressed during dag combine
Partially addresses llvm#129984.
… assigned the same physical register. Addresses llvm#129984.
Notes:
Similar code flow to
S_ADDK_I32, but modified so that the program won't change in situations where we won't shrink to an `s_cmovk_i32`; the `s_addk_i32` logic switches around the order of operands of many `s_add_i32` instructions that it doesn't end up shrinking to `s_addk_i32`. There's nothing wrong with this, I just wanted to minimize the amount of test changes for this PR. For testing, I did not add any new tests as the logic seems to be exercised adequately by previous tests. One shortcoming is that none of the existing tests highlight the register allocation hints. To do this, I would need to generate a case such as:
That without my changes gets lowered to:
But with the changes and hint gets lowered to:
and eventually:
I'm not entirely sure how to efficiently create a test like this. Any ideas would be appreciated.
Addresses part of #129984.
@jayfoad