Skip to content

Commit 3c979ae

Browse files
committed
[AMDGPU][GlobalISel] Remove redundant cmp when copying constant to vcc
Differential Revision: https://reviews.llvm.org/D95540
1 parent 4b42270 commit 3c979ae

File tree

5 files changed

+109
-22
lines changed

5 files changed

+109
-22
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -136,20 +136,29 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
136136
const TargetRegisterClass *SrcRC
137137
= TRI.getConstrainedRegClassForOperand(Src, *MRI);
138138

139-
Register MaskedReg = MRI->createVirtualRegister(SrcRC);
139+
Optional<ValueAndVReg> ConstVal =
140+
getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
141+
if (ConstVal) {
142+
unsigned MovOpc =
143+
STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
144+
BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
145+
.addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
146+
} else {
147+
Register MaskedReg = MRI->createVirtualRegister(SrcRC);
140148

141-
// We can't trust the high bits at this point, so clear them.
149+
// We can't trust the high bits at this point, so clear them.
142150

143-
// TODO: Skip masking high bits if def is known boolean.
151+
// TODO: Skip masking high bits if def is known boolean.
144152

145-
unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
146-
AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
147-
BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
148-
.addImm(1)
149-
.addReg(SrcReg);
150-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
151-
.addImm(0)
152-
.addReg(MaskedReg);
153+
unsigned AndOpc =
154+
TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
155+
BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
156+
.addImm(1)
157+
.addReg(SrcReg);
158+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
159+
.addImm(0)
160+
.addReg(MaskedReg);
161+
}
153162

154163
if (!MRI->getRegClassOrNull(SrcReg))
155164
MRI->setRegClass(SrcReg, SrcRC);

llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) {
219219
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
220220
; CHECK-NEXT: v_add_u32_e32 v1, 1, v1
221221
; CHECK-NEXT: v_cmp_le_i32_e32 vcc, 0, v1
222-
; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1
222+
; CHECK-NEXT: s_mov_b64 s[2:3], -1
223223
; CHECK-NEXT: s_cbranch_vccnz BB5_1
224224
; CHECK-NEXT: ; %bb.3: ; %bb4
225225
; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,82 @@ body: |
321321
S_ENDPGM 0, implicit %0
322322
323323
...
324+
325+
---
326+
327+
name: copy_s1_to_vcc
328+
legalized: true
329+
regBankSelected: true
330+
tracksRegLiveness: true
331+
body: |
332+
bb.0:
333+
liveins: $sgpr0_sgpr1
334+
335+
; WAVE64-LABEL: name: copy_s1_to_vcc
336+
; WAVE64: liveins: $sgpr0_sgpr1
337+
; WAVE64: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
338+
; WAVE64: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
339+
; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY1]], implicit-def $scc
340+
; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
341+
; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NE_U32_e64_]]
342+
; WAVE32-LABEL: name: copy_s1_to_vcc
343+
; WAVE32: liveins: $sgpr0_sgpr1
344+
; WAVE32: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
345+
; WAVE32: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
346+
; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY1]], implicit-def $scc
347+
; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
348+
; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NE_U32_e64_]]
349+
%0:sgpr(s64) = COPY $sgpr0_sgpr1
350+
%1:sgpr(s1) = G_TRUNC %0(s64)
351+
%2:vcc(s1) = COPY %1(s1)
352+
S_ENDPGM 0, implicit %2
353+
354+
...
355+
356+
---
357+
358+
name: copy_s1_false_to_vcc
359+
legalized: true
360+
regBankSelected: true
361+
tracksRegLiveness: true
362+
body: |
363+
bb.0:
364+
liveins: $sgpr0
365+
366+
; WAVE64-LABEL: name: copy_s1_false_to_vcc
367+
; WAVE64: liveins: $sgpr0
368+
; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
369+
; WAVE64: S_ENDPGM 0, implicit [[S_MOV_B64_]]
370+
; WAVE32-LABEL: name: copy_s1_false_to_vcc
371+
; WAVE32: liveins: $sgpr0
372+
; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
373+
; WAVE32: S_ENDPGM 0, implicit [[S_MOV_B32_]]
374+
%0:sgpr(s1) = G_CONSTANT i1 false
375+
%1:vcc(s1) = COPY %0(s1)
376+
S_ENDPGM 0, implicit %1
377+
378+
...
379+
380+
---
381+
382+
name: copy_s1_true_to_vcc
383+
legalized: true
384+
regBankSelected: true
385+
tracksRegLiveness: true
386+
body: |
387+
bb.0:
388+
liveins: $sgpr0
389+
390+
; WAVE64-LABEL: name: copy_s1_true_to_vcc
391+
; WAVE64: liveins: $sgpr0
392+
; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
393+
; WAVE64: S_ENDPGM 0, implicit [[S_MOV_B64_]]
394+
; WAVE32-LABEL: name: copy_s1_true_to_vcc
395+
; WAVE32: liveins: $sgpr0
396+
; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
397+
; WAVE32: S_ENDPGM 0, implicit [[S_MOV_B32_]]
398+
%0:sgpr(s1) = G_CONSTANT i1 true
399+
%1:vcc(s1) = COPY %0(s1)
400+
S_ENDPGM 0, implicit %1
401+
402+
...

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
672672
; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13
673673
; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c
674674
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x25
675-
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0
675+
; GFX7-NEXT: s_mov_b64 vcc, 0
676676
; GFX7-NEXT: s_mov_b32 s6, -1
677677
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
678678
; GFX7-NEXT: v_mov_b32_e32 v0, s2
@@ -688,7 +688,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
688688
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
689689
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70
690690
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94
691-
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0
691+
; GFX8-NEXT: s_mov_b64 vcc, 0
692692
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
693693
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
694694
; GFX8-NEXT: v_mov_b32_e32 v0, s2
@@ -707,7 +707,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
707707
; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94
708708
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c
709709
; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
710-
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 0
710+
; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
711711
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
712712
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5
713713
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6
@@ -723,7 +723,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
723723
; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94
724724
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c
725725
; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
726-
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0
726+
; GFX10_W64-NEXT: s_mov_b64 vcc, 0
727727
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
728728
; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5
729729
; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6
@@ -743,7 +743,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
743743
; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13
744744
; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c
745745
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x25
746-
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1
746+
; GFX7-NEXT: s_mov_b64 vcc, -1
747747
; GFX7-NEXT: s_mov_b32 s6, -1
748748
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
749749
; GFX7-NEXT: v_mov_b32_e32 v0, s2
@@ -759,7 +759,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
759759
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
760760
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70
761761
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94
762-
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1
762+
; GFX8-NEXT: s_mov_b64 vcc, -1
763763
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
764764
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
765765
; GFX8-NEXT: v_mov_b32_e32 v0, s2
@@ -778,7 +778,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
778778
; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94
779779
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c
780780
; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
781-
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 1
781+
; GFX10_W32-NEXT: s_mov_b32 vcc_lo, -1
782782
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
783783
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5
784784
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6
@@ -794,7 +794,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
794794
; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94
795795
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c
796796
; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
797-
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1
797+
; GFX10_W64-NEXT: s_mov_b64 vcc, -1
798798
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
799799
; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5
800800
; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6

llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,7 @@ define void @localize_internal_globals(i1 %cond) {
170170
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171171
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
172172
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
173-
; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1
174-
; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
173+
; GFX9-NEXT: s_xor_b64 s[4:5], vcc, -1
175174
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
176175
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
177176
; GFX9-NEXT: s_cbranch_execz BB2_2

0 commit comments

Comments
 (0)