Skip to content

Commit b4c658e

Browse files
committed
AMDGPU: Use v_mov_b32 to implement divergent zext i32->i64
Some cases rely on SIFixSGPRCopies to force a VALU reg_sequence that has SGPR inputs to use all VGPR inputs, but this does not always happen when the reg_sequence is otherwise valid. Make sure we use a VGPR up front here so we don't rely on something later to fix it up.
1 parent 6214dcc commit b4c658e

21 files changed

+1225
-1237
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2941,10 +2941,15 @@ def : GCNPat <
29412941
>;
29422942

29432943
def : GCNPat <
2944-
(i64 (zext i32:$src)),
2944+
(i64 (UniformUnaryFrag<zext> i32:$src)),
29452945
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
29462946
>;
29472947

2948+
def : GCNPat <
2949+
(i64 (zext i32:$src)),
2950+
(REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1)
2951+
>;
2952+
29482953
def : GCNPat <
29492954
(i64 (anyext i32:$src)),
29502955
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2549,17 +2549,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
25492549
;
25502550
; GFX1164_ITERATIVE-LABEL: add_i64_varying:
25512551
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
2552-
; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
2553-
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
2552+
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
2553+
; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
25542554
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
25552555
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
25562556
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
25572557
; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
25582558
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
25592559
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
2560-
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2561-
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
2562-
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
2560+
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
2561+
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2
2562+
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s2
25632563
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
25642564
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
25652565
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
@@ -2606,16 +2606,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
26062606
;
26072607
; GFX1132_ITERATIVE-LABEL: add_i64_varying:
26082608
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
2609-
; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
2609+
; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
26102610
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo
26112611
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
26122612
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
26132613
; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
26142614
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
26152615
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
26162616
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
2617-
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
2618-
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
2617+
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1
2618+
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1
26192619
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
26202620
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
26212621
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
@@ -2659,17 +2659,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
26592659
;
26602660
; GFX1264_ITERATIVE-LABEL: add_i64_varying:
26612661
; GFX1264_ITERATIVE: ; %bb.0: ; %entry
2662-
; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
2663-
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
2662+
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
2663+
; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
26642664
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
26652665
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
26662666
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
26672667
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
26682668
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
26692669
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
26702670
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
2671-
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
2672-
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
2671+
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8
2672+
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8
26732673
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
26742674
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
26752675
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -2714,7 +2714,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
27142714
;
27152715
; GFX1232_ITERATIVE-LABEL: add_i64_varying:
27162716
; GFX1232_ITERATIVE: ; %bb.0: ; %entry
2717-
; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
2717+
; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
27182718
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo
27192719
; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
27202720
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2723,8 +2723,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
27232723
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
27242724
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
27252725
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
2726-
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
2727-
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
2726+
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1
2727+
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1
27282728
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
27292729
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
27302730
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
@@ -6930,15 +6930,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
69306930
;
69316931
; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
69326932
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
6933-
; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
69346933
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
6934+
; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
69356935
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
69366936
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[8:9], 0
69376937
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr4_vgpr5
69386938
; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
69396939
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
69406940
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
6941-
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
6941+
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
69426942
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
69436943
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
69446944
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
@@ -7087,17 +7087,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
70877087
;
70887088
; GFX1264_ITERATIVE-LABEL: sub_i64_varying:
70897089
; GFX1264_ITERATIVE: ; %bb.0: ; %entry
7090-
; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
7091-
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
7090+
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
7091+
; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
70927092
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
70937093
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
70947094
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
70957095
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
70967096
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
70977097
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
70987098
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
7099-
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
7100-
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
7099+
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8
7100+
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8
71017101
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
71027102
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
71037103
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -7142,7 +7142,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
71427142
;
71437143
; GFX1232_ITERATIVE-LABEL: sub_i64_varying:
71447144
; GFX1232_ITERATIVE: ; %bb.0: ; %entry
7145-
; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
7145+
; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
71467146
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo
71477147
; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
71487148
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -7151,8 +7151,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
71517151
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
71527152
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
71537153
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
7154-
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
7155-
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
7154+
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1
7155+
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1
71567156
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
71577157
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
71587158
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1

0 commit comments

Comments
 (0)