@@ -1118,8 +1118,8 @@ tail:
11181118; Since functions that contain amdgcn.init.whole.wave do not preserve the inactive
11191119; lanes of any VGPRs, the middle end will explicitly preserve them if needed by adding
11201120; dummy VGPR arguments. Since only the inactive lanes are important, we need to make
1121- ; it clear to the backend that it's safe to allocate v9 inside shader.
1122- ; FIXME: Using poison is not clear enough .
1121+ ; it clear to the backend that it's safe to allocate v9's active lanes inside
1122+ ; shader. This is achieved by using the llvm.amdgcn.dead intrinsic.
11231123define amdgpu_cs_chain void @with_inactive_vgprs (ptr inreg %callee , i32 inreg %exec , i32 inreg %sgpr , i32 %active.vgpr , i32 %inactive.vgpr ) {
11241124; GISEL12-LABEL: with_inactive_vgprs:
11251125; GISEL12: ; %bb.0: ; %entry
@@ -1136,14 +1136,15 @@ define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %e
11361136; GISEL12-NEXT: s_and_saveexec_b32 s1, s6
11371137; GISEL12-NEXT: s_cbranch_execz .LBB6_2
11381138; GISEL12-NEXT: ; %bb.1: ; %shader
1139- ; GISEL12-NEXT: v_dual_mov_b32 v11 , s5 :: v_dual_mov_b32 v10 , s4
1140- ; GISEL12-NEXT: flat_load_b32 v12 , v[10:11 ]
1139+ ; GISEL12-NEXT: v_dual_mov_b32 v10 , s5 :: v_dual_mov_b32 v9 , s4
1140+ ; GISEL12-NEXT: flat_load_b32 v11 , v[9:10 ]
11411141; GISEL12-NEXT: ;;#ASMSTART
11421142; GISEL12-NEXT: ; use v0-7
11431143; GISEL12-NEXT: ;;#ASMEND
11441144; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
1145- ; GISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v12
1146- ; GISEL12-NEXT: flat_store_b32 v[10:11], v12
1145+ ; GISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
1146+ ; GISEL12-NEXT: flat_store_b32 v[9:10], v11
1147+ ; GISEL12-NEXT: ; implicit-def: $vgpr9
11471148; GISEL12-NEXT: .LBB6_2: ; %tail.block
11481149; GISEL12-NEXT: s_wait_alu 0xfffe
11491150; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s1
@@ -1165,14 +1166,15 @@ define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %e
11651166; DAGISEL12-NEXT: s_and_saveexec_b32 s0, s6
11661167; DAGISEL12-NEXT: s_cbranch_execz .LBB6_2
11671168; DAGISEL12-NEXT: ; %bb.1: ; %shader
1168- ; DAGISEL12-NEXT: v_dual_mov_b32 v11 , s5 :: v_dual_mov_b32 v10 , s4
1169- ; DAGISEL12-NEXT: flat_load_b32 v12 , v[10:11 ]
1169+ ; DAGISEL12-NEXT: v_dual_mov_b32 v10 , s5 :: v_dual_mov_b32 v9 , s4
1170+ ; DAGISEL12-NEXT: flat_load_b32 v11 , v[9:10 ]
11701171; DAGISEL12-NEXT: ;;#ASMSTART
11711172; DAGISEL12-NEXT: ; use v0-7
11721173; DAGISEL12-NEXT: ;;#ASMEND
11731174; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
1174- ; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v12
1175- ; DAGISEL12-NEXT: flat_store_b32 v[10:11], v12
1175+ ; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
1176+ ; DAGISEL12-NEXT: flat_store_b32 v[9:10], v11
1177+ ; DAGISEL12-NEXT: ; implicit-def: $vgpr9
11761178; DAGISEL12-NEXT: .LBB6_2: ; %tail.block
11771179; DAGISEL12-NEXT: s_wait_alu 0xfffe
11781180; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -1191,15 +1193,16 @@ define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %e
11911193; GISEL10-NEXT: s_and_saveexec_b32 s1, s6
11921194; GISEL10-NEXT: s_cbranch_execz .LBB6_2
11931195; GISEL10-NEXT: ; %bb.1: ; %shader
1194- ; GISEL10-NEXT: v_mov_b32_e32 v11 , s5
1195- ; GISEL10-NEXT: v_mov_b32_e32 v10 , s4
1196- ; GISEL10-NEXT: flat_load_dword v12 , v[10:11 ]
1196+ ; GISEL10-NEXT: v_mov_b32_e32 v10 , s5
1197+ ; GISEL10-NEXT: v_mov_b32_e32 v9 , s4
1198+ ; GISEL10-NEXT: flat_load_dword v11 , v[9:10 ]
11971199; GISEL10-NEXT: ;;#ASMSTART
11981200; GISEL10-NEXT: ; use v0-7
11991201; GISEL10-NEXT: ;;#ASMEND
12001202; GISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1201- ; GISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v12
1202- ; GISEL10-NEXT: flat_store_dword v[10:11], v12
1203+ ; GISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
1204+ ; GISEL10-NEXT: flat_store_dword v[9:10], v11
1205+ ; GISEL10-NEXT: ; implicit-def: $vgpr9
12031206; GISEL10-NEXT: .LBB6_2: ; %tail.block
12041207; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s1
12051208; GISEL10-NEXT: s_mov_b32 exec_lo, s2
@@ -1214,21 +1217,23 @@ define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %e
12141217; DAGISEL10-NEXT: s_and_saveexec_b32 s0, s6
12151218; DAGISEL10-NEXT: s_cbranch_execz .LBB6_2
12161219; DAGISEL10-NEXT: ; %bb.1: ; %shader
1217- ; DAGISEL10-NEXT: v_mov_b32_e32 v11 , s5
1218- ; DAGISEL10-NEXT: v_mov_b32_e32 v10 , s4
1219- ; DAGISEL10-NEXT: flat_load_dword v12 , v[10:11 ]
1220+ ; DAGISEL10-NEXT: v_mov_b32_e32 v10 , s5
1221+ ; DAGISEL10-NEXT: v_mov_b32_e32 v9 , s4
1222+ ; DAGISEL10-NEXT: flat_load_dword v11 , v[9:10 ]
12201223; DAGISEL10-NEXT: ;;#ASMSTART
12211224; DAGISEL10-NEXT: ; use v0-7
12221225; DAGISEL10-NEXT: ;;#ASMEND
12231226; DAGISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1224- ; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v12
1225- ; DAGISEL10-NEXT: flat_store_dword v[10:11], v12
1227+ ; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
1228+ ; DAGISEL10-NEXT: flat_store_dword v[9:10], v11
1229+ ; DAGISEL10-NEXT: ; implicit-def: $vgpr9
12261230; DAGISEL10-NEXT: .LBB6_2: ; %tail.block
12271231; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s0
12281232; DAGISEL10-NEXT: s_mov_b32 s0, s3
12291233; DAGISEL10-NEXT: s_mov_b32 exec_lo, s2
12301234; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
12311235entry:
1236+ %0 = call i32 @llvm.amdgcn.dead ()
12321237 %1 = call i1 @llvm.amdgcn.init.whole.wave ()
12331238 br i1 %1 , label %shader , label %tail.block
12341239
@@ -1241,7 +1246,7 @@ shader: ; preds = %entry
12411246
12421247tail .block: ; preds = %.exit27, %.exit49, %244, %243, %entry
12431248 %active.vgpr.arg = phi i32 [ %active.vgpr , %entry ], [ %active.vgpr.new , %shader ]
1244- %inactive.vgpr.arg = phi i32 [ %inactive.vgpr , %entry ], [ poison , %shader ]
1249+ %inactive.vgpr.arg = phi i32 [ %inactive.vgpr , %entry ], [ %0 , %shader ]
12451250 %vgprs.0 = insertvalue { i32 , i32 } poison, i32 %active.vgpr.arg , 0
12461251 %vgprs = insertvalue { i32 , i32 } %vgprs.0 , i32 %inactive.vgpr.arg , 1
12471252 call void (ptr , i32 , i32 , { i32 , i32 }, i32 , ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32 (ptr inreg %callee , i32 inreg %exec , i32 inreg %sgpr , { i32 , i32 } %vgprs , i32 0 )
0 commit comments