@@ -1115,4 +1115,141 @@ tail:
11151115 unreachable
11161116}
11171117
; Since functions that contain amdgcn.init.whole.wave do not preserve the inactive
; lanes of any VGPRs, the middle end will explicitly preserve them if needed by adding
; dummy VGPR arguments. Since only the inactive lanes are important, we need to make
; it clear to the backend that it's safe to allocate v9's active lanes inside the
; shader. This is achieved by using the llvm.amdgcn.dead intrinsic.
define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, i32 %active.vgpr, i32 %inactive.vgpr) {
; GISEL12-LABEL: with_inactive_vgprs:
; GISEL12:       ; %bb.0: ; %entry
; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GISEL12-NEXT:    s_wait_expcnt 0x0
; GISEL12-NEXT:    s_wait_samplecnt 0x0
; GISEL12-NEXT:    s_wait_bvhcnt 0x0
; GISEL12-NEXT:    s_wait_kmcnt 0x0
; GISEL12-NEXT:    s_or_saveexec_b32 s6, -1
; GISEL12-NEXT:    s_mov_b32 s4, s0
; GISEL12-NEXT:    s_mov_b32 s5, s1
; GISEL12-NEXT:    s_mov_b32 s0, s3
; GISEL12-NEXT:    s_wait_alu 0xfffe
; GISEL12-NEXT:    s_and_saveexec_b32 s1, s6
; GISEL12-NEXT:    s_cbranch_execz .LBB6_2
; GISEL12-NEXT:  ; %bb.1: ; %shader
; GISEL12-NEXT:    v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
; GISEL12-NEXT:    flat_load_b32 v11, v[9:10]
; GISEL12-NEXT:    ;;#ASMSTART
; GISEL12-NEXT:    ; use v0-7
; GISEL12-NEXT:    ;;#ASMEND
; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GISEL12-NEXT:    v_add_nc_u32_e32 v8, v8, v11
; GISEL12-NEXT:    flat_store_b32 v[9:10], v11
; GISEL12-NEXT:    ; implicit-def: $vgpr9
; GISEL12-NEXT:  .LBB6_2: ; %tail.block
; GISEL12-NEXT:    s_wait_alu 0xfffe
; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GISEL12-NEXT:    s_mov_b32 exec_lo, s2
; GISEL12-NEXT:    s_setpc_b64 s[4:5]
;
; DAGISEL12-LABEL: with_inactive_vgprs:
; DAGISEL12:       ; %bb.0: ; %entry
; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; DAGISEL12-NEXT:    s_wait_expcnt 0x0
; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
; DAGISEL12-NEXT:    s_or_saveexec_b32 s6, -1
; DAGISEL12-NEXT:    s_mov_b32 s5, s1
; DAGISEL12-NEXT:    s_mov_b32 s4, s0
; DAGISEL12-NEXT:    s_wait_alu 0xfffe
; DAGISEL12-NEXT:    s_and_saveexec_b32 s0, s6
; DAGISEL12-NEXT:    s_cbranch_execz .LBB6_2
; DAGISEL12-NEXT:  ; %bb.1: ; %shader
; DAGISEL12-NEXT:    v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
; DAGISEL12-NEXT:    flat_load_b32 v11, v[9:10]
; DAGISEL12-NEXT:    ;;#ASMSTART
; DAGISEL12-NEXT:    ; use v0-7
; DAGISEL12-NEXT:    ;;#ASMEND
; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; DAGISEL12-NEXT:    v_add_nc_u32_e32 v8, v8, v11
; DAGISEL12-NEXT:    flat_store_b32 v[9:10], v11
; DAGISEL12-NEXT:    ; implicit-def: $vgpr9
; DAGISEL12-NEXT:  .LBB6_2: ; %tail.block
; DAGISEL12-NEXT:    s_wait_alu 0xfffe
; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; DAGISEL12-NEXT:    s_mov_b32 s0, s3
; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s2
; DAGISEL12-NEXT:    s_wait_alu 0xfffe
; DAGISEL12-NEXT:    s_setpc_b64 s[4:5]
;
; GISEL10-LABEL: with_inactive_vgprs:
; GISEL10:       ; %bb.0: ; %entry
; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL10-NEXT:    s_or_saveexec_b32 s6, -1
; GISEL10-NEXT:    s_mov_b32 s4, s0
; GISEL10-NEXT:    s_mov_b32 s5, s1
; GISEL10-NEXT:    s_mov_b32 s0, s3
; GISEL10-NEXT:    s_and_saveexec_b32 s1, s6
; GISEL10-NEXT:    s_cbranch_execz .LBB6_2
; GISEL10-NEXT:  ; %bb.1: ; %shader
; GISEL10-NEXT:    v_mov_b32_e32 v10, s5
; GISEL10-NEXT:    v_mov_b32_e32 v9, s4
; GISEL10-NEXT:    flat_load_dword v11, v[9:10]
; GISEL10-NEXT:    ;;#ASMSTART
; GISEL10-NEXT:    ; use v0-7
; GISEL10-NEXT:    ;;#ASMEND
; GISEL10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL10-NEXT:    v_add_nc_u32_e32 v8, v8, v11
; GISEL10-NEXT:    flat_store_dword v[9:10], v11
; GISEL10-NEXT:    ; implicit-def: $vgpr9
; GISEL10-NEXT:  .LBB6_2: ; %tail.block
; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GISEL10-NEXT:    s_mov_b32 exec_lo, s2
; GISEL10-NEXT:    s_setpc_b64 s[4:5]
;
; DAGISEL10-LABEL: with_inactive_vgprs:
; DAGISEL10:       ; %bb.0: ; %entry
; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT:    s_or_saveexec_b32 s6, -1
; DAGISEL10-NEXT:    s_mov_b32 s5, s1
; DAGISEL10-NEXT:    s_mov_b32 s4, s0
; DAGISEL10-NEXT:    s_and_saveexec_b32 s0, s6
; DAGISEL10-NEXT:    s_cbranch_execz .LBB6_2
; DAGISEL10-NEXT:  ; %bb.1: ; %shader
; DAGISEL10-NEXT:    v_mov_b32_e32 v10, s5
; DAGISEL10-NEXT:    v_mov_b32_e32 v9, s4
; DAGISEL10-NEXT:    flat_load_dword v11, v[9:10]
; DAGISEL10-NEXT:    ;;#ASMSTART
; DAGISEL10-NEXT:    ; use v0-7
; DAGISEL10-NEXT:    ;;#ASMEND
; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT:    v_add_nc_u32_e32 v8, v8, v11
; DAGISEL10-NEXT:    flat_store_dword v[9:10], v11
; DAGISEL10-NEXT:    ; implicit-def: $vgpr9
; DAGISEL10-NEXT:  .LBB6_2: ; %tail.block
; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; DAGISEL10-NEXT:    s_mov_b32 s0, s3
; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s2
; DAGISEL10-NEXT:    s_setpc_b64 s[4:5]
entry:
  ; %imp.def stands in for the inactive lanes of the dummy VGPR argument; marking
  ; it dead lets the backend reuse v9's active lanes inside the shader block.
  %imp.def = call i32 @llvm.amdgcn.dead()
  %initial.exec = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %initial.exec, label %shader, label %tail.block

shader:                                           ; preds = %entry
  %use.another.vgpr = load i32, ptr %callee ; smth that won't be moved past the inline asm
  call void asm sideeffect "; use v0-7", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
  store i32 %use.another.vgpr, ptr %callee
  %active.vgpr.new = add i32 %active.vgpr, %use.another.vgpr
  br label %tail.block

tail.block:                                       ; preds = %entry, %shader
  %active.vgpr.arg = phi i32 [ %active.vgpr, %entry ], [ %active.vgpr.new, %shader ]
  %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ %imp.def, %shader ]
  %vgprs.0 = insertvalue { i32, i32 } poison, i32 %active.vgpr.arg, 0
  %vgprs = insertvalue { i32, i32 } %vgprs.0, i32 %inactive.vgpr.arg, 1
  call void (ptr, i32, i32, { i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32s(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, { i32, i32 } %vgprs, i32 0)
  unreachable
}
1254+
declare amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32>)
0 commit comments