@@ -1115,4 +1115,137 @@ tail:
11151115 unreachable
11161116}
11171117
; Since functions that contain amdgcn.init.whole.wave do not preserve the inactive
; lanes of any VGPRs, the middle end will explicitly preserve them if needed by adding
; dummy VGPR arguments. Since only the inactive lanes are important, we need to make
; it clear to the backend that it's safe to allocate v9 inside shader.
; FIXME: Using poison is not clear enough.
define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, i32 %active.vgpr, i32 %inactive.vgpr) {
; GISEL12-LABEL: with_inactive_vgprs:
; GISEL12:       ; %bb.0: ; %entry
; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GISEL12-NEXT:    s_wait_expcnt 0x0
; GISEL12-NEXT:    s_wait_samplecnt 0x0
; GISEL12-NEXT:    s_wait_bvhcnt 0x0
; GISEL12-NEXT:    s_wait_kmcnt 0x0
; GISEL12-NEXT:    s_or_saveexec_b32 s6, -1
; GISEL12-NEXT:    s_mov_b32 s4, s0
; GISEL12-NEXT:    s_mov_b32 s5, s1
; GISEL12-NEXT:    s_mov_b32 s0, s3
; GISEL12-NEXT:    s_wait_alu 0xfffe
; GISEL12-NEXT:    s_and_saveexec_b32 s1, s6
; GISEL12-NEXT:    s_cbranch_execz .LBB6_2
; GISEL12-NEXT:  ; %bb.1: ; %shader
; GISEL12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GISEL12-NEXT:    flat_load_b32 v12, v[10:11]
; GISEL12-NEXT:    ;;#ASMSTART
; GISEL12-NEXT:    ; use v0-7
; GISEL12-NEXT:    ;;#ASMEND
; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GISEL12-NEXT:    v_add_nc_u32_e32 v8, v8, v12
; GISEL12-NEXT:    flat_store_b32 v[10:11], v12
; GISEL12-NEXT:  .LBB6_2: ; %tail.block
; GISEL12-NEXT:    s_wait_alu 0xfffe
; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GISEL12-NEXT:    s_mov_b32 exec_lo, s2
; GISEL12-NEXT:    s_wait_alu 0xfffe
; GISEL12-NEXT:    s_setpc_b64 s[4:5]
;
; DAGISEL12-LABEL: with_inactive_vgprs:
; DAGISEL12:       ; %bb.0: ; %entry
; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; DAGISEL12-NEXT:    s_wait_expcnt 0x0
; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
; DAGISEL12-NEXT:    s_or_saveexec_b32 s6, -1
; DAGISEL12-NEXT:    s_mov_b32 s5, s1
; DAGISEL12-NEXT:    s_mov_b32 s4, s0
; DAGISEL12-NEXT:    s_wait_alu 0xfffe
; DAGISEL12-NEXT:    s_and_saveexec_b32 s0, s6
; DAGISEL12-NEXT:    s_cbranch_execz .LBB6_2
; DAGISEL12-NEXT:  ; %bb.1: ; %shader
; DAGISEL12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; DAGISEL12-NEXT:    flat_load_b32 v12, v[10:11]
; DAGISEL12-NEXT:    ;;#ASMSTART
; DAGISEL12-NEXT:    ; use v0-7
; DAGISEL12-NEXT:    ;;#ASMEND
; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; DAGISEL12-NEXT:    v_add_nc_u32_e32 v8, v8, v12
; DAGISEL12-NEXT:    flat_store_b32 v[10:11], v12
; DAGISEL12-NEXT:  .LBB6_2: ; %tail.block
; DAGISEL12-NEXT:    s_wait_alu 0xfffe
; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; DAGISEL12-NEXT:    s_mov_b32 s0, s3
; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s2
; DAGISEL12-NEXT:    s_wait_alu 0xfffe
; DAGISEL12-NEXT:    s_setpc_b64 s[4:5]
;
; GISEL10-LABEL: with_inactive_vgprs:
; GISEL10:       ; %bb.0: ; %entry
; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL10-NEXT:    s_or_saveexec_b32 s6, -1
; GISEL10-NEXT:    s_mov_b32 s4, s0
; GISEL10-NEXT:    s_mov_b32 s5, s1
; GISEL10-NEXT:    s_mov_b32 s0, s3
; GISEL10-NEXT:    s_and_saveexec_b32 s1, s6
; GISEL10-NEXT:    s_cbranch_execz .LBB6_2
; GISEL10-NEXT:  ; %bb.1: ; %shader
; GISEL10-NEXT:    v_mov_b32_e32 v11, s5
; GISEL10-NEXT:    v_mov_b32_e32 v10, s4
; GISEL10-NEXT:    flat_load_dword v12, v[10:11]
; GISEL10-NEXT:    ;;#ASMSTART
; GISEL10-NEXT:    ; use v0-7
; GISEL10-NEXT:    ;;#ASMEND
; GISEL10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL10-NEXT:    v_add_nc_u32_e32 v8, v8, v12
; GISEL10-NEXT:    flat_store_dword v[10:11], v12
; GISEL10-NEXT:  .LBB6_2: ; %tail.block
; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GISEL10-NEXT:    s_mov_b32 exec_lo, s2
; GISEL10-NEXT:    s_setpc_b64 s[4:5]
;
; DAGISEL10-LABEL: with_inactive_vgprs:
; DAGISEL10:       ; %bb.0: ; %entry
; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT:    s_or_saveexec_b32 s6, -1
; DAGISEL10-NEXT:    s_mov_b32 s5, s1
; DAGISEL10-NEXT:    s_mov_b32 s4, s0
; DAGISEL10-NEXT:    s_and_saveexec_b32 s0, s6
; DAGISEL10-NEXT:    s_cbranch_execz .LBB6_2
; DAGISEL10-NEXT:  ; %bb.1: ; %shader
; DAGISEL10-NEXT:    v_mov_b32_e32 v11, s5
; DAGISEL10-NEXT:    v_mov_b32_e32 v10, s4
; DAGISEL10-NEXT:    flat_load_dword v12, v[10:11]
; DAGISEL10-NEXT:    ;;#ASMSTART
; DAGISEL10-NEXT:    ; use v0-7
; DAGISEL10-NEXT:    ;;#ASMEND
; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT:    v_add_nc_u32_e32 v8, v8, v12
; DAGISEL10-NEXT:    flat_store_dword v[10:11], v12
; DAGISEL10-NEXT:  .LBB6_2: ; %tail.block
; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; DAGISEL10-NEXT:    s_mov_b32 s0, s3
; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s2
; DAGISEL10-NEXT:    s_setpc_b64 s[4:5]
entry:
  %1 = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %1, label %shader, label %tail.block

shader:                                           ; preds = %entry
  %use.another.vgpr = load i32, ptr %callee ; smth that won't be moved past the inline asm
  call void asm sideeffect "; use v0-7", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
  store i32 %use.another.vgpr, ptr %callee
  %active.vgpr.new = add i32 %active.vgpr, %use.another.vgpr
  br label %tail.block

tail.block:                                       ; preds = %shader, %entry
  %active.vgpr.arg = phi i32 [ %active.vgpr, %entry ], [ %active.vgpr.new, %shader ]
  %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ poison, %shader ]
  ; Pack the two VGPR values into an aggregate for the chain call; the inactive
  ; lane value is poison on the %shader path (see FIXME above).
  %vgprs.0 = insertvalue { i32, i32 } poison, i32 %active.vgpr.arg, 0
  %vgprs = insertvalue { i32, i32 } %vgprs.0, i32 %inactive.vgpr.arg, 1
  call void (ptr, i32, i32, { i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, { i32, i32 } %vgprs, i32 0)
  unreachable
}
1250+
declare amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32>)
0 commit comments