[AMDGPU] Elide bitcast fold i64 imm to build_vector #154115
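The diff below adds a helper, `isInt64ImmLegal`, to `AMDGPUISelLowering.cpp` and uses it to skip the existing DAG combine that bitcasts a 64-bit constant into a `v2i32 BUILD_VECTOR` of its two 32-bit halves, so that subtargets with a 64-bit move (`hasMovB64`) can keep the immediate whole. As a rough standalone illustration (not part of the patch; the helpers `lo32`/`hi32` are just local stand-ins for `llvm::Lo_32`/`llvm::Hi_32`), this is the decomposition that gets elided:

```cpp
// Standalone illustration (not LLVM code) of the Lo_32/Hi_32 split that the
// bitcast combine performs on a 64-bit immediate.
#include <cstdint>
#include <cstdio>

static uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }       // stands in for llvm::Lo_32
static uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); } // stands in for llvm::Hi_32

int main() {
  // Bit pattern of one of the split constants visible in the test below; it is
  // neither an inline constant nor a 32-bit value, so it is still decomposed
  // into two v_mov_b32 even with the patch applied.
  uint64_t C = 0x3FB3B13657B87036ULL;
  std::printf("lo = 0x%08X, hi = 0x%08X\n", // prints lo = 0x57B87036, hi = 0x3FB3B136
              static_cast<unsigned>(lo32(C)), static_cast<unsigned>(hi32(C)));
  return 0;
}
```

With the patch, constants that pass the new check are no longer decomposed, which is why a single `v_mov_b64_e32 v[16:17], 0` now shows up in the updated test output further down.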
@@ -5296,6 +5296,36 @@ SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
  return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
}

bool AMDGPUTargetLowering::isInt64ImmLegal(SDNode *N, SelectionDAG &DAG) const {
  if (!Subtarget->isGCN())
    return false;

  ConstantSDNode *SDConstant = dyn_cast<ConstantSDNode>(N);
  ConstantFPSDNode *SDFPConstant = dyn_cast<ConstantFPSDNode>(N);
  auto &ST = DAG.getSubtarget<GCNSubtarget>();
  bool isInlineable = false;
  const auto *TII = ST.getInstrInfo();

  if (!ST.hasMovB64() || (!SDConstant && !SDFPConstant))
    return false;

  if (ST.has64BitLiterals())
    return true;

  uint64_t Val = 0;
  if (SDConstant) {
    const APInt &APVal = SDConstant->getAPIntValue();
    isInlineable = TII->isInlineConstant(APVal);
    Val = APVal.getZExtValue();
  } else if (SDFPConstant) {
    const APFloat &APVal = SDFPConstant->getValueAPF();
    isInlineable = TII->isInlineConstant(APVal);
    Val = APVal.bitcastToAPInt().getZExtValue();
  }

  return (isInlineable || isUInt<32>(Val));
}

SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
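In plain terms, the predicate above only returns true when the subtarget has a 64-bit move at all, and then either any 64-bit literal is encodable or the value needs no 64-bit literal because it is an inline constant or fits in 32 bits. A minimal sketch of that decision tree, with the `GCNSubtarget` queries and `TII->isInlineConstant` replaced by hypothetical stand-ins (`Caps`, `isInlineInt`; the -16..64 range is a simplification that ignores the FP inline immediates):

```cpp
#include <cstdint>

// Hypothetical stand-ins for the subtarget queries used by the patch.
struct Caps {
  bool HasMovB64;        // subtarget has a 64-bit move
  bool Has64BitLiterals; // subtarget can encode arbitrary 64-bit literals
};

// Simplified stand-in for TII->isInlineConstant(); the real check also
// accepts a small set of floating-point immediates.
static bool isInlineInt(int64_t V) { return V >= -16 && V <= 64; }

// Mirrors the decision tree of isInt64ImmLegal() for an integer immediate.
static bool int64ImmLegal(const Caps &ST, uint64_t Val) {
  if (!ST.HasMovB64)
    return false;           // no 64-bit move: always fall back to the split
  if (ST.Has64BitLiterals)
    return true;            // any 64-bit literal is directly encodable
  bool Inlineable = isInlineInt(static_cast<int64_t>(Val));
  return Inlineable || Val <= UINT32_MAX; // isUInt<32>(Val) in the patch
}
```

On a target with `hasMovB64` but without 64-bit literals this keeps 0 and other inline constants whole, while a pattern such as `0x3FB3B13657B87036` still takes the `BUILD_VECTOR` path below.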
@@ -5345,6 +5375,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
    SDValue Src = N->getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
      SDLoc SL(N);
      if (isInt64ImmLegal(C, DAG))
        break;

      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                               DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
@@ -5355,6 +5387,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      if (isInt64ImmLegal(C, DAG))
        break;
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
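The floating-point path is the same check after reinterpreting the `APFloat`'s bits as a 64-bit integer via `bitcastToAPInt()`. Outside of LLVM that is just a bit-cast; a small sketch (C++20 for `std::bit_cast`):

```cpp
#include <bit>      // std::bit_cast (C++20)
#include <cstdint>
#include <cstdio>

int main() {
  // Reinterpret a double's bit pattern as u64, as bitcastToAPInt() does,
  // then split it the same way as the integer path.
  double D = 1.0;
  uint64_t Bits = std::bit_cast<uint64_t>(D);      // 0x3FF0000000000000
  uint32_t Lo = static_cast<uint32_t>(Bits);       // 0x00000000
  uint32_t Hi = static_cast<uint32_t>(Bits >> 32); // 0x3FF00000
  std::printf("bits=0x%016llX lo=0x%08X hi=0x%08X\n",
              static_cast<unsigned long long>(Bits),
              static_cast<unsigned>(Lo), static_cast<unsigned>(Hi));
  return 0;
}
```

Values such as 1.0, which the backend treats as an inline f64 constant, already satisfy `TII->isInlineConstant`, so with a 64-bit move available they are no longer decomposed either.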
@@ -7,21 +7,18 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0
; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6
; CHECK-NEXT: v_mov_b32_e32 v20, 0
; CHECK-NEXT: v_mov_b32_e32 v30, 0x9037ab78
; CHECK-NEXT: v_mov_b32_e32 v31, 0x3e21eeb6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0
; CHECK-NEXT: s_xor_b64 s[18:19], s[16:17], -1
; CHECK-NEXT: s_bitcmp1_b32 s0, 8
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78
; CHECK-NEXT: v_accvgpr_write_b32 a3, v1
; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3]
; CHECK-NEXT: v_accvgpr_write_b32 a2, v0
; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6
; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f
; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90

@@ -37,14 +34,15 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883
; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4
; CHECK-NEXT: s_mov_b64 s[22:23], 0
; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136
; CHECK-NEXT: v_mov_b32_e32 v20, 0x57b87036
; CHECK-NEXT: v_mov_b32_e32 v21, 0x3fb3b136
; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523
; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555
; CHECK-NEXT: s_and_b64 s[6:7], exec, s[18:19]
; CHECK-NEXT: v_mov_b32_e32 v21, v20
; CHECK-NEXT: ; implicit-def: $vgpr30_vgpr31
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b64_e32 v[16:17], 0
; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
; CHECK-NEXT: ; implicit-def: $vgpr22_vgpr23
; CHECK-NEXT: s_branch .LBB0_2
; CHECK-NEXT: .LBB0_1: ; %Flow9

@@ -64,12 +62,11 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25]
; CHECK-NEXT: v_accvgpr_read_b32 v27, a3
; CHECK-NEXT: v_accvgpr_read_b32 v26, a2
; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[30:31]
; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3]
; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[0:1]
; CHECK-NEXT: v_accvgpr_write_b32 a0, 0
; CHECK-NEXT: v_accvgpr_write_b32 a1, 0
; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[20:21]
; CHECK-NEXT: v_accvgpr_write_b32 a2, 0
; CHECK-NEXT: v_accvgpr_write_b32 a3, 0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[24:25]
; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27]

@@ -96,30 +93,32 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329
; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_accvgpr_read_b32 v29, a1
; CHECK-NEXT: v_accvgpr_read_b32 v28, a0
; CHECK-NEXT: v_accvgpr_read_b32 v29, a3
; CHECK-NEXT: v_accvgpr_read_b32 v28, a2
; CHECK-NEXT: s_mov_b64 s[24:25], -1
; CHECK-NEXT: s_mov_b64 s[8:9], -1
; CHECK-NEXT: s_mov_b64 vcc, s[2:3]
; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
; CHECK-NEXT: ; implicit-def: $agpr2_agpr3
; CHECK-NEXT: s_cbranch_vccz .LBB0_5
; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
; CHECK-NEXT: v_accvgpr_write_b32 a0, v30
; CHECK-NEXT: v_accvgpr_write_b32 a1, v31
; CHECK-NEXT: v_accvgpr_mov_b32 a3, a1
; CHECK-NEXT: v_accvgpr_mov_b32 a2, a0
; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19]
; CHECK-NEXT: s_mov_b64 vcc, s[6:7]
; CHECK-NEXT: s_cbranch_vccz .LBB0_5
; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
; CHECK-NEXT: v_accvgpr_write_b32 a0, v26
; CHECK-NEXT: v_accvgpr_write_b32 a2, v26
; CHECK-NEXT: s_mov_b64 s[24:25], 0
; CHECK-NEXT: v_accvgpr_write_b32 a1, v27
; CHECK-NEXT: v_accvgpr_write_b32 a3, v27
; CHECK-NEXT: s_mov_b64 s[8:9], 0
; CHECK-NEXT: s_branch .LBB0_5
; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[10:11]
; CHECK-NEXT: v_accvgpr_write_b32 a0, v24
; CHECK-NEXT: s_mov_b64 s[22:23], 0
; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11]
; CHECK-NEXT: v_accvgpr_write_b32 a1, v25
; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21]
; CHECK-NEXT: s_branch .LBB0_15
; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1

@@ -136,19 +135,21 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17]
; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17]
; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v17, v16
; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17]
; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13]
; CHECK-NEXT: v_mov_b32_e32 v17, v16
; CHECK-NEXT: s_cselect_b32 s23, s23, 0
; CHECK-NEXT: s_cselect_b32 s22, s22, 0
; CHECK-NEXT: s_mov_b64 s[8:9], -1
; CHECK-NEXT: global_store_dwordx2 v0, v[16:17], s[12:13]
; CHECK-NEXT: s_branch .LBB0_14
; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: s_mov_b64 s[8:9], 0
; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0
; CHECK-NEXT: .LBB0_14: ; %Flow6
; CHECK-NEXT: .LBB0_14: ; %Flow8
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25]
; CHECK-NEXT: v_accvgpr_write_b32 a0, v24
; CHECK-NEXT: v_mov_b64_e32 v[16:17], 0
; CHECK-NEXT: v_accvgpr_write_b32 a1, v25
Comment on lines +150 to +152

Contributor: This looks worse; we now end up with more movs inside a loop.

Contributor (Author): Good point, I didn't think about the loop in this test. Looking into this.

Contributor (Author): This doesn't seem to be caused by this patch directly but is a knock-on effect of further-constrained registers in a kernel that requires agpr spilling. Becomes: … which is expected for the patch but does limit the possible agpr-spillable instructions. The conversion seen here is the same as what previously occurred in bb4, which is at the same loop depth. Should it always emit …
; CHECK-NEXT: .LBB0_15: ; %Flow6
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: s_mov_b64 s[24:25], -1

@@ -157,7 +158,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: s_mov_b64 s[24:25], 0
; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13]
; CHECK-NEXT: global_store_dwordx2 v0, v[16:17], s[12:13]
; CHECK-NEXT: s_branch .LBB0_1
; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock
; CHECK-NEXT: s_endpgm