[AMDGPU] Elide bitcast fold i64 imm to build_vector #154115

JanekvO · 2025-08-18T13:25:49Z

Elide the bitcast combine to build_vector when the i64 immediate can be materialized with a single 64-bit mov.

llvmbot · 2025-08-18T13:26:24Z

@llvm/pr-subscribers-backend-amdgpu

Author: Janek van Oirschot (JanekvO)

Changes

Elide bitcast combine to build_vector in case of i64 immediate that can be materialized through 64b mov

Full diff: https://github.com/llvm/llvm-project/pull/154115.diff

3 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+11)
(modified) llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll (+12-7)
(modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+2-4)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 64e68ab7d753c..f3c5b0d41ca39 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5310,6 +5310,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
       break;
 
+    auto canMov64b = [&](uint64_t Val) -> bool {
+      if (!Subtarget->isGCN())
+        return false;
+      auto &ST = DAG.getSubtarget<GCNSubtarget>();
+      return ST.hasMovB64() && (ST.has64BitLiterals() || isUInt<32>(Val));
+    };
+
     // Fold bitcasts of constants.
     //
     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
@@ -5318,6 +5325,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
       SDLoc SL(N);
       uint64_t CVal = C->getZExtValue();
+      if (canMov64b(CVal))
+        break;
       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
@@ -5328,6 +5337,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       const APInt &Val = C->getValueAPF().bitcastToAPInt();
       SDLoc SL(N);
       uint64_t CVal = Val.getZExtValue();
+      if (canMov64b(CVal))
+        break;
       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
index 89fe0ab526a8a..6c421d50195e6 100644
--- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
@@ -16,10 +16,12 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 8
 ; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; CHECK-NEXT:    s_xor_b64 s[20:21], s[2:3], -1
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
-; CHECK-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x9037ab78
+; CHECK-NEXT:    v_accvgpr_write_b32 a3, v1
+; CHECK-NEXT:    s_xor_b64 s[20:21], s[2:3], -1
+; CHECK-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xbe927e4f
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0x19f4ec90
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 0x3efa01a0
@@ -40,7 +42,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    v_mov_b32_e32 v18, 0x55555523
 ; CHECK-NEXT:    v_mov_b32_e32 v19, 0xbfd55555
 ; CHECK-NEXT:    s_and_b64 s[6:7], exec, s[18:19]
-; CHECK-NEXT:    v_mov_b32_e32 v20, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b64_e32 v[20:21], 0
 ; CHECK-NEXT:    ; implicit-def: $agpr0_agpr1
 ; CHECK-NEXT:    ; implicit-def: $vgpr22_vgpr23
 ; CHECK-NEXT:    s_branch .LBB0_2
@@ -61,9 +64,11 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    v_mov_b64_e32 v[24:25], s[14:15]
 ; CHECK-NEXT:    flat_load_dwordx2 v[24:25], v[24:25]
-; CHECK-NEXT:    v_mov_b64_e32 v[26:27], v[0:1]
+; CHECK-NEXT:    v_accvgpr_read_b32 v27, a3
+; CHECK-NEXT:    v_accvgpr_read_b32 v26, a2
 ; CHECK-NEXT:    v_mov_b64_e32 v[28:29], v[2:3]
 ; CHECK-NEXT:    v_mov_b64_e32 v[30:31], v[16:17]
+; CHECK-NEXT:    v_mov_b64_e32 v[20:21], 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_fmac_f64_e32 v[26:27], 0, v[24:25]
 ; CHECK-NEXT:    v_fmac_f64_e32 v[28:29], 0, v[26:27]
@@ -134,10 +139,11 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    v_mov_b32_e32 v27, v26
 ; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[16:17]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v22, v22, 0, s[16:17]
-; CHECK-NEXT:    global_store_dwordx2 v20, v[26:27], s[12:13]
+; CHECK-NEXT:    global_store_dwordx2 v0, v[26:27], s[12:13]
 ; CHECK-NEXT:    s_cselect_b32 s23, s23, 0
 ; CHECK-NEXT:    s_cselect_b32 s22, s22, 0
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
+; CHECK-NEXT:    v_mov_b64_e32 v[20:21], 0
 ; CHECK-NEXT:    s_branch .LBB0_14
 ; CHECK-NEXT:  .LBB0_13: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v24
@@ -153,9 +159,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_1
 ; CHECK-NEXT:  ; %bb.16: ; %._crit_edge2105.i.i.i2330
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v21, v20
 ; CHECK-NEXT:    s_mov_b64 s[24:25], 0
-; CHECK-NEXT:    global_store_dwordx2 v20, v[20:21], s[12:13]
+; CHECK-NEXT:    global_store_dwordx2 v0, v[20:21], s[12:13]
 ; CHECK-NEXT:    s_branch .LBB0_1
 ; CHECK-NEXT:  .LBB0_17: ; %DummyReturnBlock
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index fc8883924dfbc..4eaa1965c66f1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4152,8 +4152,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX942-LABEL: store_load_i64_aligned:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 15
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 15
 ; GFX942-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
@@ -4263,8 +4262,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX942-LABEL: store_load_i64_unaligned:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 15
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 15
 ; GFX942-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1

llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll

arsenm · 2025-08-18T15:08:49Z

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Can you track down the existing tests for this combine, and add a gfx942 run line? I'm sure it's missing. Most of this code probably hasn't been revisited

The original patch didn't add any tests and only affected about 4 tests. Locally, I checked which tests are affected by removing this combine altogether and opened #154363, which enables gfx942 for those tests. (I left out a number of tests that seemed to be bug/crash tests rather than actual codegen tests.)

llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Enable gfx942 for tests that are affected by an AMDGPU bitcast constant combine (#154115). Expecting to see more tests affected in the aforementioned PR after it is rebased on top of this PR.

llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll

…an be materialized

JanekvO · 2025-08-26T12:38:38Z

Rebase

JanekvO · 2025-08-28T12:41:25Z

ping

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

arsenm · 2025-09-04T13:39:28Z

llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll

+; CHECK-NEXT:    v_accvgpr_write_b32 a0, v24
+; CHECK-NEXT:    v_mov_b64_e32 v[16:17], 0
+; CHECK-NEXT:    v_accvgpr_write_b32 a1, v25


This looks worse, we now end up with more movs inside a loop

Good point, didn't think about the loop in this test. Looking into this.

This doesn't seem to be caused by this patch directly; rather, it looks like a knock-on effect of a more constrained register in a kernel that requires AGPR spilling.
After isel:

%149:sreg_32 = S_MOV_B32 0
%150:sreg_64 = REG_SEQUENCE %149:sreg_32, %subreg.sub0, %149:sreg_32, %subreg.sub1
%152:av_64_align2 = COPY %150:sreg_64

Becomes:

%150:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec

which is expected for the patch but does limit the possible agpr-spillable instructions.

The conversion seen here is the same as what previously occurred in bb4, which is at the same loop depth. Should it always emit the av_mov_b64_* pseudo instead of the v_mov_b64_* pseudo?

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

arsenm

lgtm with nit

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

This reverts commit 341cdbc.

Reverts #154115 Co-authored-by: ronlieb <[email protected]>

…r" (#160325) Reverts llvm/llvm-project#154115 Co-authored-by: ronlieb <[email protected]>

JanekvO requested review from arsenm and ritter-x2a August 18, 2025 13:25

llvmbot added the backend:AMDGPU label Aug 18, 2025

JanekvO mentioned this pull request Aug 18, 2025

[AMDGPU] Legalize 64bit elements for BUILD_VECTOR on gfx942 #145052

Open

JanekvO commented Aug 18, 2025

View reviewed changes

llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll Outdated Show resolved Hide resolved

arsenm reviewed Aug 18, 2025

View reviewed changes

JanekvO mentioned this pull request Aug 19, 2025

[AMDGPU][NFC] Enable gfx942 for more tests #154363

Merged

arsenm reviewed Aug 20, 2025

View reviewed changes

llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll Outdated Show resolved Hide resolved

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Outdated Show resolved Hide resolved

JanekvO force-pushed the v_mov_64_bitcast_combine branch from e17e3e3 to bcc2b2d Compare August 21, 2025 14:38

JanekvO commented Aug 21, 2025

View reviewed changes

llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll Outdated Show resolved Hide resolved

JanekvO force-pushed the v_mov_64_bitcast_combine branch from 8df7137 to 60147e8 Compare August 22, 2025 14:11

JanekvO requested a review from arsenm August 26, 2025 09:05

[AMDGPU] Elide bitcast combine to build_vector in case i64 constant c…

2f4f1da

…an be materialized

JanekvO force-pushed the v_mov_64_bitcast_combine branch from 60147e8 to 2f4f1da Compare August 26, 2025 12:38

arsenm reviewed Sep 4, 2025

View reviewed changes

address feedback

07c7d9b

JanekvO requested a review from arsenm September 15, 2025 10:31

arsenm reviewed Sep 15, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Outdated Show resolved Hide resolved

arsenm reviewed Sep 15, 2025

View reviewed changes

Place conditionals for lazy evaluation

aae4aea

JanekvO requested a review from arsenm September 16, 2025 10:58

arsenm reviewed Sep 16, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Outdated Show resolved Hide resolved

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Outdated Show resolved Hide resolved

Apply suggestions

0ceb3ce

arsenm approved these changes Sep 16, 2025

View reviewed changes

Remove unused variable

4d208c6

JanekvO merged commit 341cdbc into llvm:main Sep 16, 2025
9 checks passed

JanekvO added a commit that referenced this pull request Sep 23, 2025

Revert "[AMDGPU] Elide bitcast fold i64 imm to build_vector (#154115)"

ea66de8

This reverts commit 341cdbc.

JanekvO mentioned this pull request Sep 23, 2025

Revert "[AMDGPU] Elide bitcast fold i64 imm to build_vector" #160325

Merged

JanekvO added a commit that referenced this pull request Sep 23, 2025

Revert "[AMDGPU] Elide bitcast fold i64 imm to build_vector" (#160325)

a584bd9

Reverts #154115 Co-authored-by: ronlieb <[email protected]>

llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Sep 23, 2025

Automerge: Revert "[AMDGPU] Elide bitcast fold i64 imm to build_vecto…

2921824

…r" (#160325) Reverts llvm/llvm-project#154115 Co-authored-by: ronlieb <[email protected]>

JanekvO mentioned this pull request Sep 24, 2025

[AMDGPU] Emit b32 movs if (a)v_mov_b64_pseudo dest vgprs are misaligned #160547

Open

JanekvO mentioned this pull request Oct 10, 2025

[AMDGPU] siloadstoreopt generate REG_SEQUENCE with aligned operands #162088

Open

[AMDGPU] Elide bitcast fold i64 imm to build_vector #154115

[AMDGPU] Elide bitcast fold i64 imm to build_vector #154115

Uh oh!

Conversation

JanekvO commented Aug 18, 2025

Uh oh!

llvmbot commented Aug 18, 2025

Uh oh!

Uh oh!

arsenm Aug 18, 2025

Choose a reason for hiding this comment

Uh oh!

JanekvO Aug 19, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

JanekvO commented Aug 26, 2025

Uh oh!

JanekvO commented Aug 28, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

arsenm Sep 4, 2025

Choose a reason for hiding this comment

Uh oh!

JanekvO Sep 9, 2025

Choose a reason for hiding this comment

Uh oh!

JanekvO Sep 12, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants