Skip to content

Conversation

@broxigarchen
Copy link
Contributor

@broxigarchen broxigarchen commented Aug 22, 2025

Update isel pattern with 16bit types to use vgp16 in true16 mode. This stop isel from generating illegal vgpr32 = copy vpgr16

This includes fcopysign, scalar_to_vector and i1 trunc. Updated lit test and added a few mir tests.

Stacking up these changes in one patch as I realized that doing these seperately could lead to unexpected failures in between.

@broxigarchen broxigarchen force-pushed the main-fix-true16-copy-3 branch from ffd3634 to ef99f09 Compare August 25, 2025 20:23
@broxigarchen broxigarchen changed the title use vgpr_16 for scalar_vector pattern use vgpr_16 for scalar_vector and trunc i1 pattern Aug 25, 2025
@broxigarchen broxigarchen force-pushed the main-fix-true16-copy-3 branch from ef99f09 to bc46fe4 Compare September 8, 2025 05:04
@broxigarchen broxigarchen changed the title use vgpr_16 for scalar_vector and trunc i1 pattern [AMDGPU][True16][CodeGen] update isel pattern with vgpr16 for 16 bit types Sep 8, 2025
@broxigarchen broxigarchen force-pushed the main-fix-true16-copy-3 branch from bc46fe4 to dc87997 Compare September 8, 2025 17:58
@broxigarchen broxigarchen marked this pull request as ready for review September 8, 2025 18:02
@broxigarchen broxigarchen requested a review from arsenm September 8, 2025 18:02
@llvmbot
Copy link
Member

llvmbot commented Sep 8, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

Update isel pattern with 16bit types to use vgp16 in true16 mode. This stop isel from generating illegal vgpr32 = copy vpgr16

This includes fcopysign, scalar_to_vector and i1 trunc. Updated lit test and added a few mir tests.


Patch is 245.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154875.diff

18 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+73-6)
  • (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+188-188)
  • (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+3-2)
  • (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+394-239)
  • (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+426-260)
  • (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll (+4-3)
  • (added) llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll (+82)
  • (modified) llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll (+18-12)
  • (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+89-62)
  • (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+186-123)
  • (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+60-28)
  • (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+248-246)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.round.ll (+28-26)
  • (modified) llvm/test/CodeGen/AMDGPU/lround.ll (+3-7)
  • (added) llvm/test/CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll (+63)
  • (modified) llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll (+39-19)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b6e697d34c3d3..fba5d9de28306 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2201,6 +2201,8 @@ def : GCNPat <
 }
 
 foreach fp16vt = [f16, bf16] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (fcopysign fp16vt:$src0, fp16vt:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -2231,6 +2233,42 @@ def : GCNPat <
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
              (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
 >;
+}
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (fcopysign fp16vt:$src0, fp16vt:$src1),
+  (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+    (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+    (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
+>;
+
+def : GCNPat <
+  (fcopysign f32:$src0, fp16vt:$src1),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+             (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
+>;
+
+def : GCNPat <
+  (fcopysign f64:$src0, fp16vt:$src1),
+  (REG_SEQUENCE VReg_64,
+    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+               (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1)
+>;
+
+def : GCNPat <
+  (fcopysign fp16vt:$src0, f32:$src1),
+  (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)),
+             (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16)
+>;
+
+def : GCNPat <
+  (fcopysign fp16vt:$src0, f64:$src1),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+             (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+             (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
+>;
+}
 } // End foreach fp16vt = [f16, bf16]
 
 
@@ -3154,6 +3192,11 @@ def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
   (COPY VSrc_b16:$src)
 >;
+
+def : GCNPat <
+  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
+>;
 }
 
 let True16Predicate = UseRealTrue16Insts in {
@@ -3171,6 +3214,11 @@ def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
   (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
 >;
+
+def : GCNPat <
+  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+  (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0))
+>;
 }
 
 def : GCNPat <
@@ -3199,11 +3247,6 @@ def : GCNPat <
   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
-def : GCNPat <
-  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
-  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
->;
-
 def IMMBitSelConst : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
                                    MVT::i32);
@@ -3807,7 +3850,8 @@ def : GCNPat <
   (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
 >;
 
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (v2f16 (scalar_to_vector f16:$src0)),
   (COPY $src0)
@@ -3827,6 +3871,29 @@ def : GCNPat <
   (v4f16 (scalar_to_vector f16:$src0)),
   (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
 >;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (v2f16 (scalar_to_vector f16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+  (v2i16 (scalar_to_vector i16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+  (v4i16 (scalar_to_vector i16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat <
+  (v4f16 (scalar_to_vector f16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+}
 
 def : GCNPat <
   (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 49fe1eed9c514..44c719f3635c8 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -40442,11 +40442,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX11TRUE16-LABEL: v_vselect_v2bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT:    v_and_b16 v0.h, 1, v1.l
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 1, v0.h
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
@@ -42871,16 +42871,16 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
 ; GFX11TRUE16-LABEL: v_vselect_v4bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT:    v_and_b16 v0.h, 1, v2.l
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT:    v_and_b16 v1.l, 1, v1.l
+; GFX11TRUE16-NEXT:    v_and_b16 v1.h, 1, v3.l
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 1, v0.l
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v1
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v3
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 1, v1.h
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v5.l, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v4.l, s0
@@ -43195,28 +43195,28 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
 ; GFX11TRUE16-LABEL: v_vselect_v8bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT:    v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT:    v_and_b16 v1.h, 1, v5.l
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 1, v0.l
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v4
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v5
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v7
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v2
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v1
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v3
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v4
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v13.l, v9.l, s4
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 1, v6.l
+; GFX11TRUE16-NEXT:    v_and_b16 v0.h, 1, v4.l
+; GFX11TRUE16-NEXT:    v_and_b16 v1.l, 1, v2.l
+; GFX11TRUE16-NEXT:    v_and_b16 v2.l, 1, v7.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 1, v1.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 1, v0.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 1, v0.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 1, v1.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 1, v2.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.l, v8.l, s0
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v15.l, v11.l, s2
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v14.l, v10.l, s3
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.l, v8.l, s0
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v13.l, v9.l, s4
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v12.h, v8.h, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v13.h, v9.h, s1
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v14.h, v10.h, s5
@@ -43872,38 +43872,38 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s8, 1, v8
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s10, 1, v12
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s12, 1, v10
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s14, 1, v14
+; GFX11TRUE16-NEXT:    v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT:    v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT:    v_and_b16 v1.h, 1, v2.l
+; GFX11TRUE16-NEXT:    v_and_b16 v2.l, 1, v5.l
+; GFX11TRUE16-NEXT:    v_and_b16 v2.h, 1, v4.l
+; GFX11TRUE16-NEXT:    v_and_b16 v3.l, 1, v7.l
+; GFX11TRUE16-NEXT:    v_and_b16 v3.h, 1, v6.l
+; GFX11TRUE16-NEXT:    v_and_b16 v4.l, 1, v9.l
+; GFX11TRUE16-NEXT:    v_and_b16 v4.h, 1, v8.l
+; GFX11TRUE16-NEXT:    v_and_b16 v5.l, 1, v11.l
+; GFX11TRUE16-NEXT:    v_and_b16 v5.h, 1, v10.l
+; GFX11TRUE16-NEXT:    v_and_b16 v6.l, 1, v13.l
+; GFX11TRUE16-NEXT:    v_and_b16 v6.h, 1, v12.l
+; GFX11TRUE16-NEXT:    v_and_b16 v7.l, 1, v15.l
+; GFX11TRUE16-NEXT:    v_and_b16 v7.h, 1, v14.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 1, v0.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 1, v1.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 1, v2.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 1, v2.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 1, v3.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 1, v3.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 1, v4.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 1, v4.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 1, v5.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 1, v6.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 1, v6.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 1, v5.h
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 1, v7.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 1, v7.h
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.l, v30.l, v22.l, s10
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.h, v30.h, v22.h, s11
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.l, v29.l, v21.l, s12
@@ -45512,149 +45512,149 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:68
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:72
-; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:124
-; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:128
-; GFX11TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:64
-; GFX11TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:60
-; GFX11TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:120
-; GFX11TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:56
-; GFX11TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:116
-; GFX11TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:52
-; GFX11TRUE16-NEXT:    scratch_load_b32 v50, off, s32 offset:112
-; GFX11TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:48
-; GFX11TRUE16-NEXT:    scratch_load_b32 v52, off, s32 offset:108
-; GFX11TRUE16-NEXT:    scratch_load_b32 v53, off, s32 offset:44
-; GFX11TRUE16-NEXT:    scratch_load_b32 v54, off, s32 offset:104
-; GFX11TRUE16-NEXT:    scratch_load_b32 v55, off, s32 offset:40
-; GFX11TRUE16-NEXT:    scratch_load_b32 v64, off, s32 offset:100
-; GFX11TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:36
-; GFX11TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:96
-; GFX11TRUE16-NEXT:    scratch_load_b32 v67, off, s32 offset:32
-; GFX11TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:92
-; GFX11TRUE16-NEXT:    scratch_load_b32 v69, off, s32 offset:28
-; GFX11TRUE16-NEXT:    scratch_load_b32 v70, off, s32 offset:88
-; GFX11TRUE16-NEXT:    scratch_load_b32 v71, off, s32 offset:24
-; GFX11TRUE16-NEXT:    scratch_load_b32 v80, off, s32 offset:84
-; GFX11TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:20
-; GFX11TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:76
+; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:76
+; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:124
+; GFX11TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:128
+; GFX11TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:64
+; GFX11TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:60
+; GFX11TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:120
+; GFX11TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:56
+; GFX11TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:116
+; GFX11TRUE16-NEXT:    scratch_load_b32 v50, off, s32 offset:52
+; GFX11TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:112
+; GFX11TRUE16-NEXT:    scratch_load_b32 v52, off, s32 offset:48
+; GFX11TRUE16-NEXT:    scratch_load_b32 v53, off, s32 offset:108
+; GFX11TRUE16-NEXT:    scratch_load_b32 v54, off, s32 offset:44
+; GFX11TRUE16-NEXT:    scratch_load_b32 v55, off, s32 offset:104
+; GFX11TRUE16-NEXT:    scratch_load_b32 v64, off, s32 offset:40
+; GFX11TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:100
+; GFX11TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:36
+; GFX11TRUE16-NEXT:    scratch_load_b32 v67, off, s32 offset:96
+; GFX11TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:32
+; GFX11TRUE16-NEXT:    scratch_load_b32 v69, off, s32 offset:92
+; GFX11TRUE16-NEXT:    scratch_load_b32 v70, off, s32 offset:28
+; GFX11TRUE16-NEXT:    scratch_load_b32 v71, off, s32 offset:88
+; GFX11TRUE16-NEXT:    scratch_load_b32 v80, off, s32 offset:24
+; GFX11TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:84
+; GFX11TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:20
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v83, off, s32 offset:80
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v84, off, s32 offset:16
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v85, off, s32 offset:12
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:8
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v87, off, s32 offset:4
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v19, 1, v19
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v18, 1, v18
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v30, 1, v30
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s16, 1, v16
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s8, 1, v8
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s10, 1, v10
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s12, 1, v12
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s14, 1, v14
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s15, 1, v17
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s17, 1, v19
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s18, 1, v18
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s19, 1, v21
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s20, 1, v20
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s21, 1, v23
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s22, 1, v22
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s23, 1, v25
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s24, 1, v24
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s25, 1, v27
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s26, 1, v30
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s27, 1, v28
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s28, 1, v29
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s29, 1, v26
+; GFX11TRUE16-NEXT:    v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT:    v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT:    v_and_b16 v1.h, 1, v2.l
+; GFX11TRUE16-NEXT:    v_and_b16 v2.l, 1, v5.l
+; GFX11TRUE16-NEXT:    v_and_b16 v2.h, 1, v4.l
+; GFX11TRUE16-NEXT:    v_and_b16 v3.l, 1, v7.l
+; GFX11TRUE16-NEXT:    v_and_b16 v3.h...
[truncated]

@broxigarchen broxigarchen requested a review from dstutt September 8, 2025 18:07
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor code quality issue, but can we get the folding of s0 into the bfi back?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be code there to do this folding in the foldoperand pass. Let me check why this is not working

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nvm. I think we don't do this folding (we know that s0 and v1 is the same i16 in ISel, but in folding pass this seems require some additional context analysis).

I don't think we can do much in ISel. In ISel we don't know if the 16bit operand is a vpgr or sgpr, and we force to use vgpr16 which leads to this additional copy. We might be able to add this folding in folding pass, but not sure if it worth

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it seems like a job for SIFoldOperands.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will open another patch for this

@broxigarchen broxigarchen force-pushed the main-fix-true16-copy-3 branch from dc87997 to e1b2852 Compare September 9, 2025 04:52
@broxigarchen
Copy link
Contributor Author

CI error not related

@broxigarchen broxigarchen merged commit 1290077 into llvm:main Sep 9, 2025
7 of 9 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Sep 9, 2025

LLVM Buildbot has detected a new failure on builder ml-opt-dev-x86-64 running on ml-opt-dev-x86-64-b2 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/25151

Here is the relevant piece of the build log for the reference
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/fcopysign.gfx11plus.ll' FAILED ********************
Exit Code: 2

Command Output (stderr):
--
/b/ml-opt-dev-x86-64-b1/build/bin/llc < /b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | /b/ml-opt-dev-x86-64-b1/build/bin/FileCheck /b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll --check-prefixes=GFX11,GFX11-REAL16 # RUN: at line 2
+ /b/ml-opt-dev-x86-64-b1/build/bin/FileCheck /b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll --check-prefixes=GFX11,GFX11-REAL16
error: no check strings found with prefix 'GFX11:'
+ /b/ml-opt-dev-x86-64-b1/build/bin/llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel

--

********************


@llvm-ci
Copy link
Collaborator

llvm-ci commented Sep 9, 2025

LLVM Buildbot has detected a new failure on builder ml-opt-devrel-x86-64 running on ml-opt-devrel-x86-64-b1 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/24992

Here is the relevant piece of the build log for the reference
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/fcopysign.gfx11plus.ll' FAILED ********************
Exit Code: 2

Command Output (stderr):
--
/b/ml-opt-devrel-x86-64-b1/build/bin/llc < /b/ml-opt-devrel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | /b/ml-opt-devrel-x86-64-b1/build/bin/FileCheck /b/ml-opt-devrel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll --check-prefixes=GFX11,GFX11-REAL16 # RUN: at line 2
+ /b/ml-opt-devrel-x86-64-b1/build/bin/FileCheck /b/ml-opt-devrel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll --check-prefixes=GFX11,GFX11-REAL16
error: no check strings found with prefix 'GFX11:'
+ /b/ml-opt-devrel-x86-64-b1/build/bin/llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel

--

********************


@boomanaiden154
Copy link
Contributor

boomanaiden154 commented Sep 9, 2025

CI error not related

The CI shows these two tests being broken:

  LLVM :: CodeGen/AMDGPU/fcopysign.gfx11plus.ll
  LLVM :: CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll

Those tests were added in this PR. How are the failures not related?

This change also broke postcommit buildbots. Please either revert or fix forward.

@llvm-ci
Copy link
Collaborator

llvm-ci commented Sep 9, 2025

LLVM Buildbot has detected a new failure on builder ml-opt-rel-x86-64 running on ml-opt-rel-x86-64-b2 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/25001

Here is the relevant piece of the build log for the reference
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/fcopysign.gfx11plus.ll' FAILED ********************
Exit Code: 2

Command Output (stderr):
--
/b/ml-opt-rel-x86-64-b1/build/bin/llc < /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | /b/ml-opt-rel-x86-64-b1/build/bin/FileCheck /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll --check-prefixes=GFX11,GFX11-REAL16 # RUN: at line 2
+ /b/ml-opt-rel-x86-64-b1/build/bin/llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel
+ /b/ml-opt-rel-x86-64-b1/build/bin/FileCheck /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll --check-prefixes=GFX11,GFX11-REAL16
error: no check strings found with prefix 'GFX11:'

--

********************


@llvm-ci
Copy link
Collaborator

llvm-ci commented Sep 9, 2025

LLVM Buildbot has detected a new failure on builder lld-x86_64-ubuntu-fast running on as-builder-4 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/22671

Here is the relevant piece of the build log for the reference
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/fcopysign.gfx11plus.ll' FAILED ********************
Exit Code: 2

Command Output (stderr):
--
/home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/build/bin/llc < /home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | /home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/build/bin/FileCheck /home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll --check-prefixes=GFX11,GFX11-REAL16 # RUN: at line 2
+ /home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/build/bin/FileCheck /home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll --check-prefixes=GFX11,GFX11-REAL16
error: no check strings found with prefix 'GFX11:'
+ /home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/build/bin/llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel

--

********************


@broxigarchen
Copy link
Contributor Author

broxigarchen commented Sep 9, 2025

scalar_to_vector.gfx11plus.ll

Oh, that's weird. Not sure how I miss it. Fix is here #157684

broxigarchen added a commit that referenced this pull request Sep 9, 2025
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Sep 9, 2025
piotrAMD added a commit that referenced this pull request Sep 26, 2025
This is a temporary fix for a regression from #154875.

The new pattern sets the hi part of V_BFI result and that confuses
si-fix-sgpr-copies - where the proper fix is likely to be.

During si-fix-sgpr-copies, an incorrect fold happens:

 %86:vgpr_32 = V_BFI_B32_e64
 %87:sreg_32 = COPY %86.hi16:vgpr_32
%95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, killed %87:sreg_32,
0, %63:vgpr_16, 0, 0

into

 %86:vgpr_32 = V_BFI_B32_e64
%95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, %86.lo16:vgpr_32, 0,
%63:vgpr_16, 0, 0

Fixes: Vulkan CTS dEQP-VK.glsl.builtin.precision_fp16_storage32b.*.
mahesh-attarde pushed a commit to mahesh-attarde/llvm-project that referenced this pull request Oct 3, 2025
)

This is a temporary fix for a regression from llvm#154875.

The new pattern sets the hi part of V_BFI result and that confuses
si-fix-sgpr-copies - where the proper fix is likely to be.

During si-fix-sgpr-copies, an incorrect fold happens:

 %86:vgpr_32 = V_BFI_B32_e64
 %87:sreg_32 = COPY %86.hi16:vgpr_32
%95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, killed %87:sreg_32,
0, %63:vgpr_16, 0, 0

into

 %86:vgpr_32 = V_BFI_B32_e64
%95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, %86.lo16:vgpr_32, 0,
%63:vgpr_16, 0, 0

Fixes: Vulkan CTS dEQP-VK.glsl.builtin.precision_fp16_storage32b.*.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

6 participants