-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU][True16][CodeGen] update isel pattern with vgpr16 for 16 bit types #154875
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
ffd3634 to
ef99f09
Compare
ef99f09 to
bc46fe4
Compare
bc46fe4 to
dc87997
Compare
|
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) Changes: Update isel patterns with 16-bit types to use vgpr16 in true16 mode. This stops isel from generating illegal `vgpr32 = copy vgpr16`. This includes fcopysign, scalar_to_vector and i1 trunc. Updated lit tests and added a few mir tests. Patch is 245.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154875.diff 18 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b6e697d34c3d3..fba5d9de28306 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2201,6 +2201,8 @@ def : GCNPat <
}
foreach fp16vt = [f16, bf16] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -2231,6 +2233,42 @@ def : GCNPat <
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
+}
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (fcopysign fp16vt:$src0, fp16vt:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
+>;
+
+def : GCNPat <
+ (fcopysign f32:$src0, fp16vt:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
+>;
+
+def : GCNPat <
+ (fcopysign f64:$src0, fp16vt:$src1),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f32:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f64:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
+>;
+}
} // End foreach fp16vt = [f16, bf16]
@@ -3154,6 +3192,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
+>;
}
let True16Predicate = UseRealTrue16Insts in {
@@ -3171,6 +3214,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0))
+>;
}
def : GCNPat <
@@ -3199,11 +3247,6 @@ def : GCNPat <
(V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;
-def : GCNPat <
- (i1 (DivergentUnaryFrag<trunc> i16:$a)),
- (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
->;
-
def IMMBitSelConst : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
MVT::i32);
@@ -3807,7 +3850,8 @@ def : GCNPat <
(v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
@@ -3827,6 +3871,29 @@ def : GCNPat <
(v4f16 (scalar_to_vector f16:$src0)),
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (v2f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v2i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v4i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat <
+ (v4f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+}
def : GCNPat <
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 49fe1eed9c514..44c719f3635c8 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -40442,11 +40442,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GFX11TRUE16-LABEL: v_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v2.h, s0
@@ -42871,16 +42871,16 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v3.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v3
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v5.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v4.l, s0
@@ -43195,28 +43195,28 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX11TRUE16-LABEL: v_vselect_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v5.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v4
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s4
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v6.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v4.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v7.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 1, v1.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v0.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 1, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 1, v2.l
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, s0
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v15.l, v11.l, s2
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v14.l, v10.l, s3
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s4
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v12.h, v8.h, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v9.h, s1
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v14.h, v10.h, s5
@@ -43872,38 +43872,38 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v5.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.h, 1, v4.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.l, 1, v7.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.h, 1, v6.l
+; GFX11TRUE16-NEXT: v_and_b16 v4.l, 1, v9.l
+; GFX11TRUE16-NEXT: v_and_b16 v4.h, 1, v8.l
+; GFX11TRUE16-NEXT: v_and_b16 v5.l, 1, v11.l
+; GFX11TRUE16-NEXT: v_and_b16 v5.h, 1, v10.l
+; GFX11TRUE16-NEXT: v_and_b16 v6.l, 1, v13.l
+; GFX11TRUE16-NEXT: v_and_b16 v6.h, 1, v12.l
+; GFX11TRUE16-NEXT: v_and_b16 v7.l, 1, v15.l
+; GFX11TRUE16-NEXT: v_and_b16 v7.h, 1, v14.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v1.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 1, v2.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 1, v2.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 1, v3.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 1, v3.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 1, v4.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 1, v4.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 1, v5.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 1, v6.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 1, v6.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 1, v5.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 1, v7.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 1, v7.h
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v30.l, v22.l, s10
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v30.h, v22.h, s11
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v29.l, v21.l, s12
@@ -45512,149 +45512,149 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68
; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72
-; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
-; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:128
-; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:64
-; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:60
-; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:120
-; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:56
-; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:116
-; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:52
-; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:112
-; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:48
-; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108
-; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:44
-; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:104
-; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:40
-; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:100
-; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:36
-; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:96
-; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:32
-; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:92
-; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:28
-; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:88
-; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:24
-; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:84
-; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:20
-; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124
+; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:128
+; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:64
+; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60
+; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:120
+; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:56
+; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:116
+; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:52
+; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:112
+; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:48
+; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:108
+; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:44
+; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:104
+; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:40
+; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:100
+; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:36
+; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:96
+; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:32
+; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:92
+; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:28
+; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:88
+; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:24
+; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:84
+; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:20
; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80
; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:12
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:8
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17
-; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
-; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
-; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
-; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
-; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
-; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
-; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
-; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v28
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v26
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v5.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.h, 1, v4.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.l, 1, v7.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.h...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor code quality issue, but can we get the folding of s0 into the bfi back?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There should be code there to do this folding in the foldoperand pass. Let me check why this is not working
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nvm. I think we don't do this folding (we know that s0 and v1 are the same i16 in ISel, but in the folding pass this seems to require some additional context analysis).
I don't think we can do much in ISel. In ISel we don't know if the 16-bit operand is a vgpr or sgpr, and we are forced to use vgpr16, which leads to this additional copy. We might be able to add this folding in the folding pass, but I'm not sure if it is worth it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it seems like a job for SIFoldOperands.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will open another patch for this
dc87997 to
e1b2852
Compare
|
CI error not related |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/25151 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/24992 Here is the relevant piece of the build log for the reference |
The CI shows these two tests being broken: Those tests were added in this PR. How are the failures not related? This change also broke postcommit buildbots. Please either revert or fix forward. |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/25001 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/22671 Here is the relevant piece of the build log for the reference |
Oh, that's weird. Not sure how I missed it. Fix is here #157684 |
#154875 failed two lit tests
This is a temporary fix for a regression from #154875. The new pattern sets the hi part of V_BFI result and that confuses si-fix-sgpr-copies - where the proper fix is likely to be. During si-fix-sgpr-copies, an incorrect fold happens: %86:vgpr_32 = V_BFI_B32_e64 %87:sreg_32 = COPY %86.hi16:vgpr_32 %95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, killed %87:sreg_32, 0, %63:vgpr_16, 0, 0 into %86:vgpr_32 = V_BFI_B32_e64 %95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, %86.lo16:vgpr_32, 0, %63:vgpr_16, 0, 0 Fixes: Vulkan CTS dEQP-VK.glsl.builtin.precision_fp16_storage32b.*.
) This is a temporary fix for a regression from llvm#154875. The new pattern sets the hi part of V_BFI result and that confuses si-fix-sgpr-copies - where the proper fix is likely to be. During si-fix-sgpr-copies, an incorrect fold happens: %86:vgpr_32 = V_BFI_B32_e64 %87:sreg_32 = COPY %86.hi16:vgpr_32 %95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, killed %87:sreg_32, 0, %63:vgpr_16, 0, 0 into %86:vgpr_32 = V_BFI_B32_e64 %95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, %86.lo16:vgpr_32, 0, %63:vgpr_16, 0, 0 Fixes: Vulkan CTS dEQP-VK.glsl.builtin.precision_fp16_storage32b.*.
Update isel patterns with 16-bit types to use vgpr16 in true16 mode. This stops isel from generating illegal
vgpr32 = copy vgpr16. This includes fcopysign, scalar_to_vector and i1 trunc. Updated lit tests and added a few mir tests.
Stacking up these changes in one patch, as I realized that doing these separately could lead to unexpected failures in between.