Skip to content

Commit 1e01c02

Browse files
authored
[DAGCombiner] Remove NoSignedZerosFPMath uses in visitFADD (#160635)
Remove the uses of the global NoSignedZerosFPMath option in visitFADD and rely on node-level flags instead.
1 parent c4a134f commit 1e01c02

File tree

5 files changed

+318
-626
lines changed

5 files changed

+318
-626
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -17770,7 +17770,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
1777017770
// N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
1777117771
ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
1777217772
if (N1C && N1C->isZero())
17773-
if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())
17773+
if (N1C->isNegative() || Flags.hasNoSignedZeros())
1777417774
return N0;
1777517775

1777617776
if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -17823,11 +17823,10 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
1782317823
return DAG.getConstantFP(0.0, DL, VT);
1782417824
}
1782517825

17826-
// If 'unsafe math' or reassoc and nsz, fold lots of things.
17826+
// If reassoc and nsz, fold lots of things.
1782717827
// TODO: break out portions of the transformations below for which Unsafe is
1782817828
// considered and which do not require both nsz and reassoc
17829-
if ((Options.NoSignedZerosFPMath ||
17830-
(Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
17829+
if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros() &&
1783117830
AllowNewConst) {
1783217831
// fadd (fadd x, c1), c2 -> fadd x, c1 + c2
1783317832
if (N1CFP && N0.getOpcode() == ISD::FADD &&
@@ -17911,10 +17910,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
1791117910
DAG.getConstantFP(4.0, DL, VT));
1791217911
}
1791317912
}
17914-
} // enable-unsafe-fp-math && AllowNewConst
17913+
} // reassoc && nsz && AllowNewConst
1791517914

17916-
if ((Options.NoSignedZerosFPMath ||
17917-
(Flags.hasAllowReassociation() && Flags.hasNoSignedZeros()))) {
17915+
if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros()) {
1791817916
// Fold fadd(vecreduce(x), vecreduce(y)) -> vecreduce(fadd(x, y))
1791917917
if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FADD, ISD::FADD, DL,
1792017918
VT, N0, N1, Flags))

llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll

Lines changed: 97 additions & 236 deletions
Original file line number | Diff line number | Diff line change
@@ -581,145 +581,63 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c
581581
ret { half, half } %insert.1
582582
}
583583

584-
; This one asserted with -enable-no-signed-zeros-fp-math
585-
define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 {
586-
; SI-SAFE-LABEL: fneg_fadd_0_f16:
587-
; SI-SAFE: ; %bb.0: ; %.entry
588-
; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
589-
; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s1
590-
; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, s0
591-
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
592-
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
593-
; SI-SAFE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
594-
; SI-SAFE-NEXT: v_rcp_f32_e32 v3, v2
595-
; SI-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
596-
; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
597-
; SI-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
598-
; SI-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3
599-
; SI-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3
600-
; SI-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4
601-
; SI-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5
602-
; SI-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4
603-
; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
604-
; SI-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
605-
; SI-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
606-
; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0
607-
; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
608-
; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
609-
; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
610-
; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
611-
; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
612-
; SI-SAFE-NEXT: ; return to shader part epilog
613-
;
614-
; SI-NSZ-LABEL: fneg_fadd_0_f16:
615-
; SI-NSZ: ; %bb.0: ; %.entry
616-
; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
617-
; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1
618-
; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0
619-
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
620-
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
621-
; SI-NSZ-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
622-
; SI-NSZ-NEXT: v_rcp_f32_e32 v3, v2
623-
; SI-NSZ-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
624-
; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
625-
; SI-NSZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0
626-
; SI-NSZ-NEXT: v_fma_f32 v3, v5, v3, v3
627-
; SI-NSZ-NEXT: v_mul_f32_e32 v5, v4, v3
628-
; SI-NSZ-NEXT: v_fma_f32 v6, -v2, v5, v4
629-
; SI-NSZ-NEXT: v_fma_f32 v5, v6, v3, v5
630-
; SI-NSZ-NEXT: v_fma_f32 v2, -v2, v5, v4
631-
; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
632-
; SI-NSZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5
633-
; SI-NSZ-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
634-
; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
635-
; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
636-
; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
637-
; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
638-
; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
639-
; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
640-
; SI-NSZ-NEXT: ; return to shader part epilog
641-
;
642-
; VI-SAFE-LABEL: fneg_fadd_0_f16:
643-
; VI-SAFE: ; %bb.0: ; %.entry
644-
; VI-SAFE-NEXT: v_rcp_f16_e32 v0, s1
645-
; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
646-
; VI-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
647-
; VI-SAFE-NEXT: v_add_f16_e32 v0, 0, v0
648-
; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0
649-
; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0
650-
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
651-
; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00
652-
; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
653-
; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
654-
; VI-SAFE-NEXT: ; return to shader part epilog
655-
;
656-
; VI-NSZ-LABEL: fneg_fadd_0_f16:
657-
; VI-NSZ: ; %bb.0: ; %.entry
658-
; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1
659-
; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0
660-
; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
661-
; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0
662-
; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
663-
; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00
664-
; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
665-
; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
666-
; VI-NSZ-NEXT: ; return to shader part epilog
667-
;
668-
; GFX11-SAFE-LABEL: fneg_fadd_0_f16:
669-
; GFX11-SAFE: ; %bb.0: ; %.entry
670-
; GFX11-SAFE-NEXT: v_rcp_f16_e32 v0, s1
671-
; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff
672-
; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
673-
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
674-
; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0
675-
; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
676-
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0
677-
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
678-
; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
679-
; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
680-
; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
681-
; GFX11-SAFE-NEXT: ; return to shader part epilog
682-
;
683-
; GFX11-NSZ-LABEL: fneg_fadd_0_f16:
684-
; GFX11-NSZ: ; %bb.0: ; %.entry
685-
; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1
686-
; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff
687-
; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
688-
; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
689-
; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
690-
; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
691-
; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
692-
; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
693-
; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
694-
; GFX11-NSZ-NEXT: ; return to shader part epilog
695-
; GFX11-SAFE-TRUE16-LABEL: fneg_fadd_0_f16:
696-
; GFX11-SAFE-TRUE16: ; %bb.0: ; %.entry
697-
; GFX11-SAFE-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1
698-
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt_depctr 0xfff
699-
; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0, v0.l
700-
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
701-
; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l
702-
; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
703-
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0.l
704-
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
705-
; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
706-
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0/*Invalid register, operand has 'VS_16' register class*/, s0, vcc_lo
707-
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
708-
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
709-
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
710-
; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog
711-
; GFX11-NSZ-TRUE16-LABEL: fneg_fadd_0_f16:
712-
; GFX11-NSZ-TRUE16: ; %bb.0: ; %.entry
713-
; GFX11-NSZ-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1
714-
; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_depctr 0xfff
715-
; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x8000, v0.l
716-
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
717-
; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0.l, s0
718-
; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1
719-
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
720-
; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
721-
; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
722-
; GFX11-NSZ-TRUE16-NEXT: ; return to shader part epilog
584+
define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 {
585+
; SI-LABEL: fneg_fadd_0_safe_f16:
586+
; SI: ; %bb.0: ; %.entry
587+
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
588+
; SI-NEXT: v_cvt_f16_f32_e32 v0, s1
589+
; SI-NEXT: v_cvt_f16_f32_e32 v1, s0
590+
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
591+
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
592+
; SI-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
593+
; SI-NEXT: v_rcp_f32_e32 v3, v2
594+
; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
595+
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
596+
; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0
597+
; SI-NEXT: v_fma_f32 v3, v5, v3, v3
598+
; SI-NEXT: v_mul_f32_e32 v5, v4, v3
599+
; SI-NEXT: v_fma_f32 v6, -v2, v5, v4
600+
; SI-NEXT: v_fma_f32 v5, v6, v3, v5
601+
; SI-NEXT: v_fma_f32 v2, -v2, v5, v4
602+
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
603+
; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5
604+
; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
605+
; SI-NEXT: v_mad_f32 v0, v0, 0, 0
606+
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
607+
; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
608+
; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
609+
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
610+
; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
611+
; SI-NEXT: ; return to shader part epilog
612+
;
613+
; VI-LABEL: fneg_fadd_0_safe_f16:
614+
; VI: ; %bb.0: ; %.entry
615+
; VI-NEXT: v_rcp_f16_e32 v0, s1
616+
; VI-NEXT: v_mov_b32_e32 v1, s0
617+
; VI-NEXT: v_mul_f16_e32 v0, 0, v0
618+
; VI-NEXT: v_add_f16_e32 v0, 0, v0
619+
; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v0
620+
; VI-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0
621+
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
622+
; VI-NEXT: v_mov_b32_e32 v1, 0x7e00
623+
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
624+
; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
625+
; VI-NEXT: ; return to shader part epilog
626+
;
627+
; GFX11-LABEL: fneg_fadd_0_safe_f16:
628+
; GFX11: ; %bb.0: ; %.entry
629+
; GFX11-NEXT: v_rcp_f16_e32 v0, s1
630+
; GFX11-NEXT: s_waitcnt_depctr 0xfff
631+
; GFX11-NEXT: v_mul_f16_e32 v0, 0, v0
632+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
633+
; GFX11-NEXT: v_add_f16_e32 v0, 0, v0
634+
; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0
635+
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0
636+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
637+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
638+
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
639+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
640+
; GFX11-NEXT: ; return to shader part epilog
723641
.entry:
724642
%tmp7 = fdiv half 1.000000e+00, %tmp6
725643
%tmp8 = fmul half 0.000000e+00, %tmp7
@@ -733,108 +651,51 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
733651
ret half %.i198
734652
}
735653

736-
; This is a workaround because -enable-no-signed-zeros-fp-math does not set up
737-
; function attribute unsafe-fp-math automatically. Combine with the previous test
738-
; when that is done.
739654
define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 {
740-
; SI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
741-
; SI-SAFE: ; %bb.0: ; %.entry
742-
; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s0
743-
; SI-SAFE-NEXT: s_brev_b32 s0, 1
744-
; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
745-
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
746-
; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, 0, v0
747-
; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
748-
; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
749-
; SI-SAFE-NEXT: ; return to shader part epilog
750-
;
751-
; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
752-
; SI-NSZ: ; %bb.0: ; %.entry
753-
; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1
754-
; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0
755-
; SI-NSZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000
756-
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
757-
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
758-
; SI-NSZ-NEXT: v_rcp_f32_e32 v0, v0
759-
; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
760-
; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
761-
; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
762-
; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
763-
; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
764-
; SI-NSZ-NEXT: ; return to shader part epilog
765-
;
766-
; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
767-
; VI-SAFE: ; %bb.0: ; %.entry
768-
; VI-SAFE-NEXT: v_mov_b32_e32 v0, 0x8000
769-
; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
770-
; VI-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc, s0, 0
771-
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
772-
; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00
773-
; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
774-
; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
775-
; VI-SAFE-NEXT: ; return to shader part epilog
776-
;
777-
; VI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
778-
; VI-NSZ: ; %bb.0: ; %.entry
779-
; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1
780-
; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0
781-
; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
782-
; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0
783-
; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
784-
; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00
785-
; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
786-
; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
787-
; VI-NSZ-NEXT: ; return to shader part epilog
788-
;
789-
; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16:
790-
; GFX11-SAFE: ; %bb.0: ; %.entry
791-
; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, s0
792-
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc_lo, s0, 0
793-
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
794-
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo
795-
; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
796-
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3)
797-
; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
798-
; GFX11-SAFE-NEXT: ; return to shader part epilog
799-
;
800-
; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16:
801-
; GFX11-NSZ: ; %bb.0: ; %.entry
802-
; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1
803-
; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff
804-
; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
805-
; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
806-
; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
807-
; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
808-
; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
809-
; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
810-
; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
811-
; GFX11-NSZ-NEXT: ; return to shader part epilog
812-
; GFX11-SAFE-TRUE16-LABEL: fneg_fadd_0_nsz_f16:
813-
; GFX11-SAFE-TRUE16: ; %bb.0: ; %.entry
814-
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, s0, 0
815-
; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x8000
816-
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
817-
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1
818-
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
819-
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
820-
; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog
821-
; GFX11-NSZ-TRUE16-LABEL: fneg_fadd_0_nsz_f16:
822-
; GFX11-NSZ-TRUE16: ; %bb.0: ; %.entry
823-
; GFX11-NSZ-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1
824-
; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_depctr 0xfff
825-
; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x8000, v0.l
826-
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
827-
; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0.l, s0
828-
; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1
829-
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
830-
; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
831-
; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
832-
; GFX11-NSZ-TRUE16-NEXT: ; return to shader part epilog
655+
; SI-LABEL: fneg_fadd_0_nsz_f16:
656+
; SI: ; %bb.0: ; %.entry
657+
; SI-NEXT: v_cvt_f16_f32_e32 v0, s1
658+
; SI-NEXT: v_cvt_f16_f32_e32 v1, s0
659+
; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
660+
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
661+
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
662+
; SI-NEXT: v_rcp_f32_e32 v0, v0
663+
; SI-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
664+
; SI-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
665+
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
666+
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
667+
; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
668+
; SI-NEXT: ; return to shader part epilog
669+
;
670+
; VI-LABEL: fneg_fadd_0_nsz_f16:
671+
; VI: ; %bb.0: ; %.entry
672+
; VI-NEXT: v_rcp_f16_e32 v0, s1
673+
; VI-NEXT: v_mov_b32_e32 v1, s0
674+
; VI-NEXT: v_mul_f16_e32 v0, 0x8000, v0
675+
; VI-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0
676+
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
677+
; VI-NEXT: v_mov_b32_e32 v1, 0x7e00
678+
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
679+
; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
680+
; VI-NEXT: ; return to shader part epilog
681+
;
682+
; GFX11-LABEL: fneg_fadd_0_nsz_f16:
683+
; GFX11: ; %bb.0: ; %.entry
684+
; GFX11-NEXT: v_rcp_f16_e32 v0, s1
685+
; GFX11-NEXT: s_waitcnt_depctr 0xfff
686+
; GFX11-NEXT: v_mul_f16_e32 v0, 0x8000, v0
687+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
688+
; GFX11-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
689+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
690+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
691+
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
692+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
693+
; GFX11-NEXT: ; return to shader part epilog
833694
.entry:
834695
%tmp7 = fdiv afn half 1.000000e+00, %tmp6
835696
%tmp8 = fmul contract half 0.000000e+00, %tmp7
836697
%tmp9 = fmul reassoc nnan arcp contract half 0.000000e+00, %tmp8
837-
%.i188 = fadd nnan ninf contract half %tmp9, 0.000000e+00
698+
%.i188 = fadd nsz half %tmp9, 0.000000e+00
838699
%tmp10 = fcmp uge half %.i188, %tmp2
839700
%tmp11 = fneg half %.i188
840701
%.i092 = select i1 %tmp10, half %tmp2, half %tmp11

0 commit comments

Comments
 (0)