
Commit 192d13e

[RISCV] Undo fneg (fmul x, y) -> fmul x, (fneg y) transform (#157388)
InstCombine will hoist an fneg through an fmul, but not through an fadd or fsub. This prevents us from matching fmsub and fnmadd in some cases. This patch adds a DAG combine that undoes the InstCombine transform, which helps some hot loops in 508.namd_r:

@@ -983,18 +983,15 @@
 fld ft2, 48(a5)
 fld ft3, 64(a5)
 fld ft4, 72(a5)
-fneg.d fa0, fa0
-fneg.d ft0, ft0
-fneg.d ft2, ft2
 fmul.d fa3, ft5, fa3
 fmul.d fa0, fa3, fa0
 fmul.d ft0, fa3, ft0
 fmul.d fa3, fa3, ft2
 fld ft2, 0(s1)
 fmul.d fa4, ft5, fa4
-fmadd.d fa2, fa4, fa2, fa0
-fmadd.d ft6, fa4, ft6, ft0
-fmadd.d fa4, fa4, ft1, fa3
+fmsub.d fa2, fa4, fa2, fa0
+fmsub.d ft6, fa4, ft6, ft0
+fmsub.d fa4, fa4, ft1, fa3

This gives a [1.77% improvement in both instruction count and runtime on 508.namd_r](https://lnt.lukelau.me/db_default/v4/nts/profile/1/1022/1021).

This also causes some more fnegs to be sunk after a bitcast to integer, so they are now done as an xor. From glancing at some of the scheduling models for WriteFSGN, my guess is that this is also profitable.
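For a concrete picture of the pattern being undone, here is the IR shape exercised by the new fmsub_d_fmul_fneg test added below. This is the form InstCombine's hoist leaves behind; with the new combine, the CHECKIFD codegen for it becomes fmul.d followed by fmsub.d rather than fneg.d, fmul.d, and fmadd.d:

define double @fmsub_d_fmul_fneg(double %a, double %b, double %c, double %d) nounwind {
  %negd = fneg double %d    ; InstCombine has already hoisted the fneg onto %d
  %fmul = fmul double %c, %negd    ; i.e. fmul x, (fneg y)
  %1 = call double @llvm.fma.f64(double %a, double %b, double %fmul)
  ret double %1    ; now selects fmul.d + fmsub.d
}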
1 parent 71f98a1 commit 192d13e

File tree

5 files changed (+412, -42 lines)

llvm/include/llvm/CodeGen/SDPatternMatch.h

Lines changed: 4 additions & 0 deletions
@@ -1076,6 +1076,10 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_Cttz(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::CTTZ, Op);
 }
 
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_FNeg(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::FNEG, Op);
+}
+
 // === Constants ===
 struct ConstantInt_match {
   APInt *BindVal;
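As a usage sketch, the new matcher composes with the existing combinators (m_FMul, m_OneUse, m_Value) exactly as in the RISCVISelLowering.cpp change below. The standalone helper here is hypothetical, written only to show the shape of a caller; the actual patch inlines the match into PerformDAGCombine:

// Hypothetical helper; mirrors the inlined match in PerformDAGCombine below.
static SDValue sinkFNegBelowFMul(SDNode *N, SelectionDAG &DAG) {
  using namespace SDPatternMatch;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue X, Y;
  // m_FNeg binds an ISD::FNEG operand; m_OneUse keeps the rewrite from
  // duplicating an fneg that has other users.
  if (sd_match(N, m_FMul(m_Value(X), m_OneUse(m_FNeg(m_Value(Y))))))
    return DAG.getNode(ISD::FNEG, DL, VT,
                       DAG.getNode(ISD::FMUL, DL, VT, X, Y));
  return SDValue();
}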

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 12 additions & 2 deletions
@@ -20248,6 +20248,17 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return V;
     break;
   case ISD::FMUL: {
+    using namespace SDPatternMatch;
+    SDLoc DL(N);
+    EVT VT = N->getValueType(0);
+    SDValue X, Y;
+    // InstCombine canonicalizes fneg (fmul x, y) -> fmul x, (fneg y), see
+    // hoistFNegAboveFMulFDiv.
+    // Undo this and sink the fneg so we match more fmsub/fnmadd patterns.
+    if (sd_match(N, m_FMul(m_Value(X), m_OneUse(m_FNeg(m_Value(Y))))))
+      return DAG.getNode(ISD::FNEG, DL, VT,
+                         DAG.getNode(ISD::FMUL, DL, VT, X, Y));
+
     // fmul X, (copysign 1.0, Y) -> fsgnjx X, Y
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
@@ -20258,13 +20269,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0->getOperand(0));
     if (!C || !C->getValueAPF().isExactlyValue(+1.0))
       return SDValue();
-    EVT VT = N->getValueType(0);
     if (VT.isVector() || !isOperationLegal(ISD::FCOPYSIGN, VT))
       return SDValue();
     SDValue Sign = N0->getOperand(1);
     if (Sign.getValueType() != VT)
       return SDValue();
-    return DAG.getNode(RISCVISD::FSGNJX, SDLoc(N), VT, N1, N0->getOperand(1));
+    return DAG.getNode(RISCVISD::FSGNJX, DL, VT, N1, N0->getOperand(1));
   }
   case ISD::FADD:
   case ISD::UMAX:

llvm/test/CodeGen/RISCV/double-arith.ll

Lines changed: 162 additions & 0 deletions
@@ -610,6 +610,86 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind {
   ret double %1
 }
 
+define double @fmsub_d_fmul_fneg(double %a, double %b, double %c, double %d) nounwind {
+; CHECKIFD-LABEL: fmsub_d_fmul_fneg:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: fmul.d fa5, fa2, fa3
+; CHECKIFD-NEXT: fmsub.d fa0, fa0, fa1, fa5
+; CHECKIFD-NEXT: ret
+;
+; RV32IZFINXZDINX-LABEL: fmsub_d_fmul_fneg:
+; RV32IZFINXZDINX: # %bb.0:
+; RV32IZFINXZDINX-NEXT: fmul.d a4, a4, a6
+; RV32IZFINXZDINX-NEXT: fmsub.d a0, a0, a2, a4
+; RV32IZFINXZDINX-NEXT: ret
+;
+; RV64IZFINXZDINX-LABEL: fmsub_d_fmul_fneg:
+; RV64IZFINXZDINX: # %bb.0:
+; RV64IZFINXZDINX-NEXT: fmul.d a2, a2, a3
+; RV64IZFINXZDINX-NEXT: fmsub.d a0, a0, a1, a2
+; RV64IZFINXZDINX-NEXT: ret
+;
+; RV32I-LABEL: fmsub_d_fmul_fneg:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s0, a3
+; RV32I-NEXT: mv s1, a2
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: lui a0, 524288
+; RV32I-NEXT: xor a3, a7, a0
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: mv a1, a5
+; RV32I-NEXT: mv a2, a6
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv a4, a0
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: mv a0, s3
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: mv a2, s1
+; RV32I-NEXT: mv a3, s0
+; RV32I-NEXT: call fma
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fmsub_d_fmul_fneg:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -32
+; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: mv s0, a1
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: li a0, -1
+; RV64I-NEXT: slli a0, a0, 63
+; RV64I-NEXT: xor a1, a3, a0
+; RV64I-NEXT: mv a0, a2
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv a2, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: call fma
+; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 32
+; RV64I-NEXT: ret
+  %negd = fneg double %d
+  %fmul = fmul double %c, %negd
+  %1 = call double @llvm.fma.f64(double %a, double %b, double %fmul)
+  ret double %1
+}
+
 define double @fnmadd_d(double %a, double %b, double %c) nounwind {
 ; RV32IFD-LABEL: fnmadd_d:
 ; RV32IFD: # %bb.0:
@@ -877,6 +957,88 @@ define double @fnmadd_d_3(double %a, double %b, double %c) nounwind {
   ret double %neg
 }
 
+define double @fnmadd_d_fmul_fneg(double %a, double %b, double %c, double %d) nounwind {
+; CHECKIFD-LABEL: fnmadd_d_fmul_fneg:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: fmul.d fa5, fa1, fa0
+; CHECKIFD-NEXT: fmsub.d fa0, fa2, fa3, fa5
+; CHECKIFD-NEXT: ret
+;
+; RV32IZFINXZDINX-LABEL: fnmadd_d_fmul_fneg:
+; RV32IZFINXZDINX: # %bb.0:
+; RV32IZFINXZDINX-NEXT: fmul.d a0, a2, a0
+; RV32IZFINXZDINX-NEXT: fmsub.d a0, a4, a6, a0
+; RV32IZFINXZDINX-NEXT: ret
+;
+; RV64IZFINXZDINX-LABEL: fnmadd_d_fmul_fneg:
+; RV64IZFINXZDINX: # %bb.0:
+; RV64IZFINXZDINX-NEXT: fmul.d a0, a1, a0
+; RV64IZFINXZDINX-NEXT: fmsub.d a0, a2, a3, a0
+; RV64IZFINXZDINX-NEXT: ret
+;
+; RV32I-LABEL: fnmadd_d_fmul_fneg:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s0, a7
+; RV32I-NEXT: mv s1, a6
+; RV32I-NEXT: mv s2, a5
+; RV32I-NEXT: mv s3, a4
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: mv a4, a0
+; RV32I-NEXT: lui a3, 524288
+; RV32I-NEXT: xor a3, a1, a3
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: mv a1, a5
+; RV32I-NEXT: mv a2, a4
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv a4, a0
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: mv a0, s3
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: mv a2, s1
+; RV32I-NEXT: mv a3, s0
+; RV32I-NEXT: call fma
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fnmadd_d_fmul_fneg:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -32
+; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: mv s0, a3
+; RV64I-NEXT: mv s1, a2
+; RV64I-NEXT: mv a2, a1
+; RV64I-NEXT: li a1, -1
+; RV64I-NEXT: slli a1, a1, 63
+; RV64I-NEXT: xor a1, a0, a1
+; RV64I-NEXT: mv a0, a2
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv a2, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: call fma
+; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 32
+; RV64I-NEXT: ret
+  %nega = fneg double %a
+  %mul = fmul double %b, %nega
+  %1 = call double @llvm.fma.f64(double %c, double %d, double %mul)
+  ret double %1
+}
 
 define double @fnmadd_nsz(double %a, double %b, double %c) nounwind {
 ; CHECKIFD-LABEL: fnmadd_nsz:
