-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[LoongArch] Custom legalizing ConstantFP to avoid float loads #158050
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-loongarch Author: ZhaoQi (zhaoqi5) ChangesPatch is 145.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158050.diff 15 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index c45975431d833..690dd73014e57 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -17,6 +17,8 @@ def NotBoolXor : PatFrags<(ops node:$val),
// LoongArch specific DAG Nodes.
//===----------------------------------------------------------------------===//
+def SDT_LoongArchMOVGR2FR_W
+ : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>;
def SDT_LoongArchMOVGR2FR_W_LA64
: SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>;
def SDT_LoongArchMOVFR2GR_S_LA64
@@ -28,6 +30,8 @@ def SDT_LoongArchFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
// ISD::BRCOND is custom-lowered to LoongArchISD::BRCOND for floating-point
// comparisons to prevent recursive lowering.
def loongarch_brcond : SDNode<"LoongArchISD::BRCOND", SDTBrcond, [SDNPHasChain]>;
+def loongarch_movgr2fr_w
+ : SDNode<"LoongArchISD::MOVGR2FR_W", SDT_LoongArchMOVGR2FR_W>;
def loongarch_movgr2fr_w_la64
: SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>;
def loongarch_movfr2gr_s_la64
@@ -185,6 +189,14 @@ def : PatFpr<fneg, FNEG_S, FPR32>;
def : PatFpr<fabs, FABS_S, FPR32>;
def : PatFpr<fsqrt, FSQRT_S, FPR32>;
def : Pat<(fdiv fpimm1, (fsqrt FPR32:$fj)), (FRSQRT_S FPR32:$fj)>;
+let Predicates = [HasBasicF, IsLA64] in {
+def : Pat<(fdiv (loongarch_movgr2fr_w_la64 (i64 1065353216)), (fsqrt FPR32:$fj)),
+ (FRSQRT_S FPR32:$fj)>;
+} // Predicates = [HasBasicF, IsLA64]
+let Predicates = [HasBasicF, IsLA32] in {
+def : Pat<(fdiv (loongarch_movgr2fr_w (i32 1065353216)), (fsqrt FPR32:$fj)),
+ (FRSQRT_S FPR32:$fj)>;
+} // Predicates = [HasBasicF, IsLA32]
def : Pat<(fcanonicalize FPR32:$fj), (FMAX_S $fj, $fj)>;
def : Pat<(is_fpclass FPR32:$fj, (i32 timm:$mask)),
(SLTU R0, (ANDI (MOVFR2GR_S (FCLASS_S FPR32:$fj)),
@@ -295,6 +307,14 @@ def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>;
// FP reciprocal operation
def : Pat<(fdiv fpimm1, FPR32:$src), (FRECIP_S $src)>;
+let Predicates = [HasBasicF, IsLA64] in {
+def : Pat<(fdiv (loongarch_movgr2fr_w_la64 (i64 1065353216)), FPR32:$src),
+ (FRECIP_S $src)>;
+} // Predicates = [HasBasicF, IsLA64]
+let Predicates = [HasBasicF, IsLA32] in {
+def : Pat<(fdiv (loongarch_movgr2fr_w (i32 1065353216)), FPR32:$src),
+ (FRECIP_S $src)>;
+} // Predicates = [HasBasicF, IsLA32]
let Predicates = [HasFrecipe] in {
// FP approximate reciprocal operation
@@ -350,6 +370,7 @@ def : PatFpr<frint, FRINT_S, FPR32>;
let Predicates = [HasBasicF, IsLA32] in {
// GPR -> FPR
def : Pat<(bitconvert (i32 GPR:$src)), (MOVGR2FR_W GPR:$src)>;
+def : Pat<(loongarch_movgr2fr_w (i32 GPR:$src)), (MOVGR2FR_W GPR:$src)>;
// FPR -> GPR
def : Pat<(i32 (bitconvert FPR32:$src)), (MOVFR2GR_S FPR32:$src)>;
// int -> f32
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 965ad8a0a35c6..daefbaa52d42a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -10,6 +10,21 @@
//
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// LoongArch specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_LoongArchMOVGR2FR_D
+ : SDTypeProfile<1, 1, [SDTCisVT<0, f64>, SDTCisVT<1, i64>]>;
+def SDT_LoongArchMOVGR2FR_D_LO_HI
+ : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>]>;
+
+def loongarch_movgr2fr_d
+ : SDNode<"LoongArchISD::MOVGR2FR_D", SDT_LoongArchMOVGR2FR_D>;
+def loongarch_movgr2fr_d_lo_hi
+ : SDNode<"LoongArchISD::MOVGR2FR_D_LO_HI", SDT_LoongArchMOVGR2FR_D_LO_HI>;
+
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
@@ -147,6 +162,11 @@ def : PatFpr<fneg, FNEG_D, FPR64>;
def : PatFpr<fabs, FABS_D, FPR64>;
def : PatFpr<fsqrt, FSQRT_D, FPR64>;
def : Pat<(fdiv fpimm1, (fsqrt FPR64:$fj)), (FRSQRT_D FPR64:$fj)>;
+let Predicates = [IsLA32] in {
+def : Pat<(fdiv (loongarch_movgr2fr_d_lo_hi (i32 0), (i32 1072693248)),
+ (fsqrt FPR64:$fj)),
+ (FRSQRT_D FPR64:$fj)>;
+} // Predicates = [IsLA32]
def : Pat<(fcopysign FPR64:$fj, FPR32:$fk),
(FCOPYSIGN_D FPR64:$fj, (FCVT_D_S FPR32:$fk))>;
def : Pat<(fcopysign FPR32:$fj, FPR64:$fk),
@@ -252,6 +272,10 @@ def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>;
// FP reciprocal operation
def : Pat<(fdiv fpimm1, FPR64:$src), (FRECIP_D $src)>;
+let Predicates = [IsLA32] in {
+def : Pat<(fdiv (loongarch_movgr2fr_d_lo_hi (i32 0), (i32 1072693248)), FPR64:$src),
+ (FRECIP_D FPR64:$src)>;
+} // Predicates = [IsLA32]
let Predicates = [HasFrecipe] in {
// FP approximate reciprocal operation
@@ -307,9 +331,13 @@ def : Pat<(f64 (sint_to_fp (i64 (sexti32 (i64 GPR:$src))))),
def : Pat<(f64 (sint_to_fp GPR:$src)), (FFINT_D_L (MOVGR2FR_D GPR:$src))>;
def : Pat<(bitconvert GPR:$src), (MOVGR2FR_D GPR:$src)>;
+def : Pat<(loongarch_movgr2fr_d GPR:$src), (MOVGR2FR_D GPR:$src)>;
} // Predicates = [HasBasicD, IsLA64]
let Predicates = [HasBasicD, IsLA32] in {
def : Pat<(f64 (sint_to_fp (i32 GPR:$src))), (FFINT_D_W (MOVGR2FR_W GPR:$src))>;
+
+def : Pat<(f64 (loongarch_movgr2fr_d_lo_hi (i32 GPR:$lo), (i32 GPR:$hi))),
+ (MOVGR2FRH_W (MOVGR2FR_W_64 GPR:$lo), GPR:$hi)>;
} // Predicates = [HasBasicD, IsLA32]
// Convert FP to int
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 634914d3b3fd0..2f10dc9704445 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -190,6 +190,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
setCondCodeAction(FPCCToExpand, MVT::f32, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::FMA, MVT::f32, Legal);
@@ -237,6 +238,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
@@ -549,10 +551,58 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
return lowerVECREDUCE(Op, DAG);
+ case ISD::ConstantFP:
+ return lowerConstantFP(Op, DAG);
}
return SDValue();
}
+SDValue LoongArchTargetLowering::lowerConstantFP(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
+ const APFloat &FPVal = CFP->getValueAPF();
+ SDLoc DL(CFP);
+
+ assert((VT == MVT::f32 && Subtarget.hasBasicF()) ||
+ (VT == MVT::f64 && Subtarget.hasBasicD()));
+
+ // If value is 0.0 or -0.0, just ignore it.
+ if (FPVal.isZero())
+ return SDValue();
+
+ // If lsx enabled, use cheaper 'vldi' instruction if possible.
+ if (Subtarget.hasExtLSX() && isFPImmVLDILegal(FPVal, VT))
+ return SDValue();
+
+ // Construct as integer, and move to float register.
+ APInt INTVal = FPVal.bitcastToAPInt();
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected floating point type!");
+ break;
+ case MVT::f32: {
+ SDValue NewVal = DAG.getConstant(INTVal, DL, MVT::i32);
+ if (Subtarget.is64Bit())
+ NewVal = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, NewVal);
+ return DAG.getNode(Subtarget.is64Bit() ? LoongArchISD::MOVGR2FR_W_LA64
+ : LoongArchISD::MOVGR2FR_W,
+ DL, VT, NewVal);
+ }
+ case MVT::f64: {
+ if (Subtarget.is64Bit()) {
+ SDValue NewVal = DAG.getConstant(INTVal, DL, MVT::i64);
+ return DAG.getNode(LoongArchISD::MOVGR2FR_D, DL, VT, NewVal);
+ }
+ SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
+ SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
+ return DAG.getNode(LoongArchISD::MOVGR2FR_D_LO_HI, DL, VT, Lo, Hi);
+ }
+ }
+
+ return SDValue();
+}
+
// Lower vecreduce_add using vhaddw instructions.
// For Example:
// call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
@@ -7041,7 +7091,10 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SRL_W)
NODE_NAME_CASE(BSTRINS)
NODE_NAME_CASE(BSTRPICK)
+ NODE_NAME_CASE(MOVGR2FR_W)
NODE_NAME_CASE(MOVGR2FR_W_LA64)
+ NODE_NAME_CASE(MOVGR2FR_D)
+ NODE_NAME_CASE(MOVGR2FR_D_LO_HI)
NODE_NAME_CASE(MOVFR2GR_S_LA64)
NODE_NAME_CASE(FTINT)
NODE_NAME_CASE(BUILD_PAIR_F64)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 9d14934a9d363..c98b29d400dd6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -57,7 +57,10 @@ enum NodeType : unsigned {
MOD_WU,
// FPR<->GPR transfer operations
+ MOVGR2FR_W,
MOVGR2FR_W_LA64,
+ MOVGR2FR_D,
+ MOVGR2FR_D_LO_HI,
MOVFR2GR_S_LA64,
MOVFCSR2GR,
MOVGR2FCSR,
@@ -397,6 +400,7 @@ class LoongArchTargetLowering : public TargetLowering {
SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
index da8c3e93f6842..d111cf2fcfc07 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
@@ -226,8 +226,8 @@ define i32 @caller_half_in_fregs() nounwind {
; LA32F-ILP32D: # %bb.0:
; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32F-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA32F-ILP32D-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI1_0)
+; LA32F-ILP32D-NEXT: lu12i.w $a0, -12
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
; LA32F-ILP32D-NEXT: ori $a0, $zero, 1
; LA32F-ILP32D-NEXT: ori $a1, $zero, 2
; LA32F-ILP32D-NEXT: ori $a2, $zero, 3
@@ -264,8 +264,8 @@ define i32 @caller_half_in_fregs() nounwind {
; LA32D-ILP32D: # %bb.0:
; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32D-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA32D-ILP32D-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI1_0)
+; LA32D-ILP32D-NEXT: lu12i.w $a0, -12
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
; LA32D-ILP32D-NEXT: ori $a0, $zero, 1
; LA32D-ILP32D-NEXT: ori $a1, $zero, 2
; LA32D-ILP32D-NEXT: ori $a2, $zero, 3
@@ -283,8 +283,9 @@ define i32 @caller_half_in_fregs() nounwind {
; LA64S: # %bb.0:
; LA64S-NEXT: addi.d $sp, $sp, -16
; LA64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA64S-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI1_0)
+; LA64S-NEXT: lu12i.w $a0, -12
+; LA64S-NEXT: lu32i.d $a0, 0
+; LA64S-NEXT: movgr2fr.w $fa0, $a0
; LA64S-NEXT: ori $a0, $zero, 1
; LA64S-NEXT: ori $a1, $zero, 2
; LA64S-NEXT: ori $a2, $zero, 3
@@ -324,8 +325,9 @@ define i32 @caller_half_in_fregs() nounwind {
; LA64F-LP64D: # %bb.0:
; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64F-LP64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA64F-LP64D-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI1_0)
+; LA64F-LP64D-NEXT: lu12i.w $a0, -12
+; LA64F-LP64D-NEXT: lu32i.d $a0, 0
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
; LA64F-LP64D-NEXT: ori $a0, $zero, 1
; LA64F-LP64D-NEXT: ori $a1, $zero, 2
; LA64F-LP64D-NEXT: ori $a2, $zero, 3
@@ -365,8 +367,9 @@ define i32 @caller_half_in_fregs() nounwind {
; LA64D-LP64D: # %bb.0:
; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64D-LP64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA64D-LP64D-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI1_0)
+; LA64D-LP64D-NEXT: lu12i.w $a0, -12
+; LA64D-LP64D-NEXT: lu32i.d $a0, 0
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
; LA64D-LP64D-NEXT: ori $a0, $zero, 1
; LA64D-LP64D-NEXT: ori $a1, $zero, 2
; LA64D-LP64D-NEXT: ori $a2, $zero, 3
@@ -606,24 +609,24 @@ define i32 @caller_half_in_gregs() nounwind {
; LA32F-ILP32D: # %bb.0:
; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32F-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; LA32F-ILP32D-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI3_0)
-; LA32F-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1)
-; LA32F-ILP32D-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI3_1)
-; LA32F-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_2)
-; LA32F-ILP32D-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI3_2)
-; LA32F-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_3)
-; LA32F-ILP32D-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI3_3)
-; LA32F-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_4)
-; LA32F-ILP32D-NEXT: fld.s $fa4, $a0, %pc_lo12(.LCPI3_4)
-; LA32F-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_5)
-; LA32F-ILP32D-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI3_5)
-; LA32F-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_6)
-; LA32F-ILP32D-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI3_6)
-; LA32F-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_7)
-; LA32F-ILP32D-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI3_7)
-; LA32F-ILP32D-NEXT: lu12i.w $a0, -12
-; LA32F-ILP32D-NEXT: ori $a0, $a0, 2176
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -12
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a1
+; LA32F-ILP32D-NEXT: ori $a0, $a1, 2176
+; LA32F-ILP32D-NEXT: lu12i.w $a2, -13
+; LA32F-ILP32D-NEXT: ori $a2, $a2, 3072
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a2
+; LA32F-ILP32D-NEXT: ori $a2, $a1, 512
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa2, $a2
+; LA32F-ILP32D-NEXT: ori $a2, $a1, 1024
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa3, $a2
+; LA32F-ILP32D-NEXT: ori $a2, $a1, 1280
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa4, $a2
+; LA32F-ILP32D-NEXT: ori $a2, $a1, 1536
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa5, $a2
+; LA32F-ILP32D-NEXT: ori $a2, $a1, 1792
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa6, $a2
+; LA32F-ILP32D-NEXT: ori $a1, $a1, 2048
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa7, $a1
; LA32F-ILP32D-NEXT: ori $a1, $zero, 10
; LA32F-ILP32D-NEXT: bl callee_half_in_gregs
; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
@@ -656,24 +659,24 @@ define i32 @caller_half_in_gregs() nounwind {
; LA32D-ILP32D: # %bb.0:
; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32D-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; LA32D-ILP32D-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI3_0)
-; LA32D-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1)
-; LA32D-ILP32D-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI3_1)
-; LA32D-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_2)
-; LA32D-ILP32D-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI3_2)
-; LA32D-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_3)
-; LA32D-ILP32D-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI3_3)
-; LA32D-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_4)
-; LA32D-ILP32D-NEXT: fld.s $fa4, $a0, %pc_lo12(.LCPI3_4)
-; LA32D-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_5)
-; LA32D-ILP32D-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI3_5)
-; LA32D-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_6)
-; LA32D-ILP32D-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI3_6)
-; LA32D-ILP32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_7)
-; LA32D-ILP32D-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI3_7)
-; LA32D-ILP32D-NEXT: lu12i.w $a0, -12
-; LA32D-ILP32D-NEXT: ori $a0, $a0, 2176
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -12
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a1
+; LA32D-ILP32D-NEXT: ori $a0, $a1, 2176
+; LA32D-ILP32D-NEXT: lu12i.w $a2, -13
+; LA32D-ILP32D-NEXT: ori $a2, $a2, 3072
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a2
+; LA32D-ILP32D-NEXT: ori $a2, $a1, 512
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa2, $a2
+; LA32D-ILP32D-NEXT: ori $a2, $a1, 1024
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa3, $a2
+; LA32D-ILP32D-NEXT: ori $a2, $a1, 1280
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa4, $a2
+; LA32D-ILP32D-NEXT: ori $a2, $a1, 1536
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa5, $a2
+; LA32D-ILP32D-NEXT: ori $a2, $a1, 1792
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa6, $a2
+; LA32D-ILP32D-NEXT: ori $a1, $a1, 2048
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa7, $a1
; LA32D-ILP32D-NEXT: ori $a1, $zero, 10
; LA32D-ILP32D-NEXT: bl callee_half_in_gregs
; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
@@ -684,25 +687,33 @@ define i32 @caller_half_in_gregs() nounwind {
; LA64S: # %bb.0:
; LA64S-NEXT: addi.d $sp, $sp, -16
; LA64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; LA64S-NEXT: fld.s $ft0, $a0, %pc_lo12(.LCPI3_0)
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1)
-; LA64S-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI3_1)
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_2)
-; LA64S-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI3_2)
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_3)
-; LA64S-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI3_3)
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_4)
-; LA64S-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI3_4)
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_5)
-; LA64S-NEXT: fld.s $fa4, $a0, %pc_lo12(.LCPI3_5)
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_6)
-; LA64S-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI3_6)
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_7)
-; LA64S-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI3_7)
-; LA64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_8)
-; LA64S-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI3_8)
-; LA64S-NEXT: movfr2gr.s $a0, $ft0
+; LA64S-NEXT: lu12i.w $a1, -12
+; LA64S-NEXT: ori $a0, $a1, 2176
+; LA64S-NEXT: ori $a2, $a1, 512
+; LA64S-NEXT: ori $a3, $a1, 1024
+; LA64S-NEXT: ori $a4, $a1, 1280
+; LA64S-NEXT: ori $a5, $a1, 1536
+; LA64S-NEXT: ori $a6, $a1, 1792
+; LA64S-NEXT: ori $a7, $a1, 2048
+; LA64S-NEXT: lu32i.d $a1, 0
+; LA64S-NEXT: movgr2fr.w $fa1, $a1
+; LA64S-NEXT: lu12i.w $a1, -13
+; LA64S-NEXT: ori $a1, $a1, 3072
+; LA64S-NEXT: lu32i.d $a1, 0
+; LA64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64S-NEXT: lu32i.d $a2, 0
+; LA64S-NEXT: movgr2fr.w $fa2, $a2
+; LA64S-NEXT: lu32i.d $a3, 0
+; LA64S-NEXT: movgr2fr.w $fa3, $a3
+; LA64S-NEXT: lu32i.d $a4, 0
+; LA64S-NEXT: movgr2fr.w $fa4, $a4
+; LA64S-NEXT: lu32i.d $a5, 0
+; LA64S-NEXT: movgr2fr.w $fa5, $a5
+; LA64S-NEXT: lu32i.d $a0, 0
+; LA64S-NEXT: lu32i.d $a6, 0
+; LA64S-NEXT: movgr2fr.w $fa6, $a6
+; LA64S-NEXT: lu32i.d $a7, 0
+; LA64S-NEXT: movgr2fr.w $fa7, $a7
; LA64S-NEXT: ori $a1, $zero, 10
; LA64S-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs)
; LA64S-NEXT: jirl $ra, $ra, 0
@@ -714,35 +725,27 @@ define i32 @caller_half_in_gregs() nounwind {
; LA64F-LP64S: # %bb.0:
; LA64F-LP64S-NEXT: addi.d $sp, $sp, -32
; LA64F-LP64S-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
-; LA64F-LP64S-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; LA64F-LP64S-NEXT: fld.s $fa0, $a0, %pc_lo...
[truncated]
|
9f20903
to
df48119
Compare
// generate the INTVal, fallback to use floating point load from the | ||
// constant pool. | ||
auto Seq = LoongArchMatInt::generateInstSeq(INTVal.getSExtValue()); | ||
if (Seq.size() > MaterializeFPImmInsNum && !FPVal.isExactlyValue(+1.0)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should the maximum instruction threshold also apply to f32
? and why is +1.0
treated as a special case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should the maximum instruction threshold also apply to
f32
?
f32
requires a maximum of two instructions and a movgr2fr.w
, which may always be cheaper than loading from the constant pool. But if we wish to also control the behavior of dealing with f32
using this option, it can also be applied. Do you think it is necessary?
and why is
+1.0
treated as a special case?
+1.0
only needs one lu52i.d
, so we hope it always enters the following custom processing. The special treatment here is just to prevent it from returning when MaterializeFPImmInsNum
equals 0. Otherwise, it will match worse code in .td
.
By the way, la32
should perhaps always load non-zero f64
from the constant pool, because there seems to be no benefit gained from customizing it. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By the way,
la32
should perhaps always load non-zero f64
from the constant pool, because there seems to be no benefit gained from customizing it. What do you think?
Oh, perhaps it can benefit from customizing it if several values are loaded, or if the loaded value will be moved to an integer register immediately.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
f32
requires a maximum of two instructions and a movgr2fr.w
, which may always be cheaper than loading from the constant pool. But if we wish to also control the behavior of dealing with f32
using this option, it can also be applied. Do you think it is necessary?
The MaterializeFPImmInsNum
option applies to a range of 0-4 instructions, and f32
also falls within this range. Unless explicitly documented, making f32
a special case would be confusing.
In addition, Should MaterializeFPImmInsNum
include the instruction count for movgr2fr[h]
?
la32
should perhaps always load non-zero f64
from the constant pool, because there seems to be no benefit gained from customizing it. What do you think?
I’d prefer this to apply to both LA32 and LA64. If we count all instructions, how about using different thresholds for LA32 and LA64 (when a single value doesn’t work for both)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for your suggestions!
Taking movgr2fr[h]
into account is reasonable. What do you think about setting MaterializeFPImmInsNum
to 3
as the default for both LA32 and LA64? That would mean:
- For
f32
on both LA32 and LA64:2 insts + movgr2fr.w
; (will cover allf32
values) - For
f64
on LA64:2 insts + movgr2fr.d
; - For
f64
on LA32:1 inst + movgr2fr.w + movgr2frh.w
. (same inst latency as using constant pool)
The range of MaterializeFPImmInsNum
will be 0,2-6
. (6 behaves same as 5 on LA64.)
TODO: `frecip` and `frsqrt` cannot match.
df48119
to
9efea04
Compare
9efea04
to
fb14bc2
Compare
Rust corebenches (geomean):
|
No description provided.