@@ -3529,50 +3529,100 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
35293529 case ISD::SREM:
35303530 case ISD::SDIV:
35313531 /*
3532- For sdiv, typical sequence of instructions as per the type and divisor
3533- property is as follows:
3534- Scalar power-of-2: cmp + csel + asr
3535- Vector power-of-2: usra + sshr
3536-
3537- Scalar non-power-2: smulh/smull + asr/lsr + add/sub + asr + add
3538- Vector non-power-2:
3539- a) <2 x i64>: 2 * (smulh + asr + add) --> This yeilds scalarized form.
3540- b) <4 x i32>: smull2 + smull + uzp2 + add + sshr + usra
3541-
3542- SVE versions should have more or less the same cost because sometimes they
3543- yeild native sdiv instructions, which should have less cost or the same
3544- sequence of neon instructions.
3545-
3546- For srem, typical sequence of instructions as per the type and divisor
3547- property is as follows:
3548- Scalar version: <set of sdiv instructions> + msub
3549- Vector version: <set of sdiv instructions> + 2-msub/mls
3532+ Notes for sdiv/srem specific costs:
3533+ 1. This only considers the cases where the divisor is constant, uniform and
3534+ (pow-of-2/non-pow-of-2). Other cases are not important since they either
3535+ result in some form of (ldr + adrp), corresponding to constant vectors, or
3536+ scalarization of the division operation.
3537+ 2. Constant divisors, either negative in whole or partially, don't result in
3538+ significantly different codegen as compared to positive constant divisors.
3539+ So, we don't consider negative divisors separately.
3540+ 3. If the codegen is significantly different with SVE, it has been indicated
3541+ using comments at appropriate places.
3542+
3543+ sdiv specific cases:
3544+ -----------------------------------------------------------------------
3545+ codegen | pow-of-2 | Type
3546+ -----------------------------------------------------------------------
3547+ add + cmp + csel + asr | Y | i64
3548+ add + cmp + csel + asr | Y | i32
3549+ -----------------------------------------------------------------------
3550+
3551+ srem specific cases:
3552+ -----------------------------------------------------------------------
3553+ codegen | pow-of-2 | Type
3554+ -----------------------------------------------------------------------
3555+ negs + and + and + csneg | Y | i64
3556+ negs + and + and + csneg | Y | i32
3557+ -----------------------------------------------------------------------
3558+
3559+ other sdiv/srem cases:
3560+ -------------------------------------------------------------------------
3561+ common codegen | + srem | + sdiv | pow-of-2 | Type
3562+ -------------------------------------------------------------------------
3563+ smulh + asr + add + add | - | - | N | i64
3564+ smull + lsr + add + add | - | - | N | i32
3565+ usra | and + sub | sshr | Y | <2 x i64>
3566+ 2 * (scalar code) | - | - | N | <2 x i64>
3567+ usra | bic + sub | sshr + neg | Y | <4 x i32>
3568+ smull2 + smull + uzp2 | mls | - | N | <4 x i32>
3569+ + sshr + usra | | | |
3570+ -------------------------------------------------------------------------
35503571 */
3551- if (Op2Info.isConstant ()) {
3552- InstructionCost AsrCost =
3553- getArithmeticInstrCost (Instruction::AShr, Ty, CostKind,
3554- Op1Info.getNoProps (), Op2Info.getNoProps ());
3572+ if (Op2Info.isConstant () && Op2Info.isUniform ()) {
35553573 InstructionCost AddCost =
35563574 getArithmeticInstrCost (Instruction::Add, Ty, CostKind,
35573575 Op1Info.getNoProps (), Op2Info.getNoProps ());
3576+ InstructionCost AsrCost =
3577+ getArithmeticInstrCost (Instruction::AShr, Ty, CostKind,
3578+ Op1Info.getNoProps (), Op2Info.getNoProps ());
35583579 InstructionCost MulCost =
35593580 getArithmeticInstrCost (Instruction::Mul, Ty, CostKind,
35603581 Op1Info.getNoProps (), Op2Info.getNoProps ());
3561-
3562- bool HasSMUL = !Op2Info.isPowerOf2 ();
3563- unsigned NumOfSMUL = HasSMUL ? (LT.second .isVector () ? 2 : 1 ) : 0 ;
3564- bool HasExtraAsr =
3565- (LT.second .isVector () || LT.second == MVT::i32 ) && HasSMUL;
3566-
3567- InstructionCost CommonCost = AsrCost + AddCost;
3568- // We typicall get 1 msub for scalar and 2-msub/1-mls for the vector form.
3569- // Typically, the cost of msub is same and mls is twice as costly as
3570- // add/sub/mul.
3571- InstructionCost MlsOrMSubCost = (LT.second .isVector () ? 2 : 1 ) * MulCost;
3572- InstructionCost DivCost =
3573- CommonCost + (MulCost * NumOfSMUL) /* SMULH/SMULH */ +
3574- (AsrCost * HasExtraAsr); // Coming with second SMULH
3575- return DivCost + (ISD == ISD::SREM ? MlsOrMSubCost : 0 );
3582+ // add/cmp/csel/csneg should have similar cost while asr/negs/and should
3583+ // have similar cost.
3584+ if (LT.second .isScalarInteger ()) {
3585+ if (Op2Info.isPowerOf2 ()) {
3586+ return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
3587+ : (3 * AsrCost + AddCost);
3588+ } else {
3589+ return MulCost + AsrCost + 2 * AddCost;
3590+ }
3591+ } else {
3592+ InstructionCost UsraCost = 2 * AsrCost;
3593+ if (Op2Info.isPowerOf2 ()) {
3594+ // Division with scalable types corresponds to native 'asrd'
3595+ // instruction when SVE is available.
3596+ // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
3597+ if (Ty->isScalableTy () && ST->hasSVE ())
3598+ return 2 * AsrCost;
3599+ return UsraCost +
3600+ (ISD == ISD::SDIV
3601+ ? (LT.second .getScalarType () == MVT::i64 ? 1 : 2 ) *
3602+ AsrCost
3603+ : 2 * AddCost);
3604+ } else if (LT.second .is128BitVector () &&
3605+ LT.second .getScalarType () == MVT::i64 ) {
3606+ auto VT = TLI->getValueType (DL, Ty);
3607+ return VT.getVectorNumElements () *
3608+ getArithmeticInstrCost (Opcode, Ty->getScalarType (), CostKind,
3609+ Op1Info.getNoProps (),
3610+ Op2Info.getNoProps ());
3611+ } else {
3612+ // When SVE is available, we get:
3613+ // smulh + lsr + add/sub + asr + add/sub.
3614+ if (Ty->isScalableTy () && ST->hasSVE ())
3615+ return 2 * MulCost /* smulh cost*/ + 2 * AddCost + 2 * AsrCost;
3616+ return 2 * MulCost + AddCost /* uzp2 cost*/ + AsrCost + UsraCost;
3617+ }
3618+ }
3619+ }
3620+ if (Op2Info.isConstant () && !Op2Info.isUniform () &&
3621+ LT.second .isFixedLengthVector ()) {
3622+ auto VT = TLI->getValueType (DL, Ty);
3623+ return VT.getVectorNumElements () *
3624+ getArithmeticInstrCost (Opcode, Ty->getScalarType (), CostKind,
3625+ Op1Info.getNoProps (), Op2Info.getNoProps ());
35763626 }
35773627 [[fallthrough]];
35783628 case ISD::UDIV:
@@ -3612,23 +3662,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
36123662 AddCost * 2 + ShrCost;
36133663 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0 );
36143664 }
3615-
3616- // TODO: Fix SDIV and SREM costs, similar to the above.
3617- if (TLI->isOperationLegalOrCustom (ISD::MULHU, VT) &&
3618- Op2Info.isUniform () && !VT.isScalableVector ()) {
3619- // Vector signed division by constant are expanded to the
3620- // sequence MULHS + ADD/SUB + SRA + SRL + ADD.
3621- InstructionCost MulCost =
3622- getArithmeticInstrCost (Instruction::Mul, Ty, CostKind,
3623- Op1Info.getNoProps (), Op2Info.getNoProps ());
3624- InstructionCost AddCost =
3625- getArithmeticInstrCost (Instruction::Add, Ty, CostKind,
3626- Op1Info.getNoProps (), Op2Info.getNoProps ());
3627- InstructionCost ShrCost =
3628- getArithmeticInstrCost (Instruction::AShr, Ty, CostKind,
3629- Op1Info.getNoProps (), Op2Info.getNoProps ());
3630- return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1 ;
3631- }
36323665 }
36333666
36343667 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
0 commit comments