llvm
diff --git a/‎llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp‎
Lines changed: 87 additions & 54 deletions b/‎llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp‎
Lines changed: 87 additions & 54 deletions
@@ -3529,50 +3529,100 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
   case ISD::SREM:
   case ISD::SDIV:
     /*
-    For sdiv, typical sequence of instructions as per the type and divisor
-    property is as follows:
-    Scalar power-of-2: cmp + csel + asr
-    Vector power-of-2: usra + sshr
-
-    Scalar non-power-2: smulh/smull + asr/lsr + add/sub + asr + add
-    Vector non-power-2:
-      a) <2 x i64>: 2 * (smulh + asr + add)   --> This yeilds scalarized form.
-      b) <4 x i32>: smull2 + smull + uzp2 + add + sshr + usra
-
-    SVE versions should have more or less the same cost because sometimes they
-    yeild native sdiv instructions, which should have less cost or the same
-    sequence of neon instructions.
-
-    For srem, typical sequence of instructions as per the type and divisor
-    property is as follows:
-    Scalar version: <set of sdiv instructions> + msub
-    Vector version: <set of sdiv instructions> + 2-msub/mls
+    Notes for sdiv/srem specific costs:
+    1. This only considers the cases where the divisor is constant, uniform and
+    (pow-of-2/non-pow-of-2). Other cases are not important since they either
+    result in some form of (ldr + adrp), corresponding to constant vectors, or
+    scalarization of the division operation.
+    2. Constant divisors, either negative in whole or partially, don't result in
+    significantly different codegen as compared to positive constant divisors.
+    So, we don't consider negative divisors separately.
+    3. If the codegen is significantly different with SVE, it has been indicated
+    using comments at appropriate places.
+
+    sdiv specific cases:
+    -----------------------------------------------------------------------
+    codegen                       | pow-of-2               | Type
+    -----------------------------------------------------------------------
+    add + cmp + csel + asr        | Y                      | i64
+    add + cmp + csel + asr        | Y                      | i32
+    -----------------------------------------------------------------------
+
+    srem specific cases:
+    -----------------------------------------------------------------------
+    codegen                       | pow-of-2               | Type
+    -----------------------------------------------------------------------
+    negs + and + and + csneg      | Y                      | i64
+    negs + and + and + csneg      | Y                      | i32
+    -----------------------------------------------------------------------
+
+    other sdiv/srem cases:
+    -------------------------------------------------------------------------
+    common codegen            | + srem     | + sdiv     | pow-of-2  | Type
+    -------------------------------------------------------------------------
+    smulh + asr + add + add   | -          | -          | N         | i64
+    smull + lsr + add + add   | -          | -          | N         | i32
+    usra                      | and + sub  | sshr       | Y         | <2 x i64>
+    2 * (scalar code)         | -          | -          | N         | <2 x i64>
+    usra                      | bic + sub  | sshr + neg | Y         | <4 x i32>
+    smull2 + smull + uzp2     | mls        | -          | N         | <4 x i32>
+           + sshr  + usra     |            |            |           |
+    -------------------------------------------------------------------------
     */
-    if (Op2Info.isConstant()) {
-      InstructionCost AsrCost =
-          getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
-                                 Op1Info.getNoProps(), Op2Info.getNoProps());
+    if (Op2Info.isConstant() && Op2Info.isUniform()) {
       InstructionCost AddCost =
           getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
+      InstructionCost AsrCost =
+          getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
+                                 Op1Info.getNoProps(), Op2Info.getNoProps());
       InstructionCost MulCost =
           getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
-
-      bool HasSMUL = !Op2Info.isPowerOf2();
-      unsigned NumOfSMUL = HasSMUL ? (LT.second.isVector() ? 2 : 1) : 0;
-      bool HasExtraAsr =
-          (LT.second.isVector() || LT.second == MVT::i32) && HasSMUL;
-
-      InstructionCost CommonCost = AsrCost + AddCost;
-      // We typicall get 1 msub for scalar and 2-msub/1-mls for the vector form.
-      // Typically, the cost of msub is same and mls is twice as costly as
-      // add/sub/mul.
-      InstructionCost MlsOrMSubCost = (LT.second.isVector() ? 2 : 1) * MulCost;
-      InstructionCost DivCost =
-          CommonCost + (MulCost * NumOfSMUL) /* SMULH/SMULH */ +
-          (AsrCost * HasExtraAsr); // Coming with second SMULH
-      return DivCost + (ISD == ISD::SREM ? MlsOrMSubCost : 0);
+      // add/cmp/csel/csneg should have similar cost while asr/negs/and should
+      // have similar cost.
+      if (LT.second.isScalarInteger()) {
+        if (Op2Info.isPowerOf2()) {
+          return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
+                                  : (3 * AsrCost + AddCost);
+        } else {
+          return MulCost + AsrCost + 2 * AddCost;
+        }
+      } else {
+        InstructionCost UsraCost = 2 * AsrCost;
+        if (Op2Info.isPowerOf2()) {
+          // Division with scalable types corresponds to native 'asrd'
+          // instruction when SVE is available.
+          // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
+          if (Ty->isScalableTy() && ST->hasSVE())
+            return 2 * AsrCost;
+          return UsraCost +
+                 (ISD == ISD::SDIV
+                      ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) *
+                            AsrCost
+                      : 2 * AddCost);
+        } else if (LT.second.is128BitVector() &&
+                   LT.second.getScalarType() == MVT::i64) {
+          auto VT = TLI->getValueType(DL, Ty);
+          return VT.getVectorNumElements() *
+                 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
+                                        Op1Info.getNoProps(),
+                                        Op2Info.getNoProps());
+        } else {
+          // When SVE is available, we get:
+          // smulh + lsr + add/sub + asr + add/sub.
+          if (Ty->isScalableTy() && ST->hasSVE())
+            return 2 * MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
+          return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
+        }
+      }
+    }
+    if (Op2Info.isConstant() && !Op2Info.isUniform() &&
+        LT.second.isFixedLengthVector()) {
+      auto VT = TLI->getValueType(DL, Ty);
+      return VT.getVectorNumElements() *
+             getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
+                                    Op1Info.getNoProps(), Op2Info.getNoProps());
     }
     [[fallthrough]];
   case ISD::UDIV:
@@ -3612,23 +3662,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
                                   AddCost * 2 + ShrCost;
         return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
       }
-
-      // TODO: Fix SDIV and SREM costs, similar to the above.
-      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT) &&
-          Op2Info.isUniform() && !VT.isScalableVector()) {
-        // Vector signed division by constant are expanded to the
-        // sequence MULHS + ADD/SUB + SRA + SRL + ADD.
-        InstructionCost MulCost =
-            getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
-                                   Op1Info.getNoProps(), Op2Info.getNoProps());
-        InstructionCost AddCost =
-            getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
-                                   Op1Info.getNoProps(), Op2Info.getNoProps());
-        InstructionCost ShrCost =
-            getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
-                                   Op1Info.getNoProps(), Op2Info.getNoProps());
-        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
-      }
     }
 
     // div i128's are lowered as libcalls.  Pass nullptr as (u)divti3 calls are