
Commit 28fdda6

[RISCV] Use SLLI.UW in double-SHL_ADD multiplications (#166728)
Similarly to multiplications by 3/5/9 << N, emit the SHL first for other SHL_ADD multiplications if it can be folded into SLLI.UW.
1 parent bf1b866 commit 28fdda6
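
For intuition, here is a minimal standalone C++ sketch of the identity the fold relies on (helper names such as slli_uw and shadd are illustrative models of the instruction semantics, not code from this patch): the shift may be evaluated before or after the shXadd chain, and when it comes first the leading (x & 0xffffffff) << N is exactly what SLLI.UW computes, so the zext's AND folds away.

// Sketch of the reassociation behind the SLLI.UW fold (assumed helper names).
#include <cassert>
#include <cstdint>

// SLLI.UW rd, rs, N: zero-extend the low 32 bits of rs, then shift left by N.
static uint64_t slli_uw(uint64_t rs, unsigned n) {
  return (rs & UINT64_C(0xffffffff)) << n;
}

// Zba shNadd rd, rs1, rs2: (rs1 << N) + rs2.
static uint64_t shadd(unsigned n, uint64_t rs1, uint64_t rs2) {
  return (rs1 << n) + rs2;
}

int main() {
  uint64_t x = 0xffff0000ffff0000ULL; // only the low 32 bits survive the zext
  uint64_t zx = x & UINT64_C(0xffffffff);

  // mul (zext x), 40 with 40 = 5 << 3.
  // Shift last: the AND feeds the shXadd and cannot fold into the SHL.
  uint64_t shiftLast = shadd(2, zx, zx) << 3; // (5 * zx) << 3
  // Shift first: the leading zext + shl is a single SLLI.UW.
  uint64_t t = slli_uw(x, 3);                 // zx << 3
  uint64_t shiftFirst = shadd(2, t, t);       // 5 * (zx << 3)

  assert(shiftLast == shiftFirst && shiftFirst == zx * 40);
  return 0;
}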

File tree: 2 files changed (+93, -41)
  llvm/lib/Target/RISCV/RISCVISelLowering.cpp
  llvm/test/CodeGen/RISCV/rv64zba.ll
llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 37 additions & 41 deletions
@@ -16496,30 +16496,50 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG,
 }
 
 static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX,
-                               unsigned ShY, bool AddX) {
+                               unsigned ShY, bool AddX, unsigned Shift) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
   SDValue X = N->getOperand(0);
-  SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+  // Put the shift first if we can fold a zext into the shift forming a slli.uw.
+  using namespace SDPatternMatch;
+  if (Shift != 0 &&
+      sd_match(X, m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
+    X = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT));
+    Shift = 0;
+  }
+  SDValue ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
                                DAG.getTargetConstant(ShY, DL, VT), X);
-  return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
-                     DAG.getTargetConstant(ShX, DL, VT), AddX ? X : Mul359);
+  if (ShX != 0)
+    ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, ShlAdd,
+                         DAG.getTargetConstant(ShX, DL, VT), AddX ? X : ShlAdd);
+  if (Shift == 0)
+    return ShlAdd;
+  // Otherwise, put the shl last so that it can fold with following instructions
+  // (e.g. sext or add).
+  return DAG.getNode(ISD::SHL, DL, VT, ShlAdd, DAG.getConstant(Shift, DL, VT));
 }
 
 static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG,
-                                       uint64_t MulAmt) {
-  // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X))
+                                       uint64_t MulAmt, unsigned Shift) {
   switch (MulAmt) {
+  // 3/5/9 -> (shYadd X, X)
+  case 3:
+    return getShlAddShlAdd(N, DAG, 0, 1, /*AddX=*/false, Shift);
+  case 5:
+    return getShlAddShlAdd(N, DAG, 0, 2, /*AddX=*/false, Shift);
+  case 9:
+    return getShlAddShlAdd(N, DAG, 0, 3, /*AddX=*/false, Shift);
+  // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X))
   case 5 * 3:
-    return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false);
+    return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false, Shift);
   case 9 * 3:
-    return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false);
+    return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false, Shift);
   case 5 * 5:
-    return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false);
+    return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false, Shift);
   case 9 * 5:
-    return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false);
+    return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false, Shift);
   case 9 * 9:
-    return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false);
+    return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false, Shift);
   default:
     break;
   }
@@ -16529,7 +16549,7 @@ static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG,
   if (int ShY = isShifted359(MulAmt - 1, ShX)) {
     assert(ShX != 0 && "MulAmt=4,6,10 handled before");
     if (ShX <= 3)
-      return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true);
+      return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true, Shift);
   }
   return SDValue();
 }
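
For reference, a small standalone sketch of which multiplier each (ShX, ShY, AddX) combination in the code above stands for (shlAddMultiplier is an illustrative name, not part of the patch):

// Multiplier represented by getShlAddShlAdd's (ShX, ShY, AddX) parameters,
// modelled on plain integers. Sketch only, not LLVM API.
#include <cassert>
#include <cstdint>

static uint64_t shlAddMultiplier(unsigned ShX, unsigned ShY, bool AddX) {
  uint64_t M = (UINT64_C(1) << ShY) + 1; // shYadd X, X  ->  (2^ShY + 1) * X
  if (ShX != 0)                          // shXadd M, (AddX ? X : M)
    M = (M << ShX) + (AddX ? 1 : M);
  return M;
}

int main() {
  // 3/5/9 -> single shYadd.
  assert(shlAddMultiplier(0, 1, false) == 3);
  assert(shlAddMultiplier(0, 3, false) == 9);
  // 3/5/9 * 3/5/9 -> chained shXadd of a shYadd.
  assert(shlAddMultiplier(2, 1, false) == 5 * 3);
  assert(shlAddMultiplier(3, 3, false) == 9 * 9);
  // (2/4/8 * 3/5/9 + 1): the AddX path, e.g. 41 = 8 * 5 + 1.
  assert(shlAddMultiplier(3, 2, true) == 41);
  // The odd factor of zext_mul44 in the tests below: 11 = 2 * 5 + 1.
  assert(shlAddMultiplier(1, 2, true) == 11);
  return 0;
}
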
@@ -16569,42 +16589,18 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
   // real regressions, and no other target properly freezes X in these cases
   // either.
   if (Subtarget.hasShlAdd(3)) {
-    SDValue X = N->getOperand(0);
-    int Shift;
-    if (int ShXAmount = isShifted359(MulAmt, Shift)) {
-      // 3/5/9 * 2^N -> shl (shXadd X, X), N
-      SDLoc DL(N);
-      // Put the shift first if we can fold a zext into the shift forming
-      // a slli.uw.
-      if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
-          X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
-        SDValue Shl =
-            DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT));
-        return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
-                           DAG.getTargetConstant(ShXAmount, DL, VT), Shl);
-      }
-      // Otherwise, put the shl second so that it can fold with following
-      // instructions (e.g. sext or add).
-      SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
-                                   DAG.getTargetConstant(ShXAmount, DL, VT), X);
-      return DAG.getNode(ISD::SHL, DL, VT, Mul359,
-                         DAG.getConstant(Shift, DL, VT));
-    }
-
+    // 3/5/9 * 2^N -> (shl (shXadd X, X), N)
     // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
     // of 25 which happen to be quite common.
     // (2/4/8 * 3/5/9 + 1) * 2^N
-    Shift = llvm::countr_zero(MulAmt);
-    if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) {
-      if (Shift == 0)
-        return V;
-      SDLoc DL(N);
-      return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT));
-    }
+    unsigned Shift = llvm::countr_zero(MulAmt);
+    if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift, Shift))
+      return V;
 
     // If this is a power 2 + 2/4/8, we can use a shift followed by a single
     // shXadd. First check if this a sum of two power of 2s because that's
     // easy. Then count how many zeros are up to the first bit.
+    SDValue X = N->getOperand(0);
     if (Shift >= 1 && Shift <= 3 && isPowerOf2_64(MulAmt & (MulAmt - 1))) {
       unsigned ShiftAmt = llvm::countr_zero((MulAmt & (MulAmt - 1)));
       SDLoc DL(N);
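
The rewritten driver peels the power-of-two part off MulAmt up front and passes both pieces down. A quick sketch of that decomposition for the two constants exercised by the new tests, with std::countr_zero standing in for llvm::countr_zero (sketch only, not the patch's code):

// Decomposition mirrored from expandMul: MulAmt = OddFactor << Shift.
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  // zext_mul44: 44 = 11 << 2, and 11 = 2 * 5 + 1 (the AddX form).
  uint64_t MulAmt = 44;
  unsigned Shift = std::countr_zero(MulAmt);
  assert(Shift == 2 && (MulAmt >> Shift) == 11);

  // zext_mul100: 100 = 25 << 2, and 25 = 5 * 5 (the shXadd-of-shYadd form).
  MulAmt = 100;
  Shift = std::countr_zero(MulAmt);
  assert(Shift == 2 && (MulAmt >> Shift) == 25);
  return 0;
}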

llvm/test/CodeGen/RISCV/rv64zba.ll

Lines changed: 56 additions & 0 deletions
@@ -1459,6 +1459,34 @@ define i64 @mul288(i64 %a) {
   ret i64 %c
 }
 
+define i64 @zext_mul44(i32 signext %a) {
+; RV64I-LABEL: zext_mul44:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 11
+; RV64I-NEXT:    slli a1, a1, 34
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    mulhu a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: zext_mul44:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    slli.uw a0, a0, 2
+; RV64ZBA-NEXT:    sh2add a1, a0, a0
+; RV64ZBA-NEXT:    sh1add a0, a1, a0
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: zext_mul44:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    slli a0, a0, 32
+; RV64XANDESPERF-NEXT:    srli a0, a0, 30
+; RV64XANDESPERF-NEXT:    nds.lea.w a1, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a0, a1
+; RV64XANDESPERF-NEXT:    ret
+  %b = zext i32 %a to i64
+  %c = mul i64 %b, 44
+  ret i64 %c
+}
+
 define i64 @zext_mul68(i32 signext %a) {
 ; RV64I-LABEL: zext_mul68:
 ; RV64I:       # %bb.0:
@@ -1511,6 +1539,34 @@ define i64 @zext_mul96(i32 signext %a) {
   ret i64 %c
 }
 
+define i64 @zext_mul100(i32 signext %a) {
+; RV64I-LABEL: zext_mul100:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 25
+; RV64I-NEXT:    slli a1, a1, 34
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    mulhu a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: zext_mul100:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    slli.uw a0, a0, 2
+; RV64ZBA-NEXT:    sh2add a0, a0, a0
+; RV64ZBA-NEXT:    sh2add a0, a0, a0
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: zext_mul100:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    slli a0, a0, 32
+; RV64XANDESPERF-NEXT:    srli a0, a0, 30
+; RV64XANDESPERF-NEXT:    nds.lea.w a0, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.w a0, a0, a0
+; RV64XANDESPERF-NEXT:    ret
+  %b = zext i32 %a to i64
+  %c = mul i64 %b, 100
+  ret i64 %c
+}
+
 define i64 @zext_mul160(i32 signext %a) {
 ; RV64I-LABEL: zext_mul160:
 ; RV64I:       # %bb.0:
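
As a cross-check, here is a standalone scalar model of the two new RV64ZBA sequences above (slli_uw/sh1add/sh2add are illustrative helpers mirroring the instruction semantics, not code from the test):

// Scalar model of the zext_mul44 and zext_mul100 RV64ZBA outputs. Sketch only.
#include <cassert>
#include <cstdint>

static uint64_t slli_uw(uint64_t rs, unsigned n) { return (rs & UINT64_C(0xffffffff)) << n; }
static uint64_t sh1add(uint64_t rs1, uint64_t rs2) { return (rs1 << 1) + rs2; }
static uint64_t sh2add(uint64_t rs1, uint64_t rs2) { return (rs1 << 2) + rs2; }

int main() {
  uint32_t a = 0xdeadbeef;
  uint64_t zext_a = a;

  // zext_mul44: slli.uw a0, a0, 2; sh2add a1, a0, a0; sh1add a0, a1, a0
  uint64_t a0 = slli_uw(a, 2);  // (zext a) << 2
  uint64_t a1 = sh2add(a0, a0); // 5 * a0
  a0 = sh1add(a1, a0);          // 2 * a1 + a0 = 11 * a0 = 44 * zext a
  assert(a0 == zext_a * 44);

  // zext_mul100: slli.uw a0, a0, 2; sh2add a0, a0, a0; sh2add a0, a0, a0
  a0 = slli_uw(a, 2);           // (zext a) << 2
  a0 = sh2add(a0, a0);          // 5 * a0
  a0 = sh2add(a0, a0);          // 25 * ((zext a) << 2) = 100 * zext a
  assert(a0 == zext_a * 100);
  return 0;
}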
