Responding to review comments

nasherm · nasherm · commit a0873eea5596 · 2025-07-15T13:31:05.000+01:00
- making sure transform only operates on smin nodes
- adding extra tests dealing with interesting edge cases

Change-Id: Ia1114ec9b93c4de3552b867e0d745beccdae69f1
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1143,6 +1143,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                        ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS,
                        ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
                        ISD::STORE, ISD::BUILD_VECTOR});
+  setTargetDAGCombine(ISD::SMIN);
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::LOAD);
 
@@ -20998,23 +20999,18 @@ static SDValue performBuildVectorCombine(SDNode *N,
 
 // A special combine for the sqdmulh family of instructions.
 // smin( sra ( mul( sext v0, sext v1 ) ), SHIFT_AMOUNT ),
-// SATURATING_VAL ) can be reduced to sext(sqdmulh(...))
+// SATURATING_VAL ) can be reduced to sqdmulh(...)
 static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
 
-  if (N->getOpcode() != ISD::TRUNCATE)
+  if (N->getOpcode() != ISD::SMIN)
     return SDValue();
 
   EVT VT = N->getValueType(0);
 
   if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
     return SDValue();
 
-  SDValue SMin = N->getOperand(0);
-
-  if (SMin.getOpcode() != ISD::SMIN)
-    return SDValue();
-
-  ConstantSDNode *Clamp = isConstOrConstSplat(SMin.getOperand(1));
+  ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
 
   if (!Clamp)
     return SDValue();
@@ -21034,8 +21030,8 @@ static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
     return SDValue();
   }
 
-  SDValue Sra = SMin.getOperand(0);
-  if (Sra.getOpcode() != ISD::SRA)
+  SDValue Sra = N->getOperand(0);
+  if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
     return SDValue();
 
   ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
@@ -21062,11 +21058,27 @@ static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
       SExt0Type.getFixedSizeInBits() > 128)
     return SDValue();
 
-  SDValue V0 = SExt0.getOperand(0);
-  SDValue V1 = SExt1.getOperand(0);
+  // Source vectors with width < 64 are illegal and will need to be extended
+  unsigned SourceVectorWidth = SExt0Type.getFixedSizeInBits();
+  SDValue V0 = (SourceVectorWidth < 64) ? SExt0 : SExt0.getOperand(0);
+  SDValue V1 = (SourceVectorWidth < 64) ? SExt1 : SExt1.getOperand(0);
+
+  SDLoc DL(N);
+  SDValue SQDMULH =
+      DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
+  EVT DestVT = N->getValueType(0);
+  if (DestVT.getScalarSizeInBits() > SExt0Type.getScalarSizeInBits())
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
+
+  return SQDMULH;
+}
+
+static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) {
+  if (SDValue V = trySQDMULHCombine(N, DAG)) {
+    return V;
+  }
 
-  SDLoc DL(SMin);
-  return DAG.getNode(AArch64ISD::SQDMULH, DL, SExt0Type, V0, V1);
+  return SDValue();
 }
 
 static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
@@ -21083,10 +21095,6 @@ static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(N0.getOpcode(), DL, VT, Op);
   }
 
-  if (SDValue V = trySQDMULHCombine(N, DAG)) {
-    return DAG.getNode(ISD::TRUNCATE, DL, VT, V);
-  }
-
   // Performing the following combine produces a preferable form for ISEL.
   // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
   if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
@@ -26824,6 +26832,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performAddSubCombine(N, DCI);
   case ISD::BUILD_VECTOR:
     return performBuildVectorCombine(N, DCI, DAG);
+  case ISD::SMIN:
+    return performSMINCombine(N, DAG);
   case ISD::TRUNCATE:
     return performTruncateCombine(N, DAG, DCI);
   case AArch64ISD::ANDS:
diff --git a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
@@ -1,6 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s
 
+
+define <2 x i16> @saturating_2xi16(<2 x i16> %a, <2 x i16> %b) {
+; CHECK-LABEL: saturating_2xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v1.2s, v1.2s, #16
+; CHECK-NEXT:    sqdmulh v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %as = sext <2 x i16> %a to <2 x i32>
+  %bs = sext <2 x i16> %b to <2 x i32>
+  %m = mul <2 x i32> %bs, %as
+  %sh = ashr <2 x i32> %m, splat (i32 15)
+  %ma = tail call <2 x i32> @llvm.smin.v4i32(<2 x i32> %sh, <2 x i32> splat (i32 32767))
+  %t = trunc <2 x i32> %ma to <2 x i16>
+  ret <2 x i16> %t
+}
+
 define <4 x i16> @saturating_4xi16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: saturating_4xi16:
 ; CHECK:       // %bb.0:
@@ -71,3 +90,53 @@ define <8 x i32> @saturating_8xi32(<8 x i32> %a, <8 x i32> %b) {
   %t = trunc <8 x i64> %ma to <8 x i32>
   ret <8 x i32> %t
 }
+
+define <2 x i64> @saturating_2xi32_2xi64(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: saturating_2xi32_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmulh v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ret
+  %as = sext <2 x i32> %a to <2 x i64>
+  %bs = sext <2 x i32> %b to <2 x i64>
+  %m = mul <2 x i64> %bs, %as
+  %sh = ashr <2 x i64> %m, splat (i64 31)
+  %ma = tail call <2 x i64> @llvm.smin.v8i64(<2 x i64> %sh, <2 x i64> splat (i64 2147483647))
+  ret <2 x i64> %ma
+}
+
+define <4 x i16> @unsupported_saturation_value_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: unsupported_saturation_value_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT:    movi v1.4s, #42
+; CHECK-NEXT:    sshr v0.4s, v0.4s, #15
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %as = sext <4 x i16> %a to <4 x i32>
+  %bs = sext <4 x i16> %b to <4 x i32>
+  %m = mul <4 x i32> %bs, %as
+  %sh = ashr <4 x i32> %m, splat (i32 15)
+  %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 42))
+  %t = trunc <4 x i32> %ma to <4 x i16>
+  ret <4 x i16> %t
+}
+
+define <4 x i16> @unsupported_shift_value_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: unsupported_shift_value_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    sshr v0.4s, v0.4s, #3
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %as = sext <4 x i16> %a to <4 x i32>
+  %bs = sext <4 x i16> %b to <4 x i32>
+  %m = mul <4 x i32> %bs, %as
+  %sh = ashr <4 x i32> %m, splat (i32 3)
+  %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 32767))
+  %t = trunc <4 x i32> %ma to <4 x i16>
+  ret <4 x i16> %t
+}