[DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits (follow-up 2)

houngkoungting · houngkoungting · commit 24287f7f08d3 · 2025-08-08T00:04:32.000+08:00
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16300,45 +16300,37 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
         TLI.isOperationLegal(N0.getOpcode(), VT)) {
       SDValue X = N0.getOperand(0);
       SDValue Y = N0.getOperand(1);
-
-      KnownBits KnownX = DAG.computeKnownBits(X);
-      KnownBits KnownY = DAG.computeKnownBits(Y);
-
       unsigned SrcBits = X.getScalarValueSizeInBits();
       unsigned DstBits = VT.getScalarSizeInBits();
-      unsigned NeededLeadingZeros = SrcBits - DstBits + 1;
-
-      if (KnownX.countMinLeadingZeros() >= NeededLeadingZeros &&
-          KnownY.countMinLeadingZeros() >= NeededLeadingZeros) {
+      // The averages here are unsigned, so every bit dropped by the truncate
+      // must be known zero; sign-extended operands would change the result.
+      APInt UpperBits = APInt::getBitsSetFrom(SrcBits, DstBits);
+      if (DAG.MaskedValueIsZero(X, UpperBits) &&
+          DAG.MaskedValueIsZero(Y, UpperBits)) {
         SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
         SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
         return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
       }
     }
     break;
-
   case ISD::AVGCEILS:
   case ISD::AVGFLOORS:
     if (!LegalOperations && N0.hasOneUse() &&
         TLI.isOperationLegal(N0.getOpcode(), VT)) {
       SDValue X = N0.getOperand(0);
       SDValue Y = N0.getOperand(1);
-
       unsigned SignBitsX = DAG.ComputeNumSignBits(X);
       unsigned SignBitsY = DAG.ComputeNumSignBits(Y);
-
       unsigned SrcBits = X.getScalarValueSizeInBits();
       unsigned DstBits = VT.getScalarSizeInBits();
       unsigned NeededSignBits = SrcBits - DstBits + 1;
-
       if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
         SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
         SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
         return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
       }
     }
     break;
-
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -1,38 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
 
-; CHECK-LABEL: test_avgceil_u
-; CHECK: uhadd v0.8b, v0.8b, v1.8b
+
 define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
-  %ta = trunc <8 x i16> %a to <8 x i8>
-  %tb = trunc <8 x i16> %b to <8 x i8>
-  %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgceil_u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    xtn v1.8b, v1.8h
+; CHECK-NEXT:    uhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+
+  %mask = insertelement <8 x i16> undef, i16 255, i32 0
+  %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+  %ta16 = and <8 x i16> %a, %mask.splat
+  %tb16 = and <8 x i16> %b, %mask.splat
+  %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
+  %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
+  %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
   ret <8 x i8> %res
 }
 
-; CHECK-LABEL: test_avgceil_s
-; CHECK: shadd v0.8b, v0.8b, v1.8b
+
 define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
-  %ta = trunc <8 x i16> %a to <8 x i8>
-  %tb = trunc <8 x i16> %b to <8 x i8>
-  %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgceil_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqxtn v0.8b, v0.8h
+; CHECK-NEXT:    sqxtn v1.8b, v1.8h
+; CHECK-NEXT:    shadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+
+  %smin = insertelement <8 x i16> undef, i16 -128, i32 0
+  %smax = insertelement <8 x i16> undef, i16 127, i32 0
+  %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
+  %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+
+  %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
+  %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
+  %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
+  %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
+
+  %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
+  %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
+  %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
   ret <8 x i8> %res
 }
 
-; CHECK-LABEL: test_avgfloor_u
-; CHECK: urhadd v0.8b, v0.8b, v1.8b
+
 define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
-  %ta = trunc <8 x i16> %a to <8 x i8>
-  %tb = trunc <8 x i16> %b to <8 x i8>
-  %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgfloor_u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    xtn v1.8b, v1.8h
+; CHECK-NEXT:    urhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+
+  %mask = insertelement <8 x i16> undef, i16 255, i32 0
+  %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+  %ta16 = and <8 x i16> %a, %mask.splat
+  %tb16 = and <8 x i16> %b, %mask.splat
+  %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
+  %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
+  %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
   ret <8 x i8> %res
 }
 
-; CHECK-LABEL: test_avgfloor_s
-; CHECK: srhadd v0.8b, v0.8b, v1.8b
+
 define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
-  %ta = trunc <8 x i16> %a to <8 x i8>
-  %tb = trunc <8 x i16> %b to <8 x i8>
-  %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgfloor_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqxtn v0.8b, v0.8h
+; CHECK-NEXT:    sqxtn v1.8b, v1.8h
+; CHECK-NEXT:    srhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+
+  %smin = insertelement <8 x i16> undef, i16 -128, i32 0
+  %smax = insertelement <8 x i16> undef, i16 127, i32 0
+  %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
+  %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+
+  %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
+  %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
+  %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
+  %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
+
+  %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
+  %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
+  %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
   ret <8 x i8> %res
 }
 
@@ -41,3 +94,6 @@ declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
 declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
 declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
 
+declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
+