[DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits -3

houngkoungting · houngkoungting · commit c8cc2a980301 · 2025-08-08T10:51:33.000+08:00
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16302,9 +16302,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       SDValue Y = N0.getOperand(1);
       unsigned SrcBits = X.getScalarValueSizeInBits();
       unsigned DstBits = VT.getScalarSizeInBits();
-      unsigned MaxBitsX = DAG.ComputeMaxSignificantBits(X);
-      unsigned MaxBitsY = DAG.ComputeMaxSignificantBits(Y);
-      if (MaxBitsX <= DstBits && MaxBitsY <= DstBits) {
+      KnownBits KnownX = DAG.computeKnownBits(X);
+      KnownBits KnownY = DAG.computeKnownBits(Y);
+      if (KnownX.countMinLeadingZeros() >= (SrcBits - DstBits) &&
+          KnownY.countMinLeadingZeros() >= (SrcBits - DstBits)) {
         SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
         SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
         return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
@@ -16322,6 +16323,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       unsigned SrcBits = X.getScalarValueSizeInBits();
       unsigned DstBits = VT.getScalarSizeInBits();
       unsigned NeededSignBits = SrcBits - DstBits + 1;
+
       if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
         SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
         SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -10,8 +10,8 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    uhadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
 
-  %mask = insertelement <8 x i16> undef, i16 255, i32 0
-  %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+  %mask = insertelement <8 x i16> poison, i16 255, i32 0
+  %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
   %ta16 = and <8 x i16> %a, %mask.splat
   %tb16 = and <8 x i16> %b, %mask.splat
   %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
@@ -29,10 +29,10 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    shadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
 
-  %smin = insertelement <8 x i16> undef, i16 -128, i32 0
-  %smax = insertelement <8 x i16> undef, i16 127, i32 0
-  %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
-  %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+  %smin = insertelement <8 x i16> poison, i16 -128, i32 0
+  %smax = insertelement <8 x i16> poison, i16 127, i32 0
+  %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
+  %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
 
   %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
   %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
@@ -73,10 +73,10 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    srhadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
 
-  %smin = insertelement <8 x i16> undef, i16 -128, i32 0
-  %smax = insertelement <8 x i16> undef, i16 127, i32 0
-  %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
-  %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+  %smin = insertelement <8 x i16> poison, i16 -128, i32 0
+  %smax = insertelement <8 x i16> poison, i16 127, i32 0
+  %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
+  %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
 
   %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
   %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)