37 changes: 37 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16294,6 +16294,43 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
  // because targets may prefer a wider type during later combines and invert
  // this transform.
  switch (N0.getOpcode()) {
  case ISD::AVGCEILU:
  case ISD::AVGFLOORU:
    if (!LegalOperations && N0.hasOneUse() &&
        TLI.isOperationLegal(N0.getOpcode(), VT)) {
      SDValue X = N0.getOperand(0);
      SDValue Y = N0.getOperand(1);
      unsigned SrcBits = X.getScalarValueSizeInBits();
      unsigned DstBits = VT.getScalarSizeInBits();
      KnownBits KnownX = DAG.computeKnownBits(X);
      KnownBits KnownY = DAG.computeKnownBits(Y);
Contributor: computeKnownBits can be expensive. You should rearrange this code so that you only call computeKnownBits(Y) if the test on the result of computeKnownBits(X) succeeds.
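A minimal sketch of that reordering (illustrative only, using the patch's variable names), so the second computeKnownBits call is reached only when X has already passed the test:

KnownBits KnownX = DAG.computeKnownBits(X);
if (KnownX.countMinLeadingZeros() >= (SrcBits - DstBits)) {
  KnownBits KnownY = DAG.computeKnownBits(Y);
  if (KnownY.countMinLeadingZeros() >= (SrcBits - DstBits)) {
    // ... fold to the narrower AVG node as below ...
  }
}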

      if (KnownX.countMinLeadingZeros() >= (SrcBits - DstBits) &&
          KnownY.countMinLeadingZeros() >= (SrcBits - DstBits)) {
Contributor (suggested change):

-if (KnownX.countMinLeadingZeros() >= (SrcBits - DstBits) &&
-    KnownY.countMinLeadingZeros() >= (SrcBits - DstBits)) {
+if (KnownX.countMaxActiveBits() <= DstBits &&
+    KnownY.countMaxActiveBits() <= DstBits) {

Then you don't need SrcBits.
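Combining this with the reordering above gives a hypothetical form of the whole guard in which SrcBits disappears entirely (a sketch, not the committed code):

KnownBits KnownX = DAG.computeKnownBits(X);
if (KnownX.countMaxActiveBits() <= DstBits) {
  KnownBits KnownY = DAG.computeKnownBits(Y);
  if (KnownY.countMaxActiveBits() <= DstBits) {
    // ... truncate X and Y and rebuild the AVG node ...
  }
}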

Collaborator: You could even do this:

APInt UpperBits = APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
if (DAG.MaskedValueIsZero(X, UpperBits) &&
    DAG.MaskedValueIsZero(Y, UpperBits)) {

Contributor: Or APInt::getBitsSetFrom(SrcBits, DstBits). (Sometimes I think we have too many different helper functions!)
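For clarity, the two helpers mentioned build the same mask over the bits that must be zero (the names Hi and From below are illustrative):

// Both set exactly the top (SrcBits - DstBits) bits of a SrcBits-wide value,
// e.g. SrcBits = 16, DstBits = 8 gives 0xFF00 either way.
APInt Hi = APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
APInt From = APInt::getBitsSetFrom(SrcBits, DstBits);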

        SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
Collaborator: NeededLeadingZeros = SrcBits - DstBits; ? (NeededSignBits is correct though you could use ComputeMaxSignificantBits instead if you wish)
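A sketch of the ComputeMaxSignificantBits alternative (equivalent to the sign-bit test, since ComputeNumSignBits(V) >= SrcBits - DstBits + 1 holds exactly when ComputeMaxSignificantBits(V) <= DstBits):

if (DAG.ComputeMaxSignificantBits(X) <= DstBits &&
    DAG.ComputeMaxSignificantBits(Y) <= DstBits) {
  // ... the signed average can safely be narrowed to VT ...
}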

Collaborator: Sorry, I think you misunderstood - you need to use computeKnownBits.countMinLeadingZeros() >= (SrcBits - DstBits)

        SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
        return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
      }
    }
    break;
  case ISD::AVGCEILS:
  case ISD::AVGFLOORS:
    if (!LegalOperations && N0.hasOneUse() &&
        TLI.isOperationLegal(N0.getOpcode(), VT)) {
      SDValue X = N0.getOperand(0);
      SDValue Y = N0.getOperand(1);
      unsigned SignBitsX = DAG.ComputeNumSignBits(X);
      unsigned SignBitsY = DAG.ComputeNumSignBits(Y);
      unsigned SrcBits = X.getScalarValueSizeInBits();
      unsigned DstBits = VT.getScalarSizeInBits();
      unsigned NeededSignBits = SrcBits - DstBits + 1;

      if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
Collaborator (suggested change):

if (DAG.ComputeNumSignBits(X) >= NeededSignBits &&
    DAG.ComputeNumSignBits(Y) >= NeededSignBits) {

        SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
        SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
        return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
      }
    }
    break;
Collaborator: You should be able to reuse the ISD::ABD code later in the switch statement now - it has near-identical logic.
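One hypothetical shape for that unification (a sketch only; the existing upstream ISD::ABD handling may differ in detail), dispatching on signedness inside a single narrowing path:

case ISD::AVGCEILU:
case ISD::AVGFLOORU:
case ISD::AVGCEILS:
case ISD::AVGFLOORS:
case ISD::ABDU:
case ISD::ABDS:
  if (!LegalOperations && N0.hasOneUse() &&
      TLI.isOperationLegal(N0.getOpcode(), VT)) {
    SDValue X = N0.getOperand(0);
    SDValue Y = N0.getOperand(1);
    unsigned SrcBits = X.getScalarValueSizeInBits();
    unsigned DstBits = VT.getScalarSizeInBits();
    bool IsSigned = N0.getOpcode() == ISD::AVGCEILS ||
                    N0.getOpcode() == ISD::AVGFLOORS ||
                    N0.getOpcode() == ISD::ABDS;
    bool CanNarrow;
    if (IsSigned) {
      // Need SrcBits - DstBits + 1 sign bits so narrowing preserves the value.
      unsigned NeededSignBits = SrcBits - DstBits + 1;
      CanNarrow = DAG.ComputeNumSignBits(X) >= NeededSignBits &&
                  DAG.ComputeNumSignBits(Y) >= NeededSignBits;
    } else {
      // The upper bits of both operands must be known zero.
      APInt UpperBits = APInt::getBitsSetFrom(SrcBits, DstBits);
      CanNarrow = DAG.MaskedValueIsZero(X, UpperBits) &&
                  DAG.MaskedValueIsZero(Y, UpperBits);
    }
    if (CanNarrow)
      return DAG.getNode(N0.getOpcode(), DL, VT,
                         DAG.getNode(ISD::TRUNCATE, DL, VT, X),
                         DAG.getNode(ISD::TRUNCATE, DL, VT, Y));
  }
  break;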

  case ISD::ADD:
  case ISD::SUB:
  case ISD::MUL:
92 changes: 92 additions & 0 deletions llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -0,0 +1,92 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s

define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_u:
; CHECK: // %bb.0:
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: bic v1.8h, #255, lsl #8
; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%mask = insertelement <8 x i16> poison, i16 255, i32 0
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
%avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
%res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}

define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_s:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.8h, #127
; CHECK-NEXT: mvni v3.8h, #127
; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%min = insertelement <8 x i16> poison, i16 -128, i32 0
%min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
%max = insertelement <8 x i16> poison, i16 127, i32 0
%max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat)
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
%avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
%res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}

define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_u:
; CHECK: // %bb.0:
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: bic v1.8h, #255, lsl #8
; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%mask = insertelement <8 x i16> poison, i16 255, i32 0
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
Collaborator: why not use splat (i16 255)? it was added to avoid the messy shufflevector(insertelement) pattern

%avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
%res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}

define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_s:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.8h, #127
; CHECK-NEXT: mvni v3.8h, #127
; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%min = insertelement <8 x i16> poison, i16 -128, i32 0
%min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
%max = insertelement <8 x i16> poison, i16 127, i32 0
%max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat)
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
%avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
%res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}

declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)