
Conversation


@RKSimon RKSimon commented Apr 21, 2025

Also add computeKnownBitsForTargetNode handling for X86ISD::FAND

Fixes #136368
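For reference, a minimal standalone model of the known-bits rule this adds for X86ISD::FAND (illustrative only; the patch itself uses llvm::KnownBits and its operator&=, this sketch just restates the rule with plain 64-bit masks): a result bit of a bitwise AND is known zero if it is known zero in either operand, and known one only if it is known one in both.

#include <cstdint>

// Simplified stand-in for llvm::KnownBits, for illustration only.
struct KnownBitsModel {
  uint64_t Zero = 0; // bits known to be 0
  uint64_t One = 0;  // bits known to be 1
};

// Known bits of (a & b): zero if zero in either operand, one only if one
// in both operands. This is the rule that "Known &= Known2" applies below.
KnownBitsModel knownBitsForAnd(KnownBitsModel A, KnownBitsModel B) {
  KnownBitsModel R;
  R.Zero = A.Zero | B.Zero;
  R.One = A.One & B.One;
  return R;
}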


llvmbot commented Apr 21, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Also add computeKnownBitsForTargetNode handling for X86ISD::FAND

Fixes #136368


Full diff: https://github.com/llvm/llvm-project/pull/136618.diff

2 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+56-1)
  • (modified) llvm/test/CodeGen/X86/combine-fcopysign.ll (+12-18)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 993118c52564e..5e46708c7e877 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38487,11 +38487,17 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known.Zero |= Known2.One;
     break;
   }
+  case X86ISD::FAND: {
+    KnownBits Known2;
+    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known &= Known2;
+    break;
+  }
   case X86ISD::FOR: {
     KnownBits Known2;
     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
-
     Known |= Known2;
     break;
   }
@@ -44147,6 +44153,55 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     Known.Zero |= Known2.One;
     break;
   }
+  case X86ISD::FAND: {
+    KnownBits Known2;
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
+                             Known, TLO, Depth + 1))
+      return true;
+
+    if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
+                             OriginalDemandedElts, Known2, TLO, Depth + 1))
+      return true;
+
+    // If all of the demanded bits are known one on one side, return the other.
+    // These bits cannot contribute to the result of the 'and'.
+    if (OriginalDemandedBits.isSubsetOf(Known2.Zero | Known.One))
+      return TLO.CombineTo(Op, Op0);
+    if (OriginalDemandedBits.isSubsetOf(Known.Zero | Known2.One))
+      return TLO.CombineTo(Op, Op1);
+    // If all of the demanded bits in the inputs are known zeros, return zero.
+    if (OriginalDemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
+      return TLO.CombineTo(Op, TLO.DAG.getConstantFP(0.0, SDLoc(Op), VT));
+
+    Known &= Known2;
+    break;
+  }
+  case X86ISD::FOR: {
+    KnownBits Known2;
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
+                             Known, TLO, Depth + 1))
+      return true;
+
+    if (SimplifyDemandedBits(Op0, ~Known.One & OriginalDemandedBits,
+                             OriginalDemandedElts, Known2, TLO, Depth + 1))
+      return true;
+
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'or'.
+    if (OriginalDemandedBits.isSubsetOf(Known2.One | Known.Zero))
+      return TLO.CombineTo(Op, Op0);
+    if (OriginalDemandedBits.isSubsetOf(Known.One | Known2.Zero))
+      return TLO.CombineTo(Op, Op1);
+
+    Known |= Known2;
+    break;
+  }
   case X86ISD::VSHLI: {
     SDValue Op0 = Op.getOperand(0);
     SDValue Op1 = Op.getOperand(1);
diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll
index d7031be3addd9..0443d3aee3801 100644
--- a/llvm/test/CodeGen/X86/combine-fcopysign.ll
+++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll
@@ -252,28 +252,22 @@ define double @PR136368(double %x) {
 ; SSE-LABEL: PR136368:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movapd {{.*#+}} xmm1 = [NaN,NaN]
-; SSE-NEXT:    movapd %xmm0, %xmm2
-; SSE-NEXT:    andpd %xmm1, %xmm2
-; SSE-NEXT:    movsd {{.*#+}} xmm3 = [1.5707963267948966E+0,0.0E+0]
-; SSE-NEXT:    movapd %xmm3, %xmm4
-; SSE-NEXT:    cmpltsd %xmm2, %xmm4
-; SSE-NEXT:    andpd %xmm3, %xmm4
-; SSE-NEXT:    andpd %xmm1, %xmm4
-; SSE-NEXT:    andnpd %xmm0, %xmm1
-; SSE-NEXT:    orpd %xmm4, %xmm1
-; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm1
+; SSE-NEXT:    movsd {{.*#+}} xmm2 = [1.5707963267948966E+0,0.0E+0]
+; SSE-NEXT:    movapd %xmm2, %xmm3
+; SSE-NEXT:    cmpltsd %xmm1, %xmm3
+; SSE-NEXT:    andpd %xmm2, %xmm3
+; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    orpd %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: PR136368:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [NaN,NaN]
-; AVX-NEXT:    # xmm1 = mem[0,0]
-; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = [1.5707963267948966E+0,0.0E+0]
-; AVX-NEXT:    vcmpltsd %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vandpd %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vandnpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vandpd %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = [1.5707963267948966E+0,0.0E+0]
+; AVX-NEXT:    vcmpltsd %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vandpd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vorpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %fabs = tail call double @llvm.fabs.f64(double %x)
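As a reading aid for the SimplifyDemandedBitsForTargetNode hunks above, a small standalone sketch of the X86ISD::FAND reasoning (illustrative only; plain uint64_t masks stand in for APInt/KnownBits, and the return codes stand in for TLO.CombineTo): bits already known zero in one operand need not be demanded from the other, and when one side already fixes every demanded bit the whole node can be folded away.

#include <cstdint>

struct KB { uint64_t Zero = 0, One = 0; }; // known-zero / known-one bits

// 0 = keep the AND, 1 = fold to LHS, 2 = fold to RHS, 3 = fold to +0.0.
int simplifyDemandedAnd(uint64_t Demanded, KB LHS, KB RHS) {
  // Bits known zero in RHS force the result to zero regardless of LHS,
  // so they need not be demanded when recursing into LHS.
  uint64_t DemandedFromLHS = Demanded & ~RHS.Zero;
  (void)DemandedFromLHS; // drives the recursive SimplifyDemandedBits call

  // Every demanded bit is either a known one in RHS (the AND keeps LHS's
  // bit) or a known zero in LHS (the result already matches LHS): fold to LHS.
  if ((Demanded & ~(RHS.One | LHS.Zero)) == 0)
    return 1;
  if ((Demanded & ~(LHS.One | RHS.Zero)) == 0)
    return 2;
  // Every demanded bit is known zero on at least one side: result is zero.
  if ((Demanded & ~(LHS.Zero | RHS.Zero)) == 0)
    return 3;
  return 0;
}

The FOR case in the patch is the dual of this, with known ones taking the place of known zeros.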

Known.Zero |= Known2.One;
break;
}
case X86ISD::FAND: {
Contributor

Why is there no X86ISD::AND/OR handling in this function?

Collaborator Author

X86ISD::AND/OR are scalar (RESULT, EFLAGS) nodes, so they're not relevant here (and I'm not sure we have any tests that would need them yet either).

; SSE-NEXT: movapd %xmm2, %xmm3
; SSE-NEXT: cmpltsd %xmm1, %xmm3
; SSE-NEXT: andpd %xmm2, %xmm3
; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
Contributor

Are we loading an extra 128-bit constant here?

Collaborator Author

Yes - it's not a bitselect pattern, so canonicalizeBitSelect doesn't match; we just happen to end up with the 0x80...0 and 0x7f...f constants in codegen.
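To make that concrete, a small standalone sketch (illustrative only, not the backend code): canonicalizeBitSelect targets the (A & M) | (B & ~M) shape, where one mask is derived from the other, whereas the codegen here ends up with two independent constants that merely happen to be complementary, the 0x7f...f magnitude mask and the 0x80...0 sign mask, so each needs its own load.

#include <cstdint>

// Canonical bit-select shape: a single mask M and its complement.
uint64_t bitSelect(uint64_t A, uint64_t B, uint64_t M) {
  return (A & M) | (B & ~M);
}

// Shape in this codegen: two separate constants that just happen to be
// complements of each other (abs mask and sign mask), with nothing in the
// DAG tying them together as M and ~M.
uint64_t magnitudeOrSign(uint64_t Mag, uint64_t Sign) {
  const uint64_t AbsMask  = 0x7FFFFFFFFFFFFFFFULL; // clears the sign bit
  const uint64_t SignMask = 0x8000000000000000ULL; // keeps only the sign bit
  return (Mag & AbsMask) | (Sign & SignMask);
}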

Contributor

This doesn't look good. We're only removing one andnpd here (and saving 2 movapd and one register), but we have to do one more 128-bit load.



Development

Successfully merging this pull request may close these issues.

Inefficient codegen for copysign(known_zero_sign_bit, x)
