llvm · chrisjbris · Jul 2, 2025 · Jul 2, 2025 · Jul 7, 2025 · Jul 7, 2025
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -602,6 +602,7 @@ namespace {
     SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
                                    SDValue N2, SDValue N3, ISD::CondCode CC);
     SDValue foldSelectOfBinops(SDNode *N);
+    SDValue bitmaskOperandsToSignInstructions(SDNode *N);
     SDValue foldSextSetcc(SDNode *N);
     SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                               const SDLoc &DL);
@@ -12175,6 +12176,73 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
   return SDValue();
 }
 
+// Rewrite an integer AND/OR/XOR that only touches the sign bit as the
+// matching FP sign operation on the bitcast operand:
+//   xor x, signmask   -> fneg (bitcast x)
+//   or  x, signmask   -> fneg (fabs (bitcast x))
+//   and x, ~signmask  -> fabs (bitcast x)
+// \p N is the candidate bitwise node. Returns the new FP-typed value, or an
+// empty SDValue if the target hook declines, the mask is not a constant (or
+// constant splat), the mask does not match, or no FP type has the same
+// scalar width as the integer type.
+static SDValue getBitMaskToInstruction(SDValue N, SelectionDAG &DAG) {
+  unsigned Opc = N.getOpcode();
+  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.shouldFoldSelectWithIdentityConstant(Opc, VT, ISD::SELECT, LHS, RHS))
+    return SDValue();
+
+  ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
+  if (!CRHS)
+    return SDValue();
+
+  // MVT::getFloatingPointVT hits llvm_unreachable for widths without an FP type
+  // (e.g. i8), so reject those before asking for one.
+  unsigned Bits = VT.getScalarSizeInBits();
+  if (Bits != 16 && Bits != 32 && Bits != 64 && Bits != 80 && Bits != 128)
+    return SDValue();
+
+  // AND must clear exactly the sign bit; OR and XOR must set/flip exactly it.
+  const APInt &Mask = CRHS->getAPIntValue();
+  if (Opc == ISD::AND ? !Mask.isMaxSignedValue() : !Mask.isSignMask())
+    return SDValue();
+
+  EVT FT = MVT::getFloatingPointVT(Bits);
+  EVT FVT = VT.isVector() ? VT.changeVectorElementType(FT) : FT;
+  SDLoc SL(N);
+  SDValue Cast = DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
+  if (Opc == ISD::XOR)
+    return DAG.getNode(ISD::FNEG, SL, FVT, Cast);
+
+  // AND yields fabs directly; OR additionally negates it to force the sign on.
+  SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT, Cast);
+  return Opc == ISD::OR ? DAG.getNode(ISD::FNEG, SL, FVT, Abs) : Abs;
+}
+
+SDValue DAGCombiner::bitmaskOperandsToSignInstructions(SDNode *N) {
+  // Rewrite sign-bit mask arms of the SELECT as FNEG/FABS; bail if none match.
+  SDValue TrueVal = N->getOperand(1);
+  SDValue FalseVal = N->getOperand(2);
+  SDValue FPTrue = getBitMaskToInstruction(TrueVal, DAG);
+  SDValue FPFalse = getBitMaskToInstruction(FalseVal, DAG);
+  if (!FPTrue && !FPFalse)
+    return SDValue();
+  // An arm that did not match is only bitcast so both arms share one FP type.
+  SDLoc DL(N);
+  EVT FPVT = FPTrue ? FPTrue.getValueType() : FPFalse.getValueType();
+  FPTrue = FPTrue ? FPTrue : DAG.getNode(ISD::BITCAST, DL, FPVT, TrueVal);
+  FPFalse = FPFalse ? FPFalse : DAG.getNode(ISD::BITCAST, DL, FPVT, FalseVal);
+  SDValue Sel =
+      DAG.getNode(ISD::SELECT, DL, FPVT, N->getOperand(0), FPTrue, FPFalse);
+  return DAG.getNode(ISD::BITCAST, DL, N->getValueType(0), Sel);
+}
+
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -12390,6 +12458,11 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
     return R;
 
+  // Identify bitmask operations that modify only the sign bit
+  // and replace with FNEG or FABS as appropriate.
+  if (SDValue F = bitmaskOperandsToSignInstructions(N))
+    return F;
+
   return SDValue();
 }
 

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15493,6 +15493,27 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
   return SDValue();
 }
 
+bool SITargetLowering::shouldFoldSelectWithIdentityConstant(
+    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
+    SDValue Y) const {
+  // Accept only 32-bit (scalar or per-element) AND/OR/XOR whose second operand
+  // is a constant, or constant splat, equal to the sign mask or its complement.
+  // SelectOpcode and X do not influence the decision.
+  switch (BinOpcode) {
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    break;
+  default:
+    return false;
+  }
+  if (VT.getScalarType() != MVT::i32)
+    return false;
+  const ConstantSDNode *Mask = isConstOrConstSplat(Y);
+  return Mask && (Mask->getAPIntValue().isSignMask() ||
+                  Mask->getAPIntValue().isMaxSignedValue());
+}
+
 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -264,6 +264,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
 
+  bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+                                            unsigned SelectOpcode, SDValue X,
+                                            SDValue Y) const override;
+
 private:
   // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
   // the three offsets (voffset, soffset and instoffset) into the SDValue[3]

diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -349,29 +349,24 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i32 %arg0, -2147483648
   %select0 = select i1 %cond0, i32 %arg1, i32 %fneg0
@@ -550,31 +545,25 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, -v3, v5, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, -v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, -v3, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, -v2, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i64 %arg0, 9223372036854775808
   %select0 = select i1 %cond0, i64 %arg1, i64 %fneg0
@@ -936,25 +925,23 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
-; GCN-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, -v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: cospiD_pattern0:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 0x80000000, vcc_lo
-; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, -v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %i = and i32 %arg, 1
   %i3 = icmp eq i32 %i, 0