Remove unnnecessary node duplication

chrisjbris · chrisjbris · commit 45e974fb4c94 · 2025-08-07T15:10:49.000-05:00
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -719,6 +719,18 @@ static bool selectSupportsSourceMods(const SDNode *N) {
   return N->getValueType(0) == MVT::f32;
 }
 
+LLVM_READONLY
+static bool buildVectorSupportsSourceMods(const SDNode *N) {
+  if (N->getValueType(0) != MVT::v2f32)
+    return true;
+
+  if (N->getOperand(0)->getOpcode() != ISD::SELECT ||
+      N->getOperand(1)->getOpcode() != ISD::SELECT)
+    return true;
+
+  return false;
+}
+
 // Most FP instructions support source modifiers, but this could be refined
 // slightly.
 LLVM_READONLY
@@ -752,6 +764,8 @@ static bool hasSourceMods(const SDNode *N) {
       return true;
     }
   }
+  case ISD::BUILD_VECTOR:
+    return buildVectorSupportsSourceMods(N);
   case ISD::SELECT:
     return selectSupportsSourceMods(N);
   default:
@@ -4087,9 +4101,10 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                      LHSAND, Zero);
             SDValue Hi =
                 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
-            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
-            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
-            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue LoAnd =
+                DAG.getNode(ISD::AND, SL, MVT::i32, Lo, RHSAND->getOperand(0));
+            SDValue HiAnd =
+                DAG.getNode(ISD::AND, SL, MVT::i32, Hi, RHSAND->getOperand(0));
             SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
             if (AndIndex == 0 || AndIndex == 1)
               return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
@@ -4858,24 +4873,6 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
-    // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
-    // out in this case. For now I've made the logic as specific to the case as
-    // possible, hopefully this can be relaxed in future.
-    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
-      SDValue LHSB = LHS.getOperand(0);
-      SDValue RHSB = RHS.getOperand(0);
-      if (LHSB.getOpcode() == ISD::BITCAST &&
-          RHSB->getOpcode() == ISD::BITCAST) {
-        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
-        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
-        if (LHSB.getValueType() == MVT::f32 &&
-            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
-            RHSBOpTy == MVT::i32)
-          return SDValue();
-      }
-    }
-
     return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
                                      RHS);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1634,12 +1634,12 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_cselect_b32 s1, s1, s3
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v0, -v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1658,10 +1658,10 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, s1, s3
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v0, -v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
@@ -1672,17 +1672,17 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x10
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x18
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-NEXT:    s_bitcmp1_b32 s6, 0
 ; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, -s3, -v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
 ; GFX11-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s1, s1, s3
 ; GFX11-NEXT:    s_cselect_b32 s0, s0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, s1, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, -v0, vcc_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_endpgm