llvm
diff --git a/‎llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Lines changed: 74 additions & 1 deletion b/‎llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Lines changed: 74 additions & 1 deletion
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Lines changed: 0 additions & 89 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Lines changed: 0 additions & 89 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 8 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 8 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIISelLowering.h
Lines changed: 4 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/SIISelLowering.h
Lines changed: 4 additions & 0 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
Lines changed: 10 additions & 8 deletions b/‎llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
Lines changed: 10 additions & 8 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
Lines changed: 9 additions & 9 deletions b/‎llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
Lines changed: 9 additions & 9 deletions
@@ -684,7 +684,7 @@ namespace {
                                   SDValue VecIn2, unsigned LeftIdx,
                                   bool DidSplitVec);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
-
+    SDValue getBitwiseToSrcModifierOp(SDValue N);
     /// Walk up chain skipping non-aliasing memory nodes,
     /// looking for aliasing nodes and adding them to the Aliases vector.
     void GatherAllAliases(SDNode *N, SDValue OriginalChain,
@@ -12172,6 +12172,56 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
   return SDValue();
 }
 
+/// Return the floating-point type whose scalar width equals that of \p VT.
+/// For a vector \p VT only the element type is swapped, so the result is a
+/// vector as well; otherwise the scalar floating-point type is returned.
+static EVT getFloatVT(EVT VT) {
+  EVT ScalarFP = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
+  if (!VT.isVector())
+    return ScalarFP;
+  return VT.changeVectorElementType(ScalarFP);
+}
+
+/// Try to re-express a constant-mask bitwise operation as floating-point
+/// negate/absolute-value nodes.
+///
+/// \p N must be an AND, OR or XOR whose second operand is a constant or a
+/// constant splat, and TLI.shouldFoldSelectWithIdentityConstant must accept
+/// it. The recognized forms are:
+///   xor x, <sign mask>         -> fneg (bitcast x)
+///   or  x, <sign mask>         -> fneg (fabs (bitcast x))
+///   and x, <max signed value>  -> fabs (bitcast x)
+///
+/// \returns the new node, typed as the floating-point counterpart of the
+/// constant operand's type, or an empty SDValue when nothing matches.
+SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) {
+  const unsigned Opcode = N.getOpcode();
+  const bool IsBitwise =
+      Opcode == ISD::AND || Opcode == ISD::XOR || Opcode == ISD::OR;
+  if (!IsBitwise)
+    return SDValue();
+
+  SDValue Src = N.getOperand(0);
+  SDValue Mask = N.getOperand(1);
+
+  // Bail out unless the target hook accepts this opcode/type combination.
+  if (!TLI.shouldFoldSelectWithIdentityConstant(Opcode, N->getValueType(0),
+                                                ISD::SELECT, Src, Mask))
+    return SDValue();
+
+  ConstantSDNode *MaskC = isConstOrConstSplat(Mask);
+  if (!MaskC)
+    return SDValue();
+
+  EVT FloatTy = getFloatVT(Mask.getValueType());
+  SDLoc SL(N);
+  const APInt &Bits = MaskC->getAPIntValue();
+
+  // Reinterpret the non-constant operand as floating point. Only invoked on
+  // a successful match so that no node is created otherwise.
+  auto CastSrc = [&]() { return DAG.getNode(ISD::BITCAST, SL, FloatTy, Src); };
+
+  // Flipping the sign bit is a negate.
+  if (Opcode == ISD::XOR && Bits.isSignMask())
+    return DAG.getNode(ISD::FNEG, SL, FloatTy, CastSrc());
+
+  // Forcing the sign bit on is a negated absolute value.
+  if (Opcode == ISD::OR && Bits.isSignMask()) {
+    SDValue Magnitude = DAG.getNode(ISD::FABS, SL, FloatTy, CastSrc());
+    return DAG.getNode(ISD::FNEG, SL, FloatTy, Magnitude);
+  }
+
+  // Clearing the sign bit is an absolute value.
+  if (Opcode == ISD::AND && Bits.isMaxSignedValue())
+    return DAG.getNode(ISD::FABS, SL, FloatTy, CastSrc());
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -12387,6 +12437,29 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
     return R;
 
+  // Rewrite select(N0, LHS, RHS) as a floating-point select when at least one
+  // arm can be converted by getBitwiseToSrcModifierOp. An arm that was not
+  // converted is bitcast to the same floating-point type, and the select
+  // result is bitcast back to VT. Yields an empty SDValue if neither arm
+  // converts.
+  auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
+    SDValue ModLHS = getBitwiseToSrcModifierOp(LHS);
+    SDValue ModRHS = getBitwiseToSrcModifierOp(RHS);
+    if (!ModLHS && !ModRHS)
+      return SDValue();
+
+    SDLoc SL(N);
+    // Take the floating-point type from whichever arm was converted.
+    EVT FVT = (ModLHS ? ModLHS : ModRHS).getValueType();
+
+    SDValue FLHS = ModLHS;
+    if (!FLHS)
+      FLHS = DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
+
+    SDValue FRHS = ModRHS;
+    if (!FRHS)
+      FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
+
+    SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FLHS, FRHS);
+    return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
+  };
+
+  // Identify bitmask operations that are source mods and create
+  // the relevant fneg, fabs or fneg+fabs.
+  if (VT == MVT::i32 || VT == MVT::v2i32)
+    if (SDValue F = FoldSrcMods(N1, N2, VT))
+      return F;
+
   return SDValue();
 }
 
 
@@ -4936,95 +4936,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
       return SDValue();
     };
 
-    // Support source modifiers on integer operands.
-    if (VT == MVT::i32 || VT == MVT::v2i32)
-      if (SDValue F = FoldSrcMods(True, False, VT))
-        return F;
-
-    // auto SplitSelect = [&]() -> std::pair(
-    //  For i64 if a source modifier is to be folded in we split into two i32
-    //  select of high and low values. The Operator need only be applied to the
-    //  high values in order to change the sign bit.
-    if (VT == MVT::i64) {
-      bool TrueHasModifierOp =
-          (True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
-           True.getOpcode() == ISD::XOR);
-
-      bool FalseHasModifierOp =
-          (False.getOpcode() == ISD::AND || False.getOpcode() == ISD::OR ||
-           False.getOpcode() == ISD::XOR);
-
-      ConstantSDNode *CTrueRHS = nullptr;
-      if (TrueHasModifierOp) {
-        SDValue TrueRHS = True->getOperand(1);
-        CTrueRHS = dyn_cast<ConstantSDNode>(TrueRHS);
-      }
-
-      ConstantSDNode *CFalseRHS = nullptr;
-      if (FalseHasModifierOp) {
-        SDValue FalseRHS = False->getOperand(1);
-        CFalseRHS = dyn_cast<ConstantSDNode>(FalseRHS);
-      }
-
-      // If True or False is a candidate for source modifier folding, extract
-      // the high value using APInt and reconstruct a ConstantSDNode.
-      SDValue TrueHiOp;
-      SDValue BCTrue = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, True);
-      SDValue TrueLo;
-      SDValue TrueHi;
-      if (CTrueRHS) {
-        SDValue TrueLHS = True->getOperand(0);
-        SDValue TrueLHSHiVal = getHiHalf64(BCTrue, DAG);
-        TrueLo = getLoHalf64(TrueLHS, DAG);
-        APInt CTrueRHSHiBits =
-            CTrueRHS->getAPIntValue().getHiBits(32).trunc(32);
-        SDValue CTrueRHSHiVal =
-            DAG.getConstant(CTrueRHSHiBits, SDLoc(N), MVT::i32);
-        unsigned OpcTrue = True.getOpcode();
-        TrueHiOp = DAG.getNode(OpcTrue, SDLoc(N), MVT::i32, TrueLHSHiVal,
-                               CTrueRHSHiVal);
-      } else {
-        TrueLo = getLoHalf64(BCTrue, DAG);
-        TrueHi = getHiHalf64(BCTrue, DAG);
-      }
-
-      SDValue FalseHiOp;
-      SDValue BCFalse = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, False);
-      SDValue FalseLo;
-      SDValue FalseHi;
-      if (CFalseRHS) {
-        SDValue FalseLHS = False->getOperand(0);
-        FalseLo = getLoHalf64(FalseLHS, DAG);
-        SDValue FalseLHSHiVal = getHiHalf64(BCFalse, DAG);
-        APInt CFalseRHSHiBits =
-            CFalseRHS->getAPIntValue().getHiBits(32).trunc(32);
-        SDValue CFalseRHSHiVal =
-            DAG.getConstant(CFalseRHSHiBits, SDLoc(N), MVT::i32);
-        unsigned OpcFalse = False.getOpcode();
-        FalseHiOp = DAG.getNode(OpcFalse, SDLoc(N), MVT::i32, FalseLHSHiVal,
-                                CFalseRHSHiVal);
-      } else {
-        FalseLo = getLoHalf64(BCFalse, DAG);
-        FalseHi = getHiHalf64(BCFalse, DAG);
-      }
-
-      if (CTrueRHS || CFalseRHS) {
-        // Place the low bits directly into the select. The operator is unneeded
-        // for these.
-        SDValue LoSelect =
-            DAG.getNode(ISD::SELECT, SDLoc(N), MVT::i32, Cond, TrueLo, FalseLo);
-        // If a source modifier may be folded use the bitwise-op of the high
-        // values, otherwise just pass the high part of the value.
-        SDValue FoldedHi =
-            FoldSrcMods(CTrueRHS ? TrueHiOp : TrueHi,
-                        CFalseRHS ? FalseHiOp : FalseHi, MVT::i32);
-
-        SDValue ResV =
-            DAG.getBuildVector(MVT::v2i32, SDLoc(N), {FoldedHi, LoSelect});
-        SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, ResV);
-        return Res;
-      }
-    }
   }
 
   // There's no reason to not do this if the condition has other uses.
 
@@ -15491,6 +15491,14 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
   return SDValue();
 }
 
+/// Accept the fold for AND, OR and XOR when the scalar type of \p VT is i32.
+/// \p SelectOpcode, \p X and \p Y do not influence the answer.
+bool SITargetLowering::shouldFoldSelectWithIdentityConstant(
+    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
+    SDValue Y) const {
+  switch (BinOpcode) {
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    // Scalar i32 and vectors of i32 both qualify.
+    return VT.getScalarType() == MVT::i32;
+  default:
+    return false;
+  }
+}
+
 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
 
@@ -264,6 +264,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
 
+  /// Returns true when \p BinOpcode is AND, OR or XOR and the scalar type of
+  /// \p VT is i32; the remaining arguments are not inspected.
+  bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+                                            unsigned SelectOpcode, SDValue X,
+                                            SDValue Y) const override;
+
 private:
   // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
   // the three offsets (voffset, soffset and instoffset) into the SDValue[3]
 
@@ -7145,12 +7145,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX7LESS-NEXT:    s_or_b32 s5, s4, s6
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7LESS-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -8847,12 +8848,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX7LESS-NEXT:    s_or_b32 s5, s4, s6
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7LESS-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 
@@ -913,15 +913,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 $vgpr3, killed $vgpr2, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr53, 0, $vgpr10, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr3, killed $vcc, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 $vgpr15, killed $vgpr2, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr17, 0, $vgpr10, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr15, killed $vcc, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr10, killed renamable $vgpr2, renamable $vgpr10, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
   ; GFX90A-NEXT:   S_BRANCH %bb.65
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.68.bb174: