@@ -4031,9 +4031,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
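For context, the split this helper performs is the plain scalar identity below, sketched as standalone C++ (hypothetical helper name, not part of the patch): a bitwise op on an i64 against a constant is the same op applied independently to the two 32-bit halves.

#include <cassert>
#include <cstdint>

// Mirrors what splitBinaryBitConstantOpImpl does at the SelectionDAG level,
// here specialised to AND: apply the 32-bit op to each half, then re-pack.
static uint64_t splitAnd64(uint64_t LHS, uint32_t ValLo, uint32_t ValHi) {
  uint32_t Lo = static_cast<uint32_t>(LHS) & ValLo;       // low half
  uint32_t Hi = static_cast<uint32_t>(LHS >> 32) & ValHi; // high half
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  uint64_t X = 0x0123456789abcdefULL;
  assert(splitAnd64(X, 0xffffffffu, 0u) == (X & 0x00000000ffffffffULL));
  assert(splitAnd64(X, 0u, 0xffffffffu) == (X & 0xffffffff00000000ULL));
}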
@@ -4062,6 +4061,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. Once ISD::AND was legalised, the
+  // AND instructions were no longer being elided, as described below. The
+  // following code makes sure that elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of shlcombine is to optimise for the case where it's
+          // possible to reduce shl64 to shl32 if the shift amount is in the
+          // range [32, 63]. This transforms: DST = shl i64 X, Y to
+          // [0, shl i32 X, (Y & 31)]. The '&' is then elided by ISel. The
+          // vector code for this was being completely scalarised by the
+          // vector legalizer, but now that v2i32 is legal the legalizer only
+          // partially scalarises the vector operations and the AND was not
+          // elided. This check lets us locate and scalarise the v2i32 AND so
+          // that ISel can elide it again.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce the scalar AND.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
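For reference, the scalar fact the shl64_reduce path (and the AND elision above) depends on, as a standalone C++ sketch (illustrative only, assuming the shift amount is already known to lie in [32, 63]):

#include <cassert>
#include <cstdint>

// For 32 <= Amt <= 63, the low word of (X << Amt) is zero and the high word
// is the low word of X shifted left by (Amt & 31), so the i64 shift reduces
// to a single i32 shift plus a zero move.
static uint64_t shl64Reduced(uint64_t X, uint32_t Amt) {
  uint32_t Hi = static_cast<uint32_t>(X) << (Amt & 31);
  return static_cast<uint64_t>(Hi) << 32;
}

int main() {
  uint64_t X = 0x0123456789abcdefULL;
  for (uint32_t Amt = 32; Amt < 64; ++Amt)
    assert(shl64Reduced(X, Amt) == (X << Amt));
}

The explicit (Y & 31) can then be dropped by ISel because the hardware 32-bit shift only consumes the low five bits of the shift amount; that is exactly the elision the block above restores for the partially scalarised v2i32 case.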
@@ -4103,8 +4149,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4266,6 +4310,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the srl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. Once ISD::AND was legalised, the
+  // AND instructions were no longer being elided, as described below. The
+  // following code makes sure that elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of srlcombine is to optimise for the case where it's
+          // possible to reduce srl64 to srl32 if the shift amount is in the
+          // range [32, 63]. This transforms: DST = srl i64 X, Y to
+          // [srl i32 X, (Y & 31), 0]. The '&' is then elided by ISel. The
+          // vector code for this was being completely scalarised by the
+          // vector legalizer, but now that v2i32 is legal the legalizer only
+          // partially scalarises the vector operations and the AND was not
+          // elided. This check lets us locate and scalarise the v2i32 AND so
+          // that ISel can elide it again.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce the scalar AND.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
 
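The srl variant relies on the mirrored identity, again as an illustrative standalone C++ sketch (not part of the patch):

#include <cassert>
#include <cstdint>

// For 32 <= Amt <= 63, the high word of (X >> Amt) is zero and the low word
// is the high word of X shifted right by (Amt & 31).
static uint64_t srl64Reduced(uint64_t X, uint32_t Amt) {
  uint32_t Lo = static_cast<uint32_t>(X >> 32) >> (Amt & 31);
  return Lo;
}

int main() {
  uint64_t X = 0x0123456789abcdefULL;
  for (uint32_t Amt = 32; Amt < 64; ++Amt)
    assert(srl64Reduced(X, Amt) == (X >> Amt));
}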
@@ -4779,8 +4870,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
+    // out in this case. For now the logic is as specific to the case as
+    // possible; hopefully it can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
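The bail-out above rests on fneg of an f32 being a pure sign-bit flip of the underlying i32 bits, which is why the negation can be folded into the select lowering rather than materialising separate negates. A standalone C++ illustration of that bit-level fact (hypothetical helper, not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstring>

// Negating an IEEE-754 binary32 value is equivalent to XORing bit 31 of its
// integer representation.
static float fnegViaBits(uint32_t Bits) {
  Bits ^= 0x80000000u; // flip the sign bit
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  float X = 3.5f;
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  assert(fnegViaBits(Bits) == -X);
}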
@@ -4833,8 +4942,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5172,8 +5281,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When this combine was added, a guard was added to foldFreeOpFromSelect
+    // to prevent it from being undone under certain conditions.
     // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 and v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);
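The rewrite itself is just fneg distributing over select, i.e. -(c ? a : b) == (c ? -a : -b). A one-line C++ check of the algebra (illustrative only):

#include <cassert>

static float negSelect(bool C, float A, float B) { return C ? -A : -B; }

int main() {
  assert(negSelect(true, 1.5f, 2.5f) == -1.5f);
  assert(negSelect(false, 1.5f, 2.5f) == -2.5f);
}

The guard added to foldFreeOpFromSelect earlier in this patch exists precisely so that this combine and the inverse pull-out do not undo each other.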