@@ -4026,9 +4026,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4057,6 +4056,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code makes sure that elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of the shl combine is to optimise for the case where it's
+          // possible to reduce shl64 to shl32 if the shift range is [32, 63].
+          // This transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31)].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the vector legalizer only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND, re-enabling ISel to
+          // elide the AND instruction.
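+          // An illustrative sketch of the pattern handled here (operand
+          // names are placeholders): a shift amount of the form
+          //   (extract_vector_elt (and V, (build_vector 31, 31)), Idx)
+          // is rewritten so the mask applies to the extracted lane, giving
+          //   (shl i32 (trunc X), (and (extract_vector_elt V, Idx), 31))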
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
@@ -4098,8 +4144,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4261,6 +4305,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the srl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code makes sure that elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of the srl combine is to optimise for the case where it's
+          // possible to reduce srl64 to srl32 if the shift range is [32, 63].
+          // This transforms: DST = srl i64 X, Y to [srl i32 X, (Y & 31), 0].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the vector legalizer only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND, re-enabling ISel to
+          // elide the AND instruction.
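+          // An illustrative sketch of the pattern handled here (operand
+          // names are placeholders): a shift amount of the form
+          //   (extract_vector_elt (and V, (build_vector 31, 31)), Idx)
+          // is rewritten so the mask applies to the extracted lane, giving
+          //   (srl i32 (trunc X), (and (extract_vector_elt V, Idx), 31))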
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
@@ -4774,8 +4865,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
     return SDValue();
 
-  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                   SDLoc(N), Cond, LHS, RHS);
+  // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+  // lowered directly to a V_CNDMASK, so prevent the fneg from being pulled
+  // out in this case. For now the logic is as specific to the case as
+  // possible; hopefully this can be relaxed in future.
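+  // For example (an illustrative sketch):
+  //   (select c, (fneg (bitcast i32 x to f32)), (fneg (bitcast i32 y to f32)))
+  // is left as-is so the fnegs can fold into the V_CNDMASK source modifiers.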
+  if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+    SDValue LHSB = LHS.getOperand(0);
+    SDValue RHSB = RHS.getOperand(0);
+    if (LHSB.getOpcode() == ISD::BITCAST &&
+        RHSB->getOpcode() == ISD::BITCAST) {
+      EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+      EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+      if (LHSB.getValueType() == MVT::f32 &&
+          RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+          RHSBOpTy == MVT::i32)
+        return SDValue();
+    }
+  }
+
+  return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                   RHS);
   }
 
   bool Inv = false;
@@ -4828,8 +4937,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5167,8 +5276,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine, a case was added to foldFreeOpFromSelect to
+    // prevent this combine from being undone under certain conditions.
     // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32/v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
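+    // Distribute the fneg over both select operands and rebuild the select;
+    // the fnegs can then fold into source modifiers of the operands' users.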
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    return DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
   }
   case ISD::BITCAST: {
     SDLoc SL(N);