@@ -4033,9 +4033,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
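This helper relies on the fact that a 64-bit bitwise operation with a constant can be performed as two independent 32-bit operations on the split halves. A minimal standalone C++ sketch of that identity, using AND as the binary op (host code for illustration only, not part of the patch; the names mirror the helper's parameters):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t LHS = 0x0123456789abcdefULL;
  uint32_t ValLo = 0x0000ffffu, ValHi = 0xffff0000u;
  uint64_t Val = (uint64_t)ValHi << 32 | ValLo;
  // Apply the op to each 32-bit half with the matching constant half.
  uint32_t Lo = (uint32_t)LHS & ValLo;
  uint32_t Hi = (uint32_t)(LHS >> 32) & ValHi;
  // Recombining the halves matches the single 64-bit operation.
  assert(((uint64_t)Hi << 32 | Lo) == (LHS & Val));
  return 0;
}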
@@ -4064,6 +4063,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalisation, some scalarising occurs. Once ISD::AND was legalised, the
+  // AND instructions were no longer elided, as described below. The
+  // following code makes sure that elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of shlcombine is to optimise for the case where it's
+          // possible to reduce shl64 to shl32 if the shift range is [63-32].
+          // This transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31)].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legaliser, but now that
+          // v2i32 is legal the legaliser only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND, re-enabling ISel to
+          // elide the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-constant AND operands and produce scalar ANDs.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
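The reduction the comment above describes rests on a simple identity: for shift amounts in [32, 63], the low word of a 64-bit left shift is zero and the high word is the low word of the source shifted by (Y & 31). A standalone C++ check of that identity (host code for illustration only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x0123456789abcdefULL;
  for (uint32_t Y = 32; Y <= 63; ++Y) {
    // hi = shl i32 lo(X), (Y & 31); lo = 0
    uint32_t Hi = (uint32_t)X << (Y & 31);
    assert((X << Y) == (uint64_t)Hi << 32);
  }
  return 0;
}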
@@ -4105,8 +4151,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4268,6 +4312,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the srl64_reduce optimisation code is passed through vector
+  // legalisation, some scalarising occurs. Once ISD::AND was legalised, the
+  // AND instructions were no longer elided, as described below. The
+  // following code makes sure that elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of srlcombine is to optimise for the case where it's
+          // possible to reduce srl64 to srl32 if the shift range is [63-32].
+          // This transforms: DST = srl i64 X, Y to [srl i32 X, (Y & 31), 0].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legaliser, but now that
+          // v2i32 is legal the legaliser only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND, re-enabling ISel to
+          // elide the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-constant AND operands and produce scalar ANDs.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
 
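The srl counterpart of the identity above: for shift amounts in [32, 63], the high word of the result is zero and the low word is the high word of the source shifted by (Y & 31). A standalone C++ check (host code for illustration only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x0123456789abcdefULL;
  for (uint32_t Y = 32; Y <= 63; ++Y) {
    // lo = srl i32 hi(X), (Y & 31); hi = 0
    uint32_t Lo = (uint32_t)(X >> 32) >> (Y & 31);
    assert((X >> Y) == (uint64_t)Lo);
  }
  return 0;
}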
@@ -4781,8 +4872,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_, so prevent the fneg from being pulled
+    // out in this case. For now I've made the logic as specific to the case
+    // as possible; hopefully this can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
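The reason the fneg is cheap in that pattern: negating an f32 that was bitcast from an i32 just flips the sign bit, so it can be folded into the V_CNDMASK as a source modifier rather than being distributed through the select. A standalone C++ check of that sign-bit view of fneg (host code for illustration only, not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t X = 0x40490fdbu; // bit pattern of ~3.14159f
  float F;
  std::memcpy(&F, &X, sizeof F); // f32 bitcast i32 x
  float NegF = -F;               // fneg
  uint32_t NegBits;
  std::memcpy(&NegBits, &NegF, sizeof NegBits);
  assert(NegBits == (X ^ 0x80000000u)); // fneg only flips the sign bit
  return 0;
}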
@@ -4835,8 +4944,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5257,8 +5366,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When this combine was added, a case was added to foldFreeOpFromSelect
+    // to prevent it from being undone under certain conditions.
    // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 and v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);
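A scalar sketch of the combine this case now performs, fneg (select c, a, b) -> select c, (fneg a), (fneg b): both forms pick the same operand, so negating the select result equals selecting between the negated operands. Standalone host C++, for illustration only, not part of the patch:

#include <cassert>

int main() {
  float A = 1.5f, B = -2.25f;
  for (bool C : {false, true}) {
    float Folded = -(C ? A : B);  // fneg (select c, a, b)
    float Combined = C ? -A : -B; // select c, (fneg a), (fneg b)
    assert(Folded == Combined);
  }
  return 0;
}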