@@ -4032,9 +4032,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc on it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
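For reference, a standalone sketch of the identity splitBinaryBitConstantOpImpl exploits, shown here for a bitwise AND (illustrative C++ only, not part of the patch; the helper name andViaHalves is made up): a 64-bit bitwise operation against a constant decomposes into two independent 32-bit operations on the low and high halves.

#include <cassert>
#include <cstdint>

static uint64_t andViaHalves(uint64_t X, uint32_t ValLo, uint32_t ValHi) {
  uint32_t Lo = static_cast<uint32_t>(X);       // low 32 bits of X
  uint32_t Hi = static_cast<uint32_t>(X >> 32); // high 32 bits of X
  // Apply the op per half with the matching constant half, then recombine.
  return (static_cast<uint64_t>(Hi & ValHi) << 32) | (Lo & ValLo);
}

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  uint64_t C = 0x0000ffffffff0000ULL;
  assert(andViaHalves(X, static_cast<uint32_t>(C),
                      static_cast<uint32_t>(C >> 32)) == (X & C));
  return 0;
}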
@@ -4063,6 +4062,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. Once ISD::AND was legalised, the
+  // AND instructions were no longer being elided, as described below. The
+  // following code makes sure that elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of shlcombine is to optimise for the case where it's
+          // possible to reduce shl64 to shl32 if the shift range is [63-32].
+          // This transforms DST = shl i64 X, Y into [0, shl i32 X, (Y & 31)];
+          // the '&' is then elided by ISel. The vector code for this used to
+          // be completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the legalizer only partially scalarises the vector
+          // operations and the AND was no longer elided. This check lets us
+          // locate and scalarise the v2i32 AND, re-enabling ISel to elide
+          // the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-constant AND operands and produce scalar ANDs.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
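As a standalone sketch (illustrative C++, not part of the patch) of the reduction described in the comment above: for shift amounts Y in [32, 63], a 64-bit shl is equivalent to a 32-bit shl of the low half by Y & 31, placed into the high half of the result.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  for (unsigned Y = 32; Y <= 63; ++Y) {
    // shl i32 (trunc X), (Y & 31), then build_pair with a zero low half.
    uint32_t Hi = static_cast<uint32_t>(X) << (Y & 31);
    assert((X << Y) == (static_cast<uint64_t>(Hi) << 32));
  }
  return 0;
}

The masked amount is then free on AMDGPU because the 32-bit shift instructions only consume the low five bits of the shift operand, which is why ISel can elide the AND once the pattern is scalar again.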
@@ -4104,8 +4150,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4267,6 +4311,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the srl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. Once ISD::AND was legalised, the
+  // AND instructions were no longer being elided, as described below. The
+  // following code makes sure that elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of srlcombine is to optimise for the case where it's
+          // possible to reduce srl64 to srl32 if the shift range is [63-32].
+          // This transforms DST = srl i64 X, Y into [srl i32 X, (Y & 31), 0];
+          // the '&' is then elided by ISel. The vector code for this used to
+          // be completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the legalizer only partially scalarises the vector
+          // operations and the AND was no longer elided. This check lets us
+          // locate and scalarise the v2i32 AND, re-enabling ISel to elide
+          // the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-constant AND operands and produce scalar ANDs.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
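The srl case mirrors the shl sketch earlier (illustrative C++, not part of the patch): for Y in [32, 63], a 64-bit srl is a 32-bit srl of the high half by Y & 31, with a zero high half in the result.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  for (unsigned Y = 32; Y <= 63; ++Y) {
    // srl i32 (hi half of X), (Y & 31); the result's high half is zero.
    uint32_t Lo = static_cast<uint32_t>(X >> 32) >> (Y & 31);
    assert((X >> Y) == static_cast<uint64_t>(Lo));
  }
  return 0;
}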
@@ -4780,8 +4871,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
     return SDValue();
 
-  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                   SDLoc(N), Cond, LHS, RHS);
+  // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+  // lowered directly to a V_CNDMASK_, so prevent the fneg from being pulled
+  // out in this case. For now I've made the logic as specific to the case as
+  // possible; hopefully it can be relaxed in future.
+  if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+    SDValue LHSB = LHS.getOperand(0);
+    SDValue RHSB = RHS.getOperand(0);
+    if (LHSB.getOpcode() == ISD::BITCAST &&
+        RHSB->getOpcode() == ISD::BITCAST) {
+      EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+      EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+      if (LHSB.getValueType() == MVT::f32 &&
+          RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+          RHSBOpTy == MVT::i32)
+        return SDValue();
+    }
+  }
+
+  return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                   RHS);
 }
 
 bool Inv = false;
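A standalone sketch of the reasoning (illustrative C++, not part of the patch; fnegBits is made up): fneg on f32 only flips the sign bit, and a select commutes with that flip, so the negation can stay on the select arms where V_CNDMASK can absorb it as a negate source modifier instead of costing a separate instruction.

#include <cassert>
#include <cstdint>

// fneg on f32, viewed as its i32 bit pattern, is a sign-bit flip.
static uint32_t fnegBits(uint32_t V) { return V ^ 0x80000000u; }

int main() {
  uint32_t A = 0x3f800000u; // bits of 1.0f
  uint32_t B = 0xc0000000u; // bits of -2.0f
  // select c, (fneg a), (fneg b) == fneg (select c, a, b), exactly.
  for (bool C : {false, true})
    assert((C ? fnegBits(A) : fnegBits(B)) == fnegBits(C ? A : B));
  return 0;
}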
@@ -4834,8 +4943,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5256,8 +5365,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine, a case was added to foldFreeOpFromSelect to
+    // prevent this combine from being undone under certain conditions.
    // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32/v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);