Skip to content

Commit e543e45

Browse files
committed
Add functional implementation for i64
While this is functional, it can be refactored and simplified; that work is in progress.
1 parent 0084599 commit e543e45

File tree

2 files changed

+122
-51
lines changed

2 files changed

+122
-51
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 96 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4919,23 +4919,112 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
49194919
return MinMax;
49204920
}
49214921

4922-
// Support source modifiers on integer types.
4923-
if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
4924-
SDValue SrcModTrue = getBitwiseToSrcModifierOp(True, DCI);
4925-
SDValue SrcModFalse = getBitwiseToSrcModifierOp(False, DCI);
4922+
auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
4923+
SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS, DCI);
4924+
SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS, DCI);
49264925
if (SrcModTrue || SrcModFalse) {
49274926
SDLoc SL(N);
49284927
EVT FVT =
49294928
SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
49304929
SDValue FLHS =
4931-
SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True);
4930+
SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
49324931
SDValue FRHS = SrcModFalse ? SrcModFalse
4933-
: DAG.getNode(ISD::BITCAST, SL, FVT, False);
4932+
: DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
49344933
SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS);
49354934
return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
4935+
}
4936+
return SDValue();
4937+
};
4938+
4939+
// Support source modifiers on integer operands.
4940+
if (VT == MVT::i32 || VT == MVT::v2i32)
4941+
if (SDValue F = FoldSrcMods(True, False, VT))
4942+
return F;
4943+
4944+
// For i64, if a source modifier is to be folded in, we split into two i32
4945+
// selects of the high and low values. The operator need only be applied to the
4946+
// high values in order to change the sign bit.
4947+
if (VT == MVT::i64) {
4948+
bool TrueHasModifierOp =
4949+
(True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
4950+
True.getOpcode() == ISD::XOR);
4951+
4952+
bool FalseHasModifierOp =
4953+
(False.getOpcode() == ISD::AND || False.getOpcode() == ISD::OR ||
4954+
False.getOpcode() == ISD::XOR);
4955+
4956+
ConstantSDNode *CTrueRHS = nullptr;
4957+
if (TrueHasModifierOp) {
4958+
SDValue TrueRHS = True->getOperand(1);
4959+
CTrueRHS = dyn_cast<ConstantSDNode>(TrueRHS);
4960+
}
4961+
4962+
ConstantSDNode *CFalseRHS = nullptr;
4963+
if (FalseHasModifierOp) {
4964+
SDValue FalseRHS = False->getOperand(1);
4965+
CFalseRHS = dyn_cast<ConstantSDNode>(FalseRHS);
4966+
}
4967+
4968+
// If True or False is a candidate for source modifier folding, extract
4969+
// the high value using APInt and reconstruct a ConstantSDNode.
4970+
SDValue TrueHiOp;
4971+
SDValue BCTrue = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, True);
4972+
SDValue TrueLo;
4973+
SDValue TrueHi;
4974+
if (CTrueRHS) {
4975+
SDValue TrueLHS = True->getOperand(0);
4976+
SDValue TrueLHSHiVal = getHiHalf64(BCTrue, DAG);
4977+
TrueLo = getLoHalf64(TrueLHS, DAG);
4978+
APInt CTrueRHSHiBits =
4979+
CTrueRHS->getAPIntValue().getHiBits(32).trunc(32);
4980+
SDValue CTrueRHSHiVal =
4981+
DAG.getConstant(CTrueRHSHiBits, SDLoc(N), MVT::i32);
4982+
unsigned OpcTrue = True.getOpcode();
4983+
TrueHiOp = DAG.getNode(OpcTrue, SDLoc(N), MVT::i32, TrueLHSHiVal,
4984+
CTrueRHSHiVal);
4985+
} else {
4986+
TrueLo = getLoHalf64(BCTrue, DAG);
4987+
TrueHi = getHiHalf64(BCTrue, DAG);
4988+
}
4989+
4990+
SDValue FalseHiOp;
4991+
SDValue BCFalse = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, False);
4992+
SDValue FalseLo;
4993+
SDValue FalseHi;
4994+
if (CFalseRHS) {
4995+
SDValue FalseLHS = False->getOperand(0);
4996+
FalseLo = getLoHalf64(FalseLHS, DAG);
4997+
SDValue FalseLHSHiVal = getHiHalf64(BCFalse, DAG);
4998+
APInt CFalseRHSHiBits =
4999+
CFalseRHS->getAPIntValue().getHiBits(32).trunc(32);
5000+
SDValue CFalseRHSHiVal =
5001+
DAG.getConstant(CFalseRHSHiBits, SDLoc(N), MVT::i32);
5002+
unsigned OpcFalse = False.getOpcode();
5003+
FalseHiOp = DAG.getNode(OpcFalse, SDLoc(N), MVT::i32, FalseLHSHiVal,
5004+
CFalseRHSHiVal);
5005+
} else {
5006+
FalseLo = getLoHalf64(BCFalse, DAG);
5007+
FalseHi = getHiHalf64(BCFalse, DAG);
5008+
}
5009+
5010+
if (CTrueRHS || CFalseRHS) {
5011+
// Place the low bits directly into the select. The operator is unneeded
5012+
// for these.
5013+
SDValue LoSelect =
5014+
DAG.getNode(ISD::SELECT, SDLoc(N), MVT::i32, Cond, TrueLo, FalseLo);
5015+
// If a source modifier may be folded, use the bitwise op of the high
5016+
// values, otherwise just pass the high part of the value.
5017+
SDValue FoldedHi =
5018+
FoldSrcMods(CTrueRHS ? TrueHiOp : TrueHi,
5019+
CFalseRHS ? FalseHiOp : FalseHi, MVT::i32);
5020+
5021+
SDValue ResV =
5022+
DAG.getBuildVector(MVT::v2i32, SDLoc(N), {FoldedHi, LoSelect});
5023+
SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, ResV);
5024+
return Res;
49365025
}
49375026
}
4938-
}
5027+
}
49395028

49405029
// There's no reason to not do this if the condition has other uses.
49415030
return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);

llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll

Lines changed: 26 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -340,18 +340,15 @@ define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) {
340340
; GCN: ; %bb.0:
341341
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342342
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
343-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
344-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
345-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
343+
; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
344+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
346345
; GCN-NEXT: s_setpc_b64 s[30:31]
347346
;
348347
; GFX11-LABEL: fneg_select_i64_1:
349348
; GFX11: ; %bb.0:
350349
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351350
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
352-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
353-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
354-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1
351+
; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v3 :: v_dual_cndmask_b32 v1, v4, v2
355352
; GFX11-NEXT: s_setpc_b64 s[30:31]
356353
%neg.a = xor i64 %a, u0x8000000000000000
357354
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -364,18 +361,15 @@ define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) {
364361
; GCN: ; %bb.0:
365362
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366363
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
367-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
368-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
369-
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
364+
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
365+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
370366
; GCN-NEXT: s_setpc_b64 s[30:31]
371367
;
372368
; GFX11-LABEL: fneg_select_i64_2:
373369
; GFX11: ; %bb.0:
374370
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375371
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
376-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
377-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
378-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5
372+
; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v5 :: v_dual_cndmask_b32 v1, v2, v4
379373
; GFX11-NEXT: s_setpc_b64 s[30:31]
380374
%neg.a = xor i64 %a, u0x8000000000000000
381375
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -388,20 +382,16 @@ define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) {
388382
; GCN: ; %bb.0:
389383
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390384
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
391-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
392-
; GCN-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
393-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
394-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
385+
; GCN-NEXT: v_cndmask_b32_e64 v0, |v5|, v3, vcc
386+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
395387
; GCN-NEXT: s_setpc_b64 s[30:31]
396388
;
397389
; GFX11-LABEL: fneg_1_fabs_2_select_i64:
398390
; GFX11: ; %bb.0:
399391
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
400392
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
401-
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
402-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v5
403-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
404-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
393+
; GFX11-NEXT: v_cndmask_b32_e64 v0, |v5|, v3, vcc_lo
394+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
405395
; GFX11-NEXT: s_setpc_b64 s[30:31]
406396
%neg.a = xor i64 %a, u0x8000000000000000
407397
%abs.b = and i64 %b, u0x7fffffffffffffff
@@ -415,18 +405,16 @@ define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
415405
; GCN: ; %bb.0:
416406
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417407
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
418-
; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
419-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
420-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
408+
; GCN-NEXT: v_cndmask_b32_e64 v0, v5, |v3|, vcc
409+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
421410
; GCN-NEXT: s_setpc_b64 s[30:31]
422411
;
423412
; GFX11-LABEL: fabs_select_i64_1:
424413
; GFX11: ; %bb.0:
425414
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426415
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
427-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v3
428-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
429-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
416+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, |v3|, vcc_lo
417+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
430418
; GFX11-NEXT: s_setpc_b64 s[30:31]
431419
%neg.a = and i64 %a, u0x7fffffffffffffff
432420
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -439,18 +427,16 @@ define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
439427
; GCN: ; %bb.0:
440428
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441429
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
442-
; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
443-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
444-
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
430+
; GCN-NEXT: v_cndmask_b32_e64 v0, |v3|, v5, vcc
431+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
445432
; GCN-NEXT: s_setpc_b64 s[30:31]
446433
;
447434
; GFX11-LABEL: fabs_select_i64_2:
448435
; GFX11: ; %bb.0:
449436
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450437
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
451-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 0x7fffffff, v3
452-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
453-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
438+
; GFX11-NEXT: v_cndmask_b32_e64 v0, |v3|, v5, vcc_lo
439+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
454440
; GFX11-NEXT: s_setpc_b64 s[30:31]
455441
%neg.a = and i64 %a, u0x7fffffffffffffff
456442
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -463,18 +449,16 @@ define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
463449
; GCN: ; %bb.0:
464450
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465451
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
466-
; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3
467-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
468-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
452+
; GCN-NEXT: v_cndmask_b32_e64 v0, v5, -|v3|, vcc
453+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
469454
; GCN-NEXT: s_setpc_b64 s[30:31]
470455
;
471456
; GFX11-LABEL: fneg_fabs_select_i64_1:
472457
; GFX11: ; %bb.0:
473458
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
474459
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
475-
; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v3
476-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
477-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1
460+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, -|v3|, vcc_lo
461+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
478462
; GFX11-NEXT: s_setpc_b64 s[30:31]
479463
%neg.a = or i64 %a, u0x8000000000000000
480464
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -487,18 +471,16 @@ define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
487471
; GCN: ; %bb.0:
488472
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
489473
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
490-
; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3
491-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
492-
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
474+
; GCN-NEXT: v_cndmask_b32_e64 v0, -|v3|, v5, vcc
475+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
493476
; GCN-NEXT: s_setpc_b64 s[30:31]
494477
;
495478
; GFX11-LABEL: fneg_fabs_select_i64_2:
496479
; GFX11: ; %bb.0:
497480
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498481
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
499-
; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v3
500-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
501-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5
482+
; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v3|, v5, vcc_lo
483+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
502484
; GFX11-NEXT: s_setpc_b64 s[30:31]
503485
%neg.a = or i64 %a, u0x8000000000000000
504486
%cmp = icmp eq i64 %cond, zeroinitializer

0 commit comments

Comments
 (0)