Skip to content

Commit e5f1e67

Browse files
committed
Add functional implementation for i64
While this is functional, it can be refactored and simplified; I am working on this now.
1 parent a8bd726 commit e5f1e67

File tree

2 files changed

+122
-51
lines changed

2 files changed

+122
-51
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 96 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4931,23 +4931,112 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
49314931
return MinMax;
49324932
}
49334933

4934-
// Support source modifiers on integer types.
4935-
if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
4936-
SDValue SrcModTrue = getBitwiseToSrcModifierOp(True, DCI);
4937-
SDValue SrcModFalse = getBitwiseToSrcModifierOp(False, DCI);
4934+
auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
4935+
SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS, DCI);
4936+
SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS, DCI);
49384937
if (SrcModTrue || SrcModFalse) {
49394938
SDLoc SL(N);
49404939
EVT FVT =
49414940
SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
49424941
SDValue FLHS =
4943-
SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True);
4942+
SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
49444943
SDValue FRHS = SrcModFalse ? SrcModFalse
4945-
: DAG.getNode(ISD::BITCAST, SL, FVT, False);
4944+
: DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
49464945
SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS);
49474946
return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
4947+
}
4948+
return SDValue();
4949+
};
4950+
4951+
// Support source modifiers on integer operands.
4952+
if (VT == MVT::i32 || VT == MVT::v2i32)
4953+
if (SDValue F = FoldSrcMods(True, False, VT))
4954+
return F;
4955+
4956+
// For i64, if a source modifier is to be folded in, we split the select
4957+
// into two i32 selects of the high and low halves. The operator need only
4958+
// be applied to the high half, since that is where the sign bit lives.
4959+
if (VT == MVT::i64) {
4960+
bool TrueHasModifierOp =
4961+
(True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
4962+
True.getOpcode() == ISD::XOR);
4963+
4964+
bool FalseHasModifierOp =
4965+
(False.getOpcode() == ISD::AND || False.getOpcode() == ISD::OR ||
4966+
False.getOpcode() == ISD::XOR);
4967+
4968+
ConstantSDNode *CTrueRHS = nullptr;
4969+
if (TrueHasModifierOp) {
4970+
SDValue TrueRHS = True->getOperand(1);
4971+
CTrueRHS = dyn_cast<ConstantSDNode>(TrueRHS);
4972+
}
4973+
4974+
ConstantSDNode *CFalseRHS = nullptr;
4975+
if (FalseHasModifierOp) {
4976+
SDValue FalseRHS = False->getOperand(1);
4977+
CFalseRHS = dyn_cast<ConstantSDNode>(FalseRHS);
4978+
}
4979+
4980+
// If True or False is a candidate for source-modifier folding, extract the
4981+
// high 32 bits of its constant operand and rebuild them as an i32 constant.
4982+
SDValue TrueHiOp;
4983+
SDValue BCTrue = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, True);
4984+
SDValue TrueLo;
4985+
SDValue TrueHi;
4986+
if (CTrueRHS) {
4987+
SDValue TrueLHS = True->getOperand(0);
4988+
SDValue TrueLHSHiVal = getHiHalf64(BCTrue, DAG);
4989+
TrueLo = getLoHalf64(TrueLHS, DAG);
4990+
APInt CTrueRHSHiBits =
4991+
CTrueRHS->getAPIntValue().getHiBits(32).trunc(32);
4992+
SDValue CTrueRHSHiVal =
4993+
DAG.getConstant(CTrueRHSHiBits, SDLoc(N), MVT::i32);
4994+
unsigned OpcTrue = True.getOpcode();
4995+
TrueHiOp = DAG.getNode(OpcTrue, SDLoc(N), MVT::i32, TrueLHSHiVal,
4996+
CTrueRHSHiVal);
4997+
} else {
4998+
TrueLo = getLoHalf64(BCTrue, DAG);
4999+
TrueHi = getHiHalf64(BCTrue, DAG);
5000+
}
5001+
5002+
SDValue FalseHiOp;
5003+
SDValue BCFalse = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, False);
5004+
SDValue FalseLo;
5005+
SDValue FalseHi;
5006+
if (CFalseRHS) {
5007+
SDValue FalseLHS = False->getOperand(0);
5008+
FalseLo = getLoHalf64(FalseLHS, DAG);
5009+
SDValue FalseLHSHiVal = getHiHalf64(BCFalse, DAG);
5010+
APInt CFalseRHSHiBits =
5011+
CFalseRHS->getAPIntValue().getHiBits(32).trunc(32);
5012+
SDValue CFalseRHSHiVal =
5013+
DAG.getConstant(CFalseRHSHiBits, SDLoc(N), MVT::i32);
5014+
unsigned OpcFalse = False.getOpcode();
5015+
FalseHiOp = DAG.getNode(OpcFalse, SDLoc(N), MVT::i32, FalseLHSHiVal,
5016+
CFalseRHSHiVal);
5017+
} else {
5018+
FalseLo = getLoHalf64(BCFalse, DAG);
5019+
FalseHi = getHiHalf64(BCFalse, DAG);
5020+
}
5021+
5022+
if (CTrueRHS || CFalseRHS) {
5023+
// Place the low bits directly into the select. The operator is unneeded
5024+
// for these.
5025+
SDValue LoSelect =
5026+
DAG.getNode(ISD::SELECT, SDLoc(N), MVT::i32, Cond, TrueLo, FalseLo);
5027+
// If a source modifier may be folded use the bitwise-op of the high
5028+
// values, otherwise just pass the high part of the value.
5029+
SDValue FoldedHi =
5030+
FoldSrcMods(CTrueRHS ? TrueHiOp : TrueHi,
5031+
CFalseRHS ? FalseHiOp : FalseHi, MVT::i32);
5032+
5033+
SDValue ResV =
5034+
DAG.getBuildVector(MVT::v2i32, SDLoc(N), {FoldedHi, LoSelect});
5035+
SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, ResV);
5036+
return Res;
49485037
}
49495038
}
4950-
}
5039+
}
49515040

49525041
// There's no reason to not do this if the condition has other uses.
49535042
return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);

llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll

Lines changed: 26 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -340,18 +340,15 @@ define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) {
340340
; GCN: ; %bb.0:
341341
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342342
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
343-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
344-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
345-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
343+
; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
344+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
346345
; GCN-NEXT: s_setpc_b64 s[30:31]
347346
;
348347
; GFX11-LABEL: fneg_select_i64_1:
349348
; GFX11: ; %bb.0:
350349
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351350
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
352-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
353-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
354-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1
351+
; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v3 :: v_dual_cndmask_b32 v1, v4, v2
355352
; GFX11-NEXT: s_setpc_b64 s[30:31]
356353
%neg.a = xor i64 %a, u0x8000000000000000
357354
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -364,18 +361,15 @@ define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) {
364361
; GCN: ; %bb.0:
365362
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366363
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
367-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
368-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
369-
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
364+
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
365+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
370366
; GCN-NEXT: s_setpc_b64 s[30:31]
371367
;
372368
; GFX11-LABEL: fneg_select_i64_2:
373369
; GFX11: ; %bb.0:
374370
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375371
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
376-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
377-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
378-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5
372+
; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v5 :: v_dual_cndmask_b32 v1, v2, v4
379373
; GFX11-NEXT: s_setpc_b64 s[30:31]
380374
%neg.a = xor i64 %a, u0x8000000000000000
381375
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -388,20 +382,16 @@ define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) {
388382
; GCN: ; %bb.0:
389383
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390384
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
391-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
392-
; GCN-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
393-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
394-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
385+
; GCN-NEXT: v_cndmask_b32_e64 v0, |v5|, v3, vcc
386+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
395387
; GCN-NEXT: s_setpc_b64 s[30:31]
396388
;
397389
; GFX11-LABEL: fneg_1_fabs_2_select_i64:
398390
; GFX11: ; %bb.0:
399391
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
400392
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
401-
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
402-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v5
403-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
404-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
393+
; GFX11-NEXT: v_cndmask_b32_e64 v0, |v5|, v3, vcc_lo
394+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
405395
; GFX11-NEXT: s_setpc_b64 s[30:31]
406396
%neg.a = xor i64 %a, u0x8000000000000000
407397
%abs.b = and i64 %b, u0x7fffffffffffffff
@@ -415,18 +405,16 @@ define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
415405
; GCN: ; %bb.0:
416406
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417407
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
418-
; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
419-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
420-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
408+
; GCN-NEXT: v_cndmask_b32_e64 v0, v5, |v3|, vcc
409+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
421410
; GCN-NEXT: s_setpc_b64 s[30:31]
422411
;
423412
; GFX11-LABEL: fabs_select_i64_1:
424413
; GFX11: ; %bb.0:
425414
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426415
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
427-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v3
428-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
429-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
416+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, |v3|, vcc_lo
417+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
430418
; GFX11-NEXT: s_setpc_b64 s[30:31]
431419
%neg.a = and i64 %a, u0x7fffffffffffffff
432420
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -439,18 +427,16 @@ define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
439427
; GCN: ; %bb.0:
440428
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441429
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
442-
; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
443-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
444-
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
430+
; GCN-NEXT: v_cndmask_b32_e64 v0, |v3|, v5, vcc
431+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
445432
; GCN-NEXT: s_setpc_b64 s[30:31]
446433
;
447434
; GFX11-LABEL: fabs_select_i64_2:
448435
; GFX11: ; %bb.0:
449436
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450437
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
451-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 0x7fffffff, v3
452-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
453-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
438+
; GFX11-NEXT: v_cndmask_b32_e64 v0, |v3|, v5, vcc_lo
439+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
454440
; GFX11-NEXT: s_setpc_b64 s[30:31]
455441
%neg.a = and i64 %a, u0x7fffffffffffffff
456442
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -463,18 +449,16 @@ define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
463449
; GCN: ; %bb.0:
464450
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465451
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
466-
; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3
467-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
468-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
452+
; GCN-NEXT: v_cndmask_b32_e64 v0, v5, -|v3|, vcc
453+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
469454
; GCN-NEXT: s_setpc_b64 s[30:31]
470455
;
471456
; GFX11-LABEL: fneg_fabs_select_i64_1:
472457
; GFX11: ; %bb.0:
473458
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
474459
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
475-
; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v3
476-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
477-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1
460+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, -|v3|, vcc_lo
461+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
478462
; GFX11-NEXT: s_setpc_b64 s[30:31]
479463
%neg.a = or i64 %a, u0x8000000000000000
480464
%cmp = icmp eq i64 %cond, zeroinitializer
@@ -487,18 +471,16 @@ define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
487471
; GCN: ; %bb.0:
488472
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
489473
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
490-
; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3
491-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
492-
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
474+
; GCN-NEXT: v_cndmask_b32_e64 v0, -|v3|, v5, vcc
475+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
493476
; GCN-NEXT: s_setpc_b64 s[30:31]
494477
;
495478
; GFX11-LABEL: fneg_fabs_select_i64_2:
496479
; GFX11: ; %bb.0:
497480
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498481
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
499-
; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v3
500-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
501-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5
482+
; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v3|, v5, vcc_lo
483+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
502484
; GFX11-NEXT: s_setpc_b64 s[30:31]
503485
%neg.a = or i64 %a, u0x8000000000000000
504486
%cmp = icmp eq i64 %cond, zeroinitializer

0 commit comments

Comments
 (0)