Skip to content

Commit 8392149

Browse files
committed
AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix (llvm#102130)
isExtractHiElt should return the new source register instead of the instruction that defines it. Src = MI.getOperand(0).getReg() is not correct when MI (for example, G_UNMERGE_VALUES) defines multiple registers. Refactor the existing code to work with source registers only. (cherry picked from commit 269cefb) Change-Id: I2dcc950ff7c5d68c1d6e5ee8e71c39a100f549b8
1 parent 3d31d60 commit 8392149

File tree

3 files changed

+74
-100
lines changed

3 files changed

+74
-100
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 69 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
13721372
MachineInstrBuilder SelectedMI;
13731373
MachineOperand &LHS = I.getOperand(2);
13741374
MachineOperand &RHS = I.getOperand(3);
1375-
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1376-
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1375+
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1376+
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
13771377
Register Src0Reg =
13781378
copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
13791379
Register Src1Reg =
@@ -2487,14 +2487,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
24872487
return false;
24882488
}
24892489

2490+
// Returns the Reg field of the result of getDefSrcRegIgnoringCopies(Reg, MRI).
// NOTE(review): the lookup result is dereferenced without a check; presumably
// Reg always has a definition at the call sites -- confirm.
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2491+
return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2492+
}
2493+
2494+
// If Reg matches m_GBitcast, returns the bitcast's source register; otherwise
// returns Reg unchanged. Only a single bitcast is looked through.
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2495+
Register BitcastSrc;
2496+
if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2497+
Reg = BitcastSrc;
2498+
return Reg;
2499+
}
2500+
24902501
// Returns true if In is defined by a G_TRUNC whose source is either
//   - a G_LSHR by the constant 16, or
//   - a G_SHUFFLE_VECTOR whose mask satisfies Mask[0] == 1 && Mask[1] <= 1.
// Out is written only on the paths that return true.
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
24912502
Register &Out) {
2503+
Register Trunc;
2504+
if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2505+
return false;
2506+
24922507
Register LShlSrc;
2493-
if (mi_match(In, MRI,
2494-
m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2495-
Out = LShlSrc;
2508+
Register Cst;
2509+
// The shift amount is captured as a plain register first so that a copy of
// the constant can be stripped before it is compared against 16.
if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2510+
Cst = stripCopy(Cst, MRI);
2511+
if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2512+
// Report the shifted value, looking through one bitcast if present.
Out = stripBitCast(LShlSrc, MRI);
2513+
return true;
2514+
}
2515+
}
2516+
2517+
// NOTE(review): the getVRegDef result is used without a null check --
// presumably Trunc always has a defining instruction here; confirm.
MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2518+
if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2519+
return false;
2520+
2521+
assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2522+
LLT::fixed_vector(2, 16));
2523+
2524+
ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2525+
assert(Mask.size() == 2);
2526+
2527+
if (Mask[0] == 1 && Mask[1] <= 1) {
2528+
// NOTE(review): this reports operand 0 of the shuffle (the same operand
// whose type is asserted above), whereas the removed MachineInstr-based
// implementation looked through operand 1 -- confirm which operand is the
// intended source.
Out = Shuffle->getOperand(0).getReg();
24962529
return true;
24972530
}
2531+
24982532
return false;
24992533
}
25002534

@@ -3654,11 +3688,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
36543688

36553689
}
36563690

3657-
std::pair<Register, unsigned>
3658-
AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3659-
bool IsCanonicalizing,
3660-
bool AllowAbs, bool OpSel) const {
3661-
Register Src = Root.getReg();
3691+
std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
3692+
Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
36623693
unsigned Mods = 0;
36633694
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
36643695

@@ -3721,7 +3752,7 @@ InstructionSelector::ComplexRendererFns
37213752
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
37223753
Register Src;
37233754
unsigned Mods;
3724-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3755+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
37253756

37263757
return {{
37273758
[=](MachineInstrBuilder &MIB) {
@@ -3737,7 +3768,7 @@ InstructionSelector::ComplexRendererFns
37373768
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
37383769
Register Src;
37393770
unsigned Mods;
3740-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3771+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
37413772
/*IsCanonicalizing=*/true,
37423773
/*AllowAbs=*/false);
37433774

@@ -3764,7 +3795,7 @@ InstructionSelector::ComplexRendererFns
37643795
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
37653796
Register Src;
37663797
unsigned Mods;
3767-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3798+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
37683799

37693800
return {{
37703801
[=](MachineInstrBuilder &MIB) {
@@ -3779,7 +3810,8 @@ AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
37793810
MachineOperand &Root) const {
37803811
Register Src;
37813812
unsigned Mods;
3782-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3813+
std::tie(Src, Mods) =
3814+
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
37833815

37843816
return {{
37853817
[=](MachineInstrBuilder &MIB) {
@@ -3793,8 +3825,9 @@ InstructionSelector::ComplexRendererFns
37933825
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
37943826
Register Src;
37953827
unsigned Mods;
3796-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3797-
/*AllowAbs=*/false);
3828+
std::tie(Src, Mods) =
3829+
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
3830+
/*AllowAbs=*/false);
37983831

37993832
return {{
38003833
[=](MachineInstrBuilder &MIB) {
@@ -4120,7 +4153,7 @@ InstructionSelector::ComplexRendererFns
41204153
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
41214154
Register Src;
41224155
unsigned Mods;
4123-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4156+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
41244157

41254158
// FIXME: Handle op_sel
41264159
return {{
@@ -4133,7 +4166,7 @@ InstructionSelector::ComplexRendererFns
41334166
AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
41344167
Register Src;
41354168
unsigned Mods;
4136-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4169+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
41374170
/*IsCanonicalizing=*/true,
41384171
/*AllowAbs=*/false,
41394172
/*OpSel=*/false);
@@ -4151,7 +4184,7 @@ InstructionSelector::ComplexRendererFns
41514184
AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
41524185
Register Src;
41534186
unsigned Mods;
4154-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4187+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
41554188
/*IsCanonicalizing=*/true,
41564189
/*AllowAbs=*/false,
41574190
/*OpSel=*/true);
@@ -5333,97 +5366,41 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
53335366
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
53345367
}
53355368

5336-
// Variant of stripBitCast that returns the instruction instead of a
5337-
// MachineOperand.
5338-
static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5339-
if (MI->getOpcode() == AMDGPU::G_BITCAST)
5340-
return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5341-
return MI;
5342-
}
5343-
5344-
// Figure out if this is really an extract of the high 16-bits of a dword,
5345-
// returns nullptr if it isn't.
5346-
static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5347-
MachineRegisterInfo &MRI) {
5348-
Inst = stripBitCast(Inst, MRI);
5349-
5350-
if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5351-
return nullptr;
5352-
5353-
MachineInstr *TruncOp =
5354-
getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5355-
TruncOp = stripBitCast(TruncOp, MRI);
5356-
5357-
// G_LSHR x, (G_CONSTANT i32 16)
5358-
if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5359-
auto SrlAmount = getIConstantVRegValWithLookThrough(
5360-
TruncOp->getOperand(2).getReg(), MRI);
5361-
if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5362-
MachineInstr *SrlOp =
5363-
getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5364-
return stripBitCast(SrlOp, MRI);
5365-
}
5366-
}
5367-
5368-
// G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5369-
// 1, 0 swaps the low/high 16 bits.
5370-
// 1, 1 sets the high 16 bits to be the same as the low 16.
5371-
// in any case, it selects the high elts.
5372-
if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5373-
assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5374-
LLT::fixed_vector(2, 16));
5375-
5376-
ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5377-
assert(Mask.size() == 2);
5378-
5379-
if (Mask[0] == 1 && Mask[1] <= 1) {
5380-
MachineInstr *LHS =
5381-
getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5382-
return stripBitCast(LHS, MRI);
5383-
}
5384-
}
5385-
5386-
return nullptr;
5387-
}
5388-
53895369
std::pair<Register, unsigned>
53905370
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
53915371
bool &Matched) const {
53925372
Matched = false;
53935373

53945374
Register Src;
53955375
unsigned Mods;
5396-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5397-
5398-
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5399-
if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5400-
MachineOperand *MO = &MI->getOperand(1);
5401-
Src = MO->getReg();
5402-
MI = getDefIgnoringCopies(Src, *MRI);
5376+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
54035377

5378+
if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
54045379
assert(MRI->getType(Src) == LLT::scalar(16));
54055380

5406-
// See through bitcasts.
5407-
// FIXME: Would be nice to use stripBitCast here.
5408-
if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5409-
MO = &MI->getOperand(1);
5410-
Src = MO->getReg();
5411-
MI = getDefIgnoringCopies(Src, *MRI);
5412-
}
5381+
// Only change Src if src modifier could be gained. In such cases new Src
5382+
// could be sgpr but this does not violate constant bus restriction for
5383+
// instruction that is being selected.
5384+
// Note: Src is not changed when there is only a simple sgpr to vgpr copy
5385+
// since this could violate constant bus restriction.
5386+
Register PeekSrc = stripCopy(Src, *MRI);
54135387

54145388
const auto CheckAbsNeg = [&]() {
54155389
// Be careful about folding modifiers if we already have an abs. fneg is
54165390
// applied last, so we don't want to apply an earlier fneg.
54175391
if ((Mods & SISrcMods::ABS) == 0) {
54185392
unsigned ModsTmp;
5419-
std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5420-
MI = getDefIgnoringCopies(Src, *MRI);
5393+
std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
54215394

5422-
if ((ModsTmp & SISrcMods::NEG) != 0)
5395+
if ((ModsTmp & SISrcMods::NEG) != 0) {
54235396
Mods ^= SISrcMods::NEG;
5397+
Src = PeekSrc;
5398+
}
54245399

5425-
if ((ModsTmp & SISrcMods::ABS) != 0)
5400+
if ((ModsTmp & SISrcMods::ABS) != 0) {
54265401
Mods |= SISrcMods::ABS;
5402+
Src = PeekSrc;
5403+
}
54275404
}
54285405
};
54295406

@@ -5436,12 +5413,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
54365413

54375414
Mods |= SISrcMods::OP_SEL_1;
54385415

5439-
if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5416+
if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
5417+
Src = PeekSrc;
54405418
Mods |= SISrcMods::OP_SEL_0;
5441-
MI = ExtractHiEltMI;
5442-
MO = &MI->getOperand(0);
5443-
Src = MO->getReg();
5444-
54455419
CheckAbsNeg();
54465420
}
54475421

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
151151
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
152152
bool selectSBarrierLeave(MachineInstr &I) const;
153153

154-
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
154+
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
155155
bool IsCanonicalizing = true,
156156
bool AllowAbs = true,
157157
bool OpSel = false) const;

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -446,28 +446,28 @@ define amdgpu_ps float @test_matching_source_from_unmerge(ptr addrspace(3) %aptr
446446
; GFX9-DENORM: ; %bb.0: ; %.entry
447447
; GFX9-DENORM-NEXT: ds_read_b64 v[2:3], v0
448448
; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
449-
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
449+
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
450450
; GFX9-DENORM-NEXT: ; return to shader part epilog
451451
;
452452
; GFX10-LABEL: test_matching_source_from_unmerge:
453453
; GFX10: ; %bb.0: ; %.entry
454454
; GFX10-NEXT: ds_read_b64 v[2:3], v0
455455
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
456-
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
456+
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
457457
; GFX10-NEXT: ; return to shader part epilog
458458
;
459459
; GFX10-CONTRACT-LABEL: test_matching_source_from_unmerge:
460460
; GFX10-CONTRACT: ; %bb.0: ; %.entry
461461
; GFX10-CONTRACT-NEXT: ds_read_b64 v[2:3], v0
462462
; GFX10-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
463-
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
463+
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
464464
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
465465
;
466466
; GFX10-DENORM-LABEL: test_matching_source_from_unmerge:
467467
; GFX10-DENORM: ; %bb.0: ; %.entry
468468
; GFX10-DENORM-NEXT: ds_read_b64 v[2:3], v0
469469
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
470-
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
470+
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
471471
; GFX10-DENORM-NEXT: ; return to shader part epilog
472472
.entry:
473473
%a = load <4 x half>, ptr addrspace(3) %aptr, align 16

0 commit comments

Comments
 (0)