Skip to content

Commit 022fa2d

Browse files
committed
new tests, reduced nesting, cleanup
1 parent 48cf50c commit 022fa2d

File tree

2 files changed

+136
-58
lines changed

2 files changed

+136
-58
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 50 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ class SIPreEmitPeephole {
6262
// v_fma_f32 v1, v0, v2, v2
6363
// Here, we have overwritten v0 before we use it. This function checks if
6464
// unpacking can lead to such a situation.
65-
bool canUnpackingIntroduceDependencies(const MachineInstr &MI);
65+
bool canUnpackingClobberRegister(const MachineInstr &MI);
6666
// Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
6767
// V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
6868
// this transformation.
@@ -469,7 +469,7 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
469469
llvm_unreachable("Fully covered switch");
470470
}
471471

472-
bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
472+
bool SIPreEmitPeephole::canUnpackingClobberRegister(
473473
const MachineInstr &MI) {
474474
unsigned OpCode = MI.getOpcode();
475475
Register DstReg = MI.getOperand(0).getReg();
@@ -481,14 +481,12 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
481481
// Such scenarios can arise due to specific combinations of op_sel and
482482
// op_sel_hi modifiers.
483483
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
484-
unsigned Src0Mods =
485-
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
486-
unsigned Src1Mods =
487-
TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
488484

489485
const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
490-
if (Src0MO->isReg()) {
486+
if (Src0MO && Src0MO->isReg()) {
491487
Register SrcReg0 = Src0MO->getReg();
488+
unsigned Src0Mods =
489+
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
492490
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
493491
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
494492
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
@@ -499,8 +497,10 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
499497
}
500498

501499
const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
502-
if (Src1MO->isReg()) {
500+
if (Src1MO && Src1MO->isReg()) {
503501
Register SrcReg1 = Src1MO->getReg();
502+
unsigned Src1Mods =
503+
TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
504504
Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
505505
? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
506506
: TRI->getSubReg(SrcReg1, AMDGPU::sub0);
@@ -511,13 +511,13 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
511511
// Applicable for packed instructions with 3 source operands, such as
512512
// V_PK_FMA.
513513
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
514-
unsigned Src2Mods =
515-
TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
516514
const MachineOperand *Src2MO =
517515
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
518-
if (Src2MO->isReg()) {
516+
if (Src2MO && Src2MO->isReg()) {
519517
Register SrcReg2 =
520518
TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
519+
unsigned Src2Mods =
520+
TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
521521
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
522522
? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
523523
: TRI->getSubReg(SrcReg2, AMDGPU::sub0);
@@ -614,49 +614,46 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
614614
MachineInstr &Instr = *I;
615615
if (Instr.isMetaInstruction())
616616
continue;
617-
if (Instr.isTerminator())
618-
return;
619-
if (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr))
620-
return;
621-
if (SIInstrInfo::modifiesModeRegister(Instr) &&
622-
Instr.modifiesRegister(AMDGPU::EXEC, TRI))
617+
if ((Instr.isTerminator()) ||
618+
(TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
619+
(SIInstrInfo::modifiesModeRegister(Instr) &&
620+
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
623621
return;
622+
624623
const MCSchedClassDesc *InstrSchedClassDesc =
625624
SchedModel.resolveSchedClass(&Instr);
626-
TotalCyclesBetweenCandidates +=
625+
uint16_t Latency =
627626
SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
627+
TotalCyclesBetweenCandidates += Latency;
628628

629-
if (TotalCyclesBetweenCandidates > NumMFMACycles)
629+
if (TotalCyclesBetweenCandidates > NumMFMACycles - 1)
630630
return;
631631
// Identify register dependencies between those used by the MFMA
632632
// instruction and the following packed instructions. Also checks for
633633
// transitive dependencies between the MFMA def and candidate instruction
634634
// def and uses. Conservatively ensures that we do not incorrectly
635635
// read/write registers.
636636
for (const MachineOperand &InstrMO : Instr.operands()) {
637-
if (InstrMO.isReg()) {
638-
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
639-
return;
640-
}
641-
}
642-
if (isUnpackingSupportedInstr(Instr)) {
643-
assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued.");
644-
if (canUnpackingIntroduceDependencies(Instr))
637+
if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
638+
continue;
639+
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
645640
return;
646-
// If it is a packed instruction, we should subtract it's latency from the
647-
// overall latency calculation here, because the packed instruction will
648-
// be removed and replaced by 2 unpacked instructions.
649-
TotalCyclesBetweenCandidates -=
650-
SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
651-
// We're adding 2 to account for the extra latency added by unpacking into
652-
// 2 instructions. At the time of writing, the considered unpacked
653-
// instructions have latency of 1.
654-
// TODO: improve latency handling of possible inserted instructions.
655-
TotalCyclesBetweenCandidates += 2;
656-
// Subtract 1 to account for MFMA issue latency.
657-
if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1))
658-
InstrsToUnpack.insert(&Instr);
659641
}
642+
if (!isUnpackingSupportedInstr(Instr))
643+
continue;
644+
645+
assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued.");
646+
if (canUnpackingClobberRegister(Instr))
647+
return;
648+
// If it's a packed instruction, adjust latency: remove the packed
649+
// latency, add latency of two unpacked instructions (currently estimated
650+
// as 2 cycles).
651+
TotalCyclesBetweenCandidates -= Latency;
652+
// TODO: improve latency handling based on instruction modeling.
653+
TotalCyclesBetweenCandidates += 2;
654+
// Subtract 1 to account for MFMA issue latency.
655+
if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
656+
InstrsToUnpack.insert(&Instr);
660657
}
661658
return;
662659
}
@@ -672,8 +669,7 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
672669
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
673670
MachineOperand LoDstOp = Op0LOp1L->getOperand(0);
674671

675-
if (DstOp.isUndef())
676-
LoDstOp.setIsUndef();
672+
LoDstOp.setIsUndef(DstOp.isUndef());
677673

678674
MachineInstrBuilder Op0HOp1H =
679675
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
@@ -687,10 +683,9 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
687683
Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract);
688684
Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract);
689685
}
690-
if (DstOp.getReg().isPhysical() && DstOp.isRenamable()) {
691-
LoDstOp.setIsRenamable(true);
692-
HiDstOp.setIsRenamable(true);
693-
}
686+
687+
LoDstOp.setIsRenamable(DstOp.isRenamable());
688+
HiDstOp.setIsRenamable(DstOp.isRenamable());
694689

695690
I.eraseFromParent();
696691
return;
@@ -804,22 +799,19 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
804799
for (MachineBasicBlock &MBB : MF) {
805800
// Unpack packed instructions overlapped by MFMAs. This allows the compiler
806801
to co-issue unpacked instructions with MFMAs.
807-
uint16_t NumMFMACycles = 0;
808802
auto SchedModel = TII->getSchedModel();
809803
SetVector<MachineInstr *> InstrsToUnpack;
810804
for (auto &MI : make_early_inc_range(MBB.instrs())) {
811-
if (SIInstrInfo::isMFMA(MI)) {
812-
const MCSchedClassDesc *SchedClassDesc =
813-
SchedModel.resolveSchedClass(&MI);
814-
NumMFMACycles =
815-
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
816-
collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
817-
}
805+
if (!SIInstrInfo::isMFMA(MI))
806+
continue;
807+
const MCSchedClassDesc *SchedClassDesc =
808+
SchedModel.resolveSchedClass(&MI);
809+
uint16_t NumMFMACycles =
810+
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
811+
collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
818812
}
819-
if (!InstrsToUnpack.empty()) {
820-
for (MachineInstr *MI : InstrsToUnpack) {
821-
performF32Unpacking(*MI);
822-
}
813+
for (MachineInstr *MI : InstrsToUnpack) {
814+
performF32Unpacking(*MI);
823815
}
824816
}
825817

llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir

Lines changed: 86 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -481,3 +481,89 @@ body: |
481481
$vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
482482
renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
483483
S_ENDPGM 0
484+
485+
...
486+
---
487+
name: test_mfma_def_using_instr_blocks_unpacking
488+
tracksRegLiveness: true
489+
490+
liveins:
491+
- { reg: '$sgpr4_sgpr5' }
492+
493+
body: |
494+
bb.0.entry:
495+
liveins: $sgpr4_sgpr5
496+
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
497+
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
498+
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
499+
renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
500+
early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
501+
early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
502+
$vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
503+
$vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
504+
$vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
505+
$vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
506+
renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
507+
$vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
508+
$vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
509+
$vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
510+
renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
511+
S_ENDPGM 0
512+
513+
...
514+
---
515+
name: test_unpacking_with_imm_input
516+
tracksRegLiveness: true
517+
518+
liveins:
519+
- { reg: '$sgpr4_sgpr5' }
520+
521+
body: |
522+
bb.0.entry:
523+
liveins: $sgpr4_sgpr5
524+
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
525+
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
526+
S_WAITCNT 49279
527+
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
528+
renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
529+
early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
530+
early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
531+
S_WAITCNT 49279
532+
$vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
533+
$vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
534+
$vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
535+
$vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
536+
renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
537+
$vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
538+
$vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
539+
renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
540+
S_ENDPGM 0
541+
542+
...
543+
---
544+
name: test_neg_lo_hi_post_unpacking
545+
tracksRegLiveness: true
546+
547+
liveins:
548+
- { reg: '$sgpr4_sgpr5' }
549+
550+
body: |
551+
bb.0.entry:
552+
liveins: $sgpr4_sgpr5
553+
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
554+
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
555+
S_WAITCNT 49279
556+
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
557+
renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
558+
early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
559+
early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
560+
S_WAITCNT 49279
561+
$vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
562+
$vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
563+
$vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
564+
$vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
565+
renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
566+
$vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
567+
$vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
568+
renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
569+
S_ENDPGM 0

0 commit comments

Comments
 (0)