Skip to content

Commit 77ae407

Browse files
bcheng0127igcbot
authored and committed
gather send optimization
gather send optimization
1 parent fd031eb commit 77ae407

File tree

5 files changed

+195
-35
lines changed

5 files changed

+195
-35
lines changed

visa/G4_Declare.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ class G4_Declare {
9494
uint16_t forceSpilled : 1;
9595
uint16_t exclusiveLoad : 1;
9696
uint16_t isCmpUseOnly : 1;
97+
// Indicates whether the declare is referenced locally only.
98+
// Especially for a variable with a pseudo_kill,
99+
// which will be removed in the removeLifetimeOps pass.
100+
uint16_t isBBLocal : 1;
97101

98102
unsigned declId; // global decl id for this builder
99103

@@ -334,6 +338,9 @@ class G4_Declare {
334338
void setIsCmpUseOnly(bool b) { isCmpUseOnly = b; }
335339
bool getIsCmpUseOnly() const { return isCmpUseOnly; }
336340

341+
void setIsBBLocal(bool b) { isBBLocal = b; }
342+
bool getIsBBLocal() const { return isBBLocal; }
343+
337344
unsigned getNumRegNeeded() const;
338345

339346
void emit(std::ostream &output) const;

visa/LocalDataflow.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,47 @@ static void processReadOpnds(G4_BB *BB, G4_INST *Inst, LocalLivenessInfo &LLI) {
453453
}
454454
}
455455

456+
// Scans the operands read by Inst and, for each one whose declare is currently
// tracked in pseudoKills, clears that declare's BB-local flag and removes the
// declare from pseudoKills.
//
// BB          - not referenced by the body.
// Inst        - the instruction being examined; pseudo-kill instructions are
//               ignored and return immediately.
// pseudoKills - set of tracked declares; entries are erased as soon as a read
//               of them is found.
static void
457+
processReadOpndsForPseudoKill(G4_BB *BB, G4_INST *Inst,
458+
std::unordered_set<G4_Declare *> &pseudoKills) {
459+
if (Inst->isPseudoKill()) {
460+
return;
461+
}
462+
// (1) Indirect dst operand reads address: if the dst's top declare is
// tracked, it is marked as not BB-local and dropped from the set.
463+
G4_DstRegRegion *Dst = Inst->getDst();
464+
if (Dst && Dst->isIndirect()) {
465+
G4_Declare *dcl = Dst->getTopDcl();
466+
if (pseudoKills.find(dcl) != pseudoKills.end()) {
467+
dcl->setIsBBLocal(false);
468+
pseudoKills.erase(dcl);
469+
}
470+
}
471+
472+
// (2) Direct and indirect source operands (src0..src7, the predicate and the
// implicit accumulator source).
473+
for (auto OpNum :
474+
{Gen4_Operand_Number::Opnd_src0, Gen4_Operand_Number::Opnd_src1,
475+
Gen4_Operand_Number::Opnd_src2, Gen4_Operand_Number::Opnd_src3,
476+
Gen4_Operand_Number::Opnd_src4, Gen4_Operand_Number::Opnd_src5,
477+
Gen4_Operand_Number::Opnd_src6, Gen4_Operand_Number::Opnd_src7,
478+
Gen4_Operand_Number::Opnd_pred, Gen4_Operand_Number::Opnd_implAccSrc}) {
479+
G4_Operand *opnd = Inst->getOperand(OpNum);
480+
// Skip operand slots that are absent, immediates, null registers or labels.
if (opnd == nullptr || opnd->isImm() || opnd->isNullReg() ||
481+
opnd->isLabel())
482+
continue;
483+
484+
G4_Declare *dcl = nullptr;
485+
// For the pseudo address-mov intrinsic the declare is taken from the
// operand's address expression; every other instruction uses the top declare.
// NOTE(review): asAddrExp() is applied to every remaining operand of the
// intrinsic, including the predicate slot if present - confirm that is safe.
if (Inst->isPseudoAddrMovIntrinsic()) {
486+
dcl =opnd->asAddrExp()->getRegVar()->getDeclare();
487+
} else {
488+
dcl = opnd->getTopDcl();
489+
}
490+
if (pseudoKills.find(dcl) != pseudoKills.end()) {
491+
dcl->setIsBBLocal(false);
492+
pseudoKills.erase(dcl);
493+
}
494+
}
495+
}
496+
456497
// Process writes. If this is a partial definition, then record this partial
457498
// definition. When all partial definitions together define this live read node,
458499
// it is killed and du/ud links are added.
@@ -487,6 +528,12 @@ static void processWriteOpnds(G4_BB *BB, G4_INST *Inst,
487528
}
488529

489530
void FlowGraph::localDataFlowAnalysis() {
531+
// For pseudo kill variables:
532+
// If there is a use exposed in a BB, the variable is treated as global.
533+
// Otherwise, it's treated as local even though the same pseudo kill may appear in
534+
// multiple BBs
535+
std::unordered_set<G4_Declare *> pesudoKilledDcls;
536+
490537
for (auto BB : BBs) {
491538
LocalLivenessInfo LLI(!BB->isAllLaneActive());
492539
for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) {
@@ -504,7 +551,24 @@ void FlowGraph::localDataFlowAnalysis() {
504551
continue;
505552
}
506553
processWriteOpnds(BB, Inst, LLI);
554+
555+
if (Inst->isPseudoKill() && Inst->getDst() && !Inst->getDst()->isNullReg()) {
556+
G4_Declare *dcl = Inst->getDst()->getTopDcl();
557+
pesudoKilledDcls.insert(dcl);
558+
// In case the use in another BB is analyzed before the define
559+
if (!globalOpndHT.isOpndGlobal(Inst->getDst())) {
560+
G4_Declare *dcl = Inst->getDst()->getTopDcl();
561+
dcl->setIsBBLocal(true);
562+
}
563+
}
564+
507565
processReadOpnds(BB, Inst, LLI);
566+
if (pesudoKilledDcls
567+
.size()) { // Process the operand using variable which
568+
// has pseudo kill. Since the scan is from back to
569+
// front, exposed use will make variable global
570+
processReadOpndsForPseudoKill(BB, Inst, pesudoKilledDcls);
571+
}
508572
}
509573

510574
// All left over live nodes are global.

visa/Optimizer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -834,7 +834,7 @@ void Optimizer::s0SubAfterRA() {
834834
kernel.fg.resetLocalDataFlowData();
835835
kernel.fg.localDataFlowAnalysis();
836836

837-
SRSubPassBeforeRA s0Sub(builder, kernel);
837+
SRSubPassAfterRA s0Sub(builder, kernel);
838838
s0Sub.run();
839839
}
840840

visa/Passes/SRSubstitution.cpp

Lines changed: 114 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ static bool regSortCompare(regMap map1, regMap map2) {
1919
return false;
2020
}
2121

22-
static bool regSortCompareBeforeRA(regMapBRA map1, regMapBRA map2) {
22+
static bool regSortCompareAfterRA(regMapBRA map1, regMapBRA map2) {
2323
if (map1.opndNum < map2.opndNum) {
2424
return true;
2525
} else if (map1.opndNum > map2.opndNum) {
@@ -430,7 +430,7 @@ void SRSubPass::SRSub(G4_BB *bb) {
430430

431431
// Check if current instruction is the candidate of sendi.
432432
// Recorded as candidate.
433-
bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
433+
bool SRSubPassAfterRA::isSRCandidateAfterRA(G4_INST *inst,
434434
regCandidatesBRA &dstSrcRegs) {
435435
if (!inst->isSend()) {
436436
return false;
@@ -482,6 +482,7 @@ bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
482482
int movInstNum = 0;
483483
int32_t firstDefID = 0x7FFFFFFF; // the ID of the first instruction define the
484484
std::vector<std::pair<Gen4_Operand_Number, unsigned>> notRemoveableMap;
485+
std::vector<G4_INST *> immMovs;
485486
for (auto I = inst->def_begin(), E = inst->def_end(); I != E; ++I) {
486487
auto &&def = *I;
487488

@@ -572,14 +573,64 @@ bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
572573

573574
// It's not global define
574575
if (!(builder.getIsKernel() && kernel.fg.getNumBB() == 1)) {
575-
if (kernel.fg.globalOpndHT.isOpndGlobal(dstRgn)) {
576+
if (kernel.fg.globalOpndHT.isOpndGlobal(dstRgn) && !dstRgn->getTopDcl()->getIsBBLocal()) {
576577
return false;
577578
}
578579
}
579580

580581
return true;
581582
};
582583

584+
// mov (16) r81.0<1>:f 0x8:f // $52:&54:
585+
// mov (16|M16) r89.0<1>:f 0x8:f // $53:&55:
586+
// mov (16) r82.0<1>:f 0x0:f // $54:&56:
587+
// mov (16|M16) r90.0<1>:f 0x0:f // $55:&57:
588+
// mov (16) r83.0<1>:f 0x0:f // $56:&58:
589+
// mov (16|M16) r91.0<1>:f 0x0:f // $57:&59:
590+
// mov (16) r84.0<1>:f 0x0:f // $58:&60:
591+
// mov (16|M16) r92.0<1>:f 0x0:f // $59:&61:
592+
// mov (16) r85.0<1>:f 0x0:f // $60:&62:
593+
// mov (16|M16) r93.0<1>:f 0x0:f // $61:&63:
594+
// mov (16) r86.0<1>:f 0x0:f // $62:&64:
595+
// mov (16|M16) r94.0<1>:f 0x0:f // $63:&65:
596+
// mov (16) r87.0<1>:f 0x0:f // $64:&66:
597+
// mov (16|M16) r95.0<1>:f 0x0:f // $65:&67:
598+
// mov (16) r88.0<1>:f 0x0:f // $66:&68:
599+
// mov (16|M16) r96.0<1>:f 0x0:f // $67:&69:
600+
// ==>
601+
// mov (16) r81.0<1>:f 0x8:f // $52:&54:
602+
// mov (16|M16) r89.0<1>:f 0x8:f // $53:&55:
603+
// mov (16) r82.0<1>:f 0x0:f // $54:&56:
604+
// mov (16|M16) r90.0<1>:f 0x0:f // $55:&57:
605+
//
606+
// Reuse r81, r89, r82, r90 in the gather send
607+
// Looks for a previously recorded immediate mov in immMovs that is
// equivalent to inst. Two movs are treated as equivalent when the immediate
// value, source type, dst type, dst horizontal stride, execution size and
// mask offset all match.
// Returns the matching recorded mov, or nullptr after appending inst to
// immMovs when no match exists.
// src0 of inst and of every entry in immMovs is converted with asImm()
// unconditionally, so only movs with an immediate src0 may be passed in.
auto getRemoveableImm = [this](G4_INST *inst,
608+
std::vector<G4_INST *> &immMovs) {
609+
G4_Operand *src = inst->getSrc(0);
610+
int64_t imm = src->asImm()->getImm();
611+
// Linear search over the immediate movs recorded so far.
for (size_t i = 0; i < immMovs.size(); i++) {
612+
G4_INST *imov = immMovs[i];
613+
G4_Operand *isrc = imov->getSrc(0);
614+
int64_t iimm = isrc->asImm()->getImm();
615+
if (imm == iimm &&
616+
src->getType() == isrc->getType() && // Same value and same type
617+
inst->getDst()->getType() ==
618+
imov->getDst()->getType() && // Same dst type
619+
inst->getDst()->asDstRegRegion()->getHorzStride() ==
620+
imov->getDst()
621+
->asDstRegRegion()
622+
->getHorzStride() && // Same dst horizontal stride
623+
inst->getExecSize() == imov->getExecSize() && // Same execution size
624+
inst->getMaskOffset() ==
625+
imov->getMaskOffset()) { // Same mask offset
626+
return imov;
627+
}
628+
}
629+
// No equivalent mov recorded yet: remember this one for later lookups.
immMovs.push_back(inst);
630+
631+
return (G4_INST *)nullptr;
632+
};
633+
583634
// If opndNum + offset is defined multiple times, it cannot be removed
584635
G4_Operand *dst = movInst->getDst();
585636
unsigned offset = dst->getLeftBound() / builder.getGRFSize();
@@ -604,6 +655,22 @@ bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
604655
movInstNum++;
605656
}
606657
} else {
658+
if (movInst->getSrc(0) && movInst->getSrc(0)->isImm()) {
659+
// Check if there is mov instruction with same imm value
660+
G4_INST *lvnMov = getRemoveableImm(movInst, immMovs);
661+
662+
if (lvnMov) {
663+
// The offset is the offset of original dst, which is used to identify
664+
// the original register used in send.
665+
// The opndNum is the opndNum of send.
666+
regMapBRA regPair(movInst, opndNum, offset,
667+
lvnMov->getDst()); // the lvn mov dst can be reused
668+
dstSrcRegs.dstSrcMap.push_back(regPair);
669+
firstDefID = std::min(firstDefID, def.first->getLocalId());
670+
movInstNum++;
671+
continue;
672+
}
673+
}
607674
notRemoveableMap.push_back(std::make_pair(opndNum, offset));
608675
}
609676
}
@@ -639,14 +706,14 @@ bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
639706
dstSrcRegs.firstDefID = firstDefID;
640707
// Sort according to the register order in the original payload
641708
std::sort(dstSrcRegs.dstSrcMap.begin(), dstSrcRegs.dstSrcMap.end(),
642-
regSortCompareBeforeRA);
709+
regSortCompareAfterRA);
643710

644711
return true;
645712
}
646713

647714
// Replace the send instruction with the payload of
648715
// Insert the scalar register initialization mov instructions.
649-
bool SRSubPassBeforeRA::replaceWithSendiBeforeRA(G4_BB *bb,
716+
bool SRSubPassAfterRA::replaceWithSendiAfterRA(G4_BB *bb,
650717
INST_LIST_ITER instIter,
651718
regCandidatesBRA &dstSrcRegs) {
652719
G4_INST *inst = *instIter;
@@ -784,7 +851,7 @@ bool SRSubPassBeforeRA::replaceWithSendiBeforeRA(G4_BB *bb,
784851
return true;
785852
}
786853

787-
void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
854+
void SRSubPassAfterRA::SRSubAfterRA(G4_BB *bb) {
788855
bb->resetLocalIds();
789856

790857
class CmpFirstDef {
@@ -803,7 +870,7 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
803870
G4_INST *inst = *ii;
804871

805872
regCandidatesBRA dstSrcRegs;
806-
if (!isSRCandidateBeforeRA(inst, dstSrcRegs)) {
873+
if (!isSRCandidateAfterRA(inst, dstSrcRegs)) {
807874
ii++;
808875
dstSrcRegs.dstSrcMap.clear();
809876
continue;
@@ -840,12 +907,26 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
840907
candidatesIt = candidates.find(inst);
841908
//Is candidate send
842909
if (candidatesIt != candidates.end()) {
843-
bool overwrite = false;
844910
// Scan backward from the send instruction.
845911
INST_LIST_RITER scan_ri = ri;
846912
scan_ri++;
847913
G4_INST *rInst = *scan_ri;
914+
848915
while (rInst->getLocalId() > candidates[inst].firstDefID) {
916+
if (rInst->isDead()) {
917+
// If the inst is marked as dead, its dst will not kill other values.
918+
// For example, in the following case, if the third instruction is removed, the r64
919+
// value of the first instruction is kept.
920+
// mov (16) r16.0<1>:ud r64.0<1;1,0>:ud // $214:&226:
921+
// mov (16) r17.0<1>:ud r66.0<1;1,0>:ud // $216:&228:
922+
// mov (16) r64.0<1>:ud r68.0<1;1,0>:ud // $218:&230:
923+
scan_ri++;
924+
if (scan_ri == rend) {
925+
break;
926+
}
927+
rInst = *scan_ri;
928+
continue;
929+
}
849930
G4_Operand *dst = rInst->getDst();
850931
if (dst && !dst->isNullReg()) {
851932
G4_VarBase *base = dst->getBase();
@@ -879,16 +960,22 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
879960
G4_Operand *dst = rInst->getDst();
880961
unsigned short dstRegLB = dst->getLinearizedStart();
881962
unsigned short dstRegRB = dst->getLinearizedEnd();
882-
for (int i = 0; i < (int)candidates[inst].dstSrcMap.size(); i++) {
883-
int srcRegLB =
884-
candidates[inst].dstSrcMap[i].opnd->getLinearizedStart();
885-
int srcRegRB =
886-
candidates[inst].dstSrcMap[i].opnd->getLinearizedEnd();
887963

964+
// If there is any non-removable offset, the offset-defining mov
965+
// cannot be removed.
966+
std::vector<regMapBRA>::iterator dstSrcRegsIter;
967+
for (dstSrcRegsIter = candidates[inst].dstSrcMap.begin();
968+
dstSrcRegsIter != candidates[inst].dstSrcMap.end();) {
969+
std::vector<regMapBRA>::iterator nextIter = dstSrcRegsIter;
970+
nextIter++;
971+
int srcRegLB = (*dstSrcRegsIter).opnd->getLinearizedStart();
972+
int srcRegRB = (*dstSrcRegsIter).opnd->getLinearizedEnd();
888973
if (!(srcRegRB < dstRegLB || srcRegLB > dstRegRB)) {
889974
// Register is reused.
890-
overwrite = true;
891-
break;
975+
dstSrcRegsIter =
976+
candidates[inst].dstSrcMap.erase(dstSrcRegsIter);
977+
} else {
978+
dstSrcRegsIter = nextIter;
892979
}
893980
}
894981
}
@@ -900,22 +987,24 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
900987
}
901988
rInst = *scan_ri;
902989
}
903-
if (overwrite) {
990+
991+
// Because of the extra mov for s0, don't use s0 if at most 1 mov
992+
// inst can be removed.
993+
if (candidates[inst].dstSrcMap.size() <= 1 &&
994+
builder.getuint32Option(vISA_EnableGatherWithImmPreRA) !=
995+
INDIRECT_TYPE::ALWAYS_S0) {
904996
candidates.erase(candidatesIt);
997+
} else {
998+
for (int j = 0; j < (int)candidatesIt->second.dstSrcMap.size(); j++) {
999+
G4_INST *movInst = candidatesIt->second.dstSrcMap[j].inst;
1000+
movInst->markDead();
1001+
}
9051002
}
9061003
}
9071004

9081005
ri++;
9091006
}
9101007

911-
for (candidatesIt = candidates.begin(); candidatesIt != candidates.end();
912-
candidatesIt++) {
913-
for (int i = 0; i < (int)candidatesIt->second.dstSrcMap.size(); i++) {
914-
G4_INST *movInst = candidatesIt->second.dstSrcMap[i].inst;
915-
movInst->markDead();
916-
}
917-
}
918-
9191008
// Replace the send instruction with sendi
9201009
// Remove the mov instructions that marked as dead
9211010
INST_LIST_ITER iter;
@@ -926,7 +1015,7 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
9261015

9271016
candidatesIt = candidates.find(inst);
9281017
if (candidatesIt != candidates.end()) {
929-
replaceWithSendiBeforeRA(bb, curIter, candidates[inst]);
1018+
replaceWithSendiAfterRA(bb, curIter, candidates[inst]);
9301019
}
9311020
if (inst->isDead()) {
9321021
bb->erase(curIter);

0 commit comments

Comments
 (0)