Skip to content

Commit b99bdd4

Browse files
bcheng0127igcbot
authored andcommitted
gather send optimization
gather send optimization
1 parent 374a909 commit b99bdd4

File tree

5 files changed

+221
-35
lines changed

5 files changed

+221
-35
lines changed

visa/G4_Declare.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ class G4_Declare {
9494
uint16_t forceSpilled : 1;
9595
uint16_t exclusiveLoad : 1;
9696
uint16_t isCmpUseOnly : 1;
97+
// indicate if the declare is local referenced only
98+
// Especially for the variable with pseudo_kill,
99+
// which will be removed in removeLifetimeOps pass.
100+
uint16_t isBBLocal : 1;
97101

98102
unsigned declId; // global decl id for this builder
99103

@@ -334,6 +338,9 @@ class G4_Declare {
334338
void setIsCmpUseOnly(bool b) { isCmpUseOnly = b; }
335339
bool getIsCmpUseOnly() const { return isCmpUseOnly; }
336340

341+
// Stores b into the isBBLocal bit of this declare.
void setIsBBLocal(bool b) { isBBLocal = b; }
342+
// Returns the current value of the isBBLocal bit of this declare.
bool getIsBBLocal() const { return isBBLocal; }
343+
337344
unsigned getNumRegNeeded() const;
338345

339346
void emit(std::ostream &output) const;

visa/LocalDataflow.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,47 @@ static void processReadOpnds(G4_BB *BB, G4_INST *Inst, LocalLivenessInfo &LLI) {
453453
}
454454
}
455455

456+
static void
457+
processReadOpndsForPseudoKill(G4_BB *BB, G4_INST *Inst,
458+
std::unordered_set<G4_Declare *> &pseudoKills) {
459+
if (Inst->isPseudoKill()) {
460+
return;
461+
}
462+
// (1) Indirect dst operand reads address.
463+
G4_DstRegRegion *Dst = Inst->getDst();
464+
if (Dst && Dst->isIndirect()) {
465+
G4_Declare *dcl = Dst->getTopDcl();
466+
if (pseudoKills.find(dcl) != pseudoKills.end()) {
467+
dcl->setIsBBLocal(false);
468+
pseudoKills.erase(dcl);
469+
}
470+
}
471+
472+
// (2) Direct and indirect source operands.
473+
for (auto OpNum :
474+
{Gen4_Operand_Number::Opnd_src0, Gen4_Operand_Number::Opnd_src1,
475+
Gen4_Operand_Number::Opnd_src2, Gen4_Operand_Number::Opnd_src3,
476+
Gen4_Operand_Number::Opnd_src4, Gen4_Operand_Number::Opnd_src5,
477+
Gen4_Operand_Number::Opnd_src6, Gen4_Operand_Number::Opnd_src7,
478+
Gen4_Operand_Number::Opnd_pred, Gen4_Operand_Number::Opnd_implAccSrc}) {
479+
G4_Operand *opnd = Inst->getOperand(OpNum);
480+
if (opnd == nullptr || opnd->isImm() || opnd->isNullReg() ||
481+
opnd->isLabel())
482+
continue;
483+
484+
G4_Declare *dcl = nullptr;
485+
if (Inst->isPseudoAddrMovIntrinsic()) {
486+
dcl =opnd->asAddrExp()->getRegVar()->getDeclare();
487+
} else {
488+
dcl = opnd->getTopDcl();
489+
}
490+
if (pseudoKills.find(dcl) != pseudoKills.end()) {
491+
dcl->setIsBBLocal(false);
492+
pseudoKills.erase(dcl);
493+
}
494+
}
495+
}
496+
456497
// Process writes. If this is a partial definition, then record this partial
457498
// definition. When all partial definitions together define this live read node,
458499
// it is killed and du/ud links are added.
@@ -487,6 +528,12 @@ static void processWriteOpnds(G4_BB *BB, G4_INST *Inst,
487528
}
488529

489530
void FlowGraph::localDataFlowAnalysis() {
531+
// For pseudo kill variable
532+
// If there is use exposed in a BB, it's treated as global.
533+
// Otherwise, it's treated as local even the same pseudo kill may appear in
534+
// multiple BBs
535+
std::unordered_set<G4_Declare *> pesudoKilledDcls;
536+
490537
for (auto BB : BBs) {
491538
LocalLivenessInfo LLI(!BB->isAllLaneActive());
492539
for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) {
@@ -504,7 +551,24 @@ void FlowGraph::localDataFlowAnalysis() {
504551
continue;
505552
}
506553
processWriteOpnds(BB, Inst, LLI);
554+
555+
if (Inst->isPseudoKill() && Inst->getDst() && !Inst->getDst()->isNullReg()) {
556+
G4_Declare *dcl = Inst->getDst()->getTopDcl();
557+
pesudoKilledDcls.insert(dcl);
558+
// In case the use in anther BB is analyzed before define
559+
if (!globalOpndHT.isOpndGlobal(Inst->getDst())) {
560+
G4_Declare *dcl = Inst->getDst()->getTopDcl();
561+
dcl->setIsBBLocal(true);
562+
}
563+
}
564+
507565
processReadOpnds(BB, Inst, LLI);
566+
if (pesudoKilledDcls
567+
.size()) { // Process the operand using variable which
568+
// has pseudo kill. Since the scan is from back to
569+
// front, exposed use will make variable global
570+
processReadOpndsForPseudoKill(BB, Inst, pesudoKilledDcls);
571+
}
508572
}
509573

510574
// All left over live nodes are global.

visa/Optimizer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -834,7 +834,7 @@ void Optimizer::s0SubAfterRA() {
834834
kernel.fg.resetLocalDataFlowData();
835835
kernel.fg.localDataFlowAnalysis();
836836

837-
SRSubPassBeforeRA s0Sub(builder, kernel);
837+
SRSubPassAfterRA s0Sub(builder, kernel);
838838
s0Sub.run();
839839
}
840840

visa/Passes/SRSubstitution.cpp

Lines changed: 140 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ static bool regSortCompare(regMap map1, regMap map2) {
1919
return false;
2020
}
2121

22-
static bool regSortCompareBeforeRA(regMapBRA map1, regMapBRA map2) {
22+
static bool regSortCompareAfterRA(regMapBRA map1, regMapBRA map2) {
2323
if (map1.opndNum < map2.opndNum) {
2424
return true;
2525
} else if (map1.opndNum > map2.opndNum) {
@@ -430,7 +430,7 @@ void SRSubPass::SRSub(G4_BB *bb) {
430430

431431
// Check if current instruction is the candidate of sendi.
432432
// Recorded as candidate.
433-
bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
433+
bool SRSubPassAfterRA::isSRCandidateAfterRA(G4_INST *inst,
434434
regCandidatesBRA &dstSrcRegs) {
435435
if (!inst->isSend()) {
436436
return false;
@@ -482,6 +482,7 @@ bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
482482
int movInstNum = 0;
483483
int32_t firstDefID = 0x7FFFFFFF; // the ID of the first instruction define the
484484
std::vector<std::pair<Gen4_Operand_Number, unsigned>> notRemoveableMap;
485+
std::vector<G4_INST *> immMovs;
485486
for (auto I = inst->def_begin(), E = inst->def_end(); I != E; ++I) {
486487
auto &&def = *I;
487488

@@ -572,14 +573,90 @@ bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
572573

573574
// It's not global define
574575
if (!(builder.getIsKernel() && kernel.fg.getNumBB() == 1)) {
575-
if (kernel.fg.globalOpndHT.isOpndGlobal(dstRgn)) {
576+
if (kernel.fg.globalOpndHT.isOpndGlobal(dstRgn) && !dstRgn->getTopDcl()->getIsBBLocal()) {
576577
return false;
577578
}
578579
}
579580

580581
return true;
581582
};
582583

584+
// mov (16) r81.0<1>:f 0x8:f // $52:&54:
585+
// mov (16|M16) r89.0<1>:f 0x8:f // $53:&55:
586+
// mov (16) r82.0<1>:f 0x0:f // $54:&56:
587+
// mov (16|M16) r90.0<1>:f 0x0:f // $55:&57:
588+
// mov (16) r83.0<1>:f 0x0:f // $56:&58:
589+
// mov (16|M16) r91.0<1>:f 0x0:f // $57:&59:
590+
// mov (16) r84.0<1>:f 0x0:f // $58:&60:
591+
// mov (16|M16) r92.0<1>:f 0x0:f // $59:&61:
592+
// mov (16) r85.0<1>:f 0x0:f // $60:&62:
593+
// mov (16|M16) r93.0<1>:f 0x0:f // $61:&63:
594+
// mov (16) r86.0<1>:f 0x0:f // $62:&64:
595+
// mov (16|M16) r94.0<1>:f 0x0:f // $63:&65:
596+
// mov (16) r87.0<1>:f 0x0:f // $64:&66:
597+
// mov (16|M16) r95.0<1>:f 0x0:f // $65:&67:
598+
// mov (16) r88.0<1>:f 0x0:f // $66:&68:
599+
// mov (16|M16) r96.0<1>:f 0x0:f // $67:&69:
600+
// ==>
601+
// mov (16) r81.0<1>:f 0x8:f // $52:&54:
602+
// mov (16|M16) r89.0<1>:f 0x8:f // $53:&55:
603+
// mov (16) r82.0<1>:f 0x0:f // $54:&56:
604+
// mov (16|M16) r90.0<1>:f 0x0:f // $55:&57:
605+
//
606+
// Reuse r81, r89, r82, r90 in the gather send
607+
auto getRemoveableImm = [this](G4_INST *inst,
608+
std::vector<G4_INST *> &immMovs) {
609+
// The instruction is only used for payload preparation.
610+
if (inst->use_size() != 1) {
611+
return (G4_INST *)nullptr;
612+
}
613+
614+
G4_DstRegRegion *dst = inst->getDst();
615+
// dst GRF aligned and contigous
616+
if (dst->getSubRegOff() || dst->getHorzStride() != 1) {
617+
return (G4_INST *)nullptr;
618+
}
619+
620+
if (kernel.fg.globalOpndHT.isOpndGlobal(dst)) {
621+
return (G4_INST *)nullptr;
622+
}
623+
624+
// GRF Alignment with physical register assigned
625+
if (dst->getLinearizedStart() % builder.getGRFSize() != 0) {
626+
return (G4_INST *)nullptr;
627+
}
628+
629+
// If the destination operand size is less than 1 GRF
630+
if ((dst->getLinearizedEnd() - dst->getLinearizedStart() + 1) <
631+
builder.getGRFSize()) {
632+
return (G4_INST *)nullptr;
633+
}
634+
635+
G4_Operand *src = inst->getSrc(0);
636+
int64_t imm = src->asImm()->getImm();
637+
for (size_t i = 0; i < immMovs.size(); i++) {
638+
G4_INST *imov = immMovs[i];
639+
G4_Operand *isrc = imov->getSrc(0);
640+
int64_t iimm = isrc->asImm()->getImm();
641+
if (imm == iimm &&
642+
src->getType() == isrc->getType() && // Same value and same type
643+
inst->getDst()->getType() ==
644+
imov->getDst()->getType() && // Same dst type
645+
inst->getDst()->asDstRegRegion()->getHorzStride() ==
646+
imov->getDst()
647+
->asDstRegRegion()
648+
->getHorzStride() && // Same region
649+
inst->getExecSize() == imov->getExecSize() && // Same execution size
650+
inst->getMaskOffset() ==
651+
imov->getMaskOffset()) { // Same mask offset
652+
return imov;
653+
}
654+
}
655+
immMovs.push_back(inst);
656+
657+
return (G4_INST *)nullptr;
658+
};
659+
583660
// If opndNum + offset is defined multiple times, it cannot be removed
584661
G4_Operand *dst = movInst->getDst();
585662
unsigned offset = dst->getLeftBound() / builder.getGRFSize();
@@ -604,6 +681,22 @@ bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
604681
movInstNum++;
605682
}
606683
} else {
684+
if (movInst->getSrc(0) && movInst->getSrc(0)->isImm()) {
685+
// Check if there is mov instruction with same imm value
686+
G4_INST *lvnMov = getRemoveableImm(movInst, immMovs);
687+
688+
if (lvnMov) {
689+
// The offset is the offset of original dst, which is used to identify
690+
// the original register used in send.
691+
// The opndNum is the opndNum of send.
692+
regMapBRA regPair(movInst, opndNum, offset,
693+
lvnMov->getDst()); // the lvn mov dst can be reused
694+
dstSrcRegs.dstSrcMap.push_back(regPair);
695+
firstDefID = std::min(firstDefID, def.first->getLocalId());
696+
movInstNum++;
697+
continue;
698+
}
699+
}
607700
notRemoveableMap.push_back(std::make_pair(opndNum, offset));
608701
}
609702
}
@@ -639,14 +732,14 @@ bool SRSubPassBeforeRA::isSRCandidateBeforeRA(G4_INST *inst,
639732
dstSrcRegs.firstDefID = firstDefID;
640733
// Sort according to the register order in the original payload
641734
std::sort(dstSrcRegs.dstSrcMap.begin(), dstSrcRegs.dstSrcMap.end(),
642-
regSortCompareBeforeRA);
735+
regSortCompareAfterRA);
643736

644737
return true;
645738
}
646739

647740
// Replace the send instruction with the payload of
648741
// Insert the scalar register initialization mov instructions.
649-
bool SRSubPassBeforeRA::replaceWithSendiBeforeRA(G4_BB *bb,
742+
bool SRSubPassAfterRA::replaceWithSendiAfterRA(G4_BB *bb,
650743
INST_LIST_ITER instIter,
651744
regCandidatesBRA &dstSrcRegs) {
652745
G4_INST *inst = *instIter;
@@ -784,7 +877,7 @@ bool SRSubPassBeforeRA::replaceWithSendiBeforeRA(G4_BB *bb,
784877
return true;
785878
}
786879

787-
void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
880+
void SRSubPassAfterRA::SRSubAfterRA(G4_BB *bb) {
788881
bb->resetLocalIds();
789882

790883
class CmpFirstDef {
@@ -803,7 +896,7 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
803896
G4_INST *inst = *ii;
804897

805898
regCandidatesBRA dstSrcRegs;
806-
if (!isSRCandidateBeforeRA(inst, dstSrcRegs)) {
899+
if (!isSRCandidateAfterRA(inst, dstSrcRegs)) {
807900
ii++;
808901
dstSrcRegs.dstSrcMap.clear();
809902
continue;
@@ -840,12 +933,26 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
840933
candidatesIt = candidates.find(inst);
841934
//Is candidate send
842935
if (candidatesIt != candidates.end()) {
843-
bool overwrite = false;
844936
// Scan backward from the send instruction.
845937
INST_LIST_RITER scan_ri = ri;
846938
scan_ri++;
847939
G4_INST *rInst = *scan_ri;
940+
848941
while (rInst->getLocalId() > candidates[inst].firstDefID) {
942+
if (rInst->isDead()) {
943+
// If the inst is marked as dead, its dst will not kill other values
944+
// Such as in following case, if third instruction is removed, r64
945+
// value of first instruction is kept.
946+
// mov (16) r16.0<1>:ud r64.0<1;1,0>:ud // $214:&226:
947+
// mov (16) r17.0<1>:ud r66.0<1;1,0>:ud // $216:&228:
948+
// mov (16) r64.0<1>:ud r68.0<1;1,0>:ud // $218:&230:
949+
scan_ri++;
950+
if (scan_ri == rend) {
951+
break;
952+
}
953+
rInst = *scan_ri;
954+
continue;
955+
}
849956
G4_Operand *dst = rInst->getDst();
850957
if (dst && !dst->isNullReg()) {
851958
G4_VarBase *base = dst->getBase();
@@ -879,16 +986,22 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
879986
G4_Operand *dst = rInst->getDst();
880987
unsigned short dstRegLB = dst->getLinearizedStart();
881988
unsigned short dstRegRB = dst->getLinearizedEnd();
882-
for (int i = 0; i < (int)candidates[inst].dstSrcMap.size(); i++) {
883-
int srcRegLB =
884-
candidates[inst].dstSrcMap[i].opnd->getLinearizedStart();
885-
int srcRegRB =
886-
candidates[inst].dstSrcMap[i].opnd->getLinearizedEnd();
887989

990+
// If there is any non-removable offset, the mov defining that offset
991+
// cannot be removed.
992+
std::vector<regMapBRA>::iterator dstSrcRegsIter;
993+
for (dstSrcRegsIter = candidates[inst].dstSrcMap.begin();
994+
dstSrcRegsIter != candidates[inst].dstSrcMap.end();) {
995+
std::vector<regMapBRA>::iterator nextIter = dstSrcRegsIter;
996+
nextIter++;
997+
int srcRegLB = (*dstSrcRegsIter).opnd->getLinearizedStart();
998+
int srcRegRB = (*dstSrcRegsIter).opnd->getLinearizedEnd();
888999
if (!(srcRegRB < dstRegLB || srcRegLB > dstRegRB)) {
8891000
// Register is reused.
890-
overwrite = true;
891-
break;
1001+
dstSrcRegsIter =
1002+
candidates[inst].dstSrcMap.erase(dstSrcRegsIter);
1003+
} else {
1004+
dstSrcRegsIter = nextIter;
8921005
}
8931006
}
8941007
}
@@ -900,22 +1013,24 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
9001013
}
9011014
rInst = *scan_ri;
9021015
}
903-
if (overwrite) {
1016+
1017+
// s0 needs an extra mov, so don't use s0 if at most 1 mov
1018+
// inst can be removed.
1019+
if (candidates[inst].dstSrcMap.size() <= 1 &&
1020+
builder.getuint32Option(vISA_EnableGatherWithImmPreRA) !=
1021+
INDIRECT_TYPE::ALWAYS_S0) {
9041022
candidates.erase(candidatesIt);
1023+
} else {
1024+
for (int j = 0; j < (int)candidatesIt->second.dstSrcMap.size(); j++) {
1025+
G4_INST *movInst = candidatesIt->second.dstSrcMap[j].inst;
1026+
movInst->markDead();
1027+
}
9051028
}
9061029
}
9071030

9081031
ri++;
9091032
}
9101033

911-
for (candidatesIt = candidates.begin(); candidatesIt != candidates.end();
912-
candidatesIt++) {
913-
for (int i = 0; i < (int)candidatesIt->second.dstSrcMap.size(); i++) {
914-
G4_INST *movInst = candidatesIt->second.dstSrcMap[i].inst;
915-
movInst->markDead();
916-
}
917-
}
918-
9191034
// Replace the send instruction with sendi
9201035
// Remove the mov instructions that marked as dead
9211036
INST_LIST_ITER iter;
@@ -926,7 +1041,7 @@ void SRSubPassBeforeRA::SRSubBeforeRA(G4_BB *bb) {
9261041

9271042
candidatesIt = candidates.find(inst);
9281043
if (candidatesIt != candidates.end()) {
929-
replaceWithSendiBeforeRA(bb, curIter, candidates[inst]);
1044+
replaceWithSendiAfterRA(bb, curIter, candidates[inst]);
9301045
}
9311046
if (inst->isDead()) {
9321047
bb->erase(curIter);

0 commit comments

Comments
 (0)