@@ -6879,7 +6879,8 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
68796879 void Optimizer::HWWorkaround ()
68806880 {
68816881 if ((kernel.getInt32KernelAttr (Attributes::ATTR_Target) == VISA_CM) &&
6882- builder.getJitInfo ()->spillMemUsed > 0 && builder.hasFusedEUWA ())
6882+ builder.hasFusedEUWA () &&
6883+ (builder.getJitInfo ()->spillMemUsed > 0 || builder.getJitInfo ()->numFlagSpillStore > 0 ))
68836884 {
68846885 // For now, do it for CM/VC. Will turn it on for all.
68856886 doNoMaskWA_postRA ();
@@ -11593,6 +11594,10 @@ void Optimizer::doNoMaskWA()
1159311594 for (auto II = BB->begin (), IE = BB->end (); II != IE; ++II)
1159411595 {
1159511596 G4_INST* I = *II;
11597+
11598+ // Mark all instructions as created by preRA to avoid re-processing them postRA
11599+ I->setCreatedPreRA (true );
11600+
1159611601 if (!isCandidateInst (I, fg))
1159711602 {
1159811603 continue ;
@@ -11779,21 +11784,48 @@ void Optimizer::doNoMaskWA()
1177911784// // scratch space spill: SP_GRF_V77_3 from offset[4x32];
1178011785// (W) send.dc0 (16|M0) null r0 r4 0x80 0x020F1004
1178111786//
11787+ // For flag spill:
11788+ // Need WA as well due to the following case:
11789+ //
11790+ // After RA:
11791+ // BB_19:
11792+ // (W) mov (1|M0) r34.8<1>:uw f0.1<0;1,0>:uw
11793+ // ...
11794+ // BB_21:
11795+ // (W) mov (1|M0) f1.1<1>:uw r34.8<0;1,0>:uw
11796+ //
11797+ // If BB_19 should be skipped but runs due to this HW bug, r34.8 will be updated
11798+ // with f0.1, which holds an undefined value. Then at BB_21, reading from r34.8
11799+ // will get a garbage value!
11800+ //
1178211801// Note this works only for NoMaskWA=2
1178311802//
1178411803void Optimizer::doNoMaskWA_postRA ()
1178511804{
1178611805 std::vector<INST_LIST_ITER> NoMaskCandidates;
1178711806 G4_ExecSize simdsize = fg.getKernel ()->getSimdSize ();
11807+ const bool HasFlagSpill = (builder.getJitInfo ()->numFlagSpillStore > 0 );
11808+
11809+ auto isCandidate = [&](G4_INST* I) {
11810+ if (I->getCreatedPreRA () || !I->isWriteEnableInst ())
11811+ {
11812+ return false ;
11813+ }
1178811814
11789- auto isCandidate = [](G4_INST* I) {
11790- if (I->isSend () && I->isWriteEnableInst () &&
11791- I->getPredicate () == nullptr &&
11815+ // If it is a global flag spill or a global GRF spill, the WA is needed.
11816+ // For now, a check for whether the spill is global is not available.
11817+
11818+ // 1. flag spill
11819+ if (HasFlagSpill &&
11820+ I->isMov () && I->getSrc (0 ) && I->getSrc (0 )->isFlag () &&
11821+ I->getExecSize () == g4::SIMD1 && I->getPredicate () == nullptr )
11822+ {
11823+ return true ;
11824+ }
11825+ // 2. GRF spill
11826+ if (I->isSend () && I->getPredicate () == nullptr &&
1179211827 (I->getDst () == nullptr || I->getDst ()->isNullReg ()))
1179311828 {
11794- // This shall be a spill (write).
11795- // May check if the spilled var is global. We only need
11796- // to do WA for global spill!
1179711829 return true ;
1179811830 }
1179911831 return false ;
@@ -11851,6 +11883,16 @@ void Optimizer::doNoMaskWA_postRA()
1185111883 // (W & f0.0.any16h) send (16|M0) ...
1185211884 // 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
1185311885 //
11886+ // For flag spill, the sequence is the same as above, except when the WAFlag
11887+ // is the same as the spilled flag. For example,
11888+ //
11889+ // (W) mov (1|M0) r34.8<1>:uw f0.0<0;1,0>:uw
11890+ //
11891+ // 1. (W) mov (1|M0) DW0:uw f0.0<0;1,0>:uw // save
11892+ // 2. (W) mov (1|M0) f0.0<1>:uw DW1:uw // WARestore
11893+ // (W & f0.0.any16h) mov r34.8<1>:uw DW0.0<0;1,0>:uw
11894+ // 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
11895+ //
1185411896 // Todo: check if save/restore is needed to avoid redundant save/restore.
1185511897 //
1185611898 G4_Declare* saveTmp = builder.getEUFusionWATmpVar (); // 2DW;
@@ -11891,10 +11933,15 @@ void Optimizer::doNoMaskWA_postRA()
1189111933 // Without optimization, always do save/restore
1189211934 bool needSave = true ;
1189311935 bool needRestore = true ;
11936+
11937+ // wa flag register to use f(wafregnum, wafsregnum)
11938+ uint32_t wafregnum = 0 ;
11939+ uint32_t wafsregnum = 0 ;
11940+
1189411941 G4_Type Ty = (simdsize > 16 ) ? Type_UD : Type_UW;
1189511942 G4_Declare* flagDcl = builder.createTempFlag ((Ty == Type_UW ? 1 : 2 ), " waflag" );
1189611943 G4_RegVar* flagVar = flagDcl->getRegVar ();
11897- flagVar->setPhyReg (builder.phyregpool .getFlagAreg (0 ), 0 );
11944+ flagVar->setPhyReg (builder.phyregpool .getFlagAreg (wafregnum ), wafsregnum );
1189811945
1189911946 // Save flag, create WA mask, save WAflag
1190011947 createMov1 (BB, WAInsts[0 ], saveVar, saveOff, flagVar, 0 , Ty); // save
@@ -11915,6 +11962,24 @@ void Optimizer::doNoMaskWA_postRA()
1191511962 G4_INST* I = *currII;
1191611963 G4_Predicate* newPred = builder.createPredicate (
1191711964 PredState_Plus, flagVar, 0 , waPredCtrl);
11965+ if (I->isMov () && I->getSrc (0 ) && I->getSrc (0 )->isFlag ())
11966+ {
11967+ G4_SrcRegRegion* srcReg = I->getSrc (0 )->asSrcRegRegion ();
11968+ G4_RegVar* baseVar = static_cast <G4_RegVar*>(srcReg->getBase ());
11969+ assert (baseVar->isPhyRegAssigned ());
11970+
11971+ // For a flag, G4_Areg holds the flag number and G4_RegVar holds the subRegOff.
11972+ // (SrcRegRegion's regOff/subRegOff is always 0/0.)
11973+ G4_Areg* flagReg = baseVar->getPhyReg ()->getAreg ();
11974+ uint32_t subRegOff = baseVar->getPhyRegOff ();
11975+ if (flagReg->getFlagNum () == wafregnum &&
11976+ (Ty == Type_UD /* 32bit flag */ || subRegOff == wafsregnum /* 16bit flag */ ))
11977+ {
11978+ G4_SrcRegRegion* S = builder.createSrc (
11979+ saveVar, 0 , saveOff, builder.getRegionScalar (), Ty);
11980+ I->setSrc (S, 0 );
11981+ }
11982+ }
1191811983 I->setPredicate (newPred);
1191911984
1192011985 if (i == (sz - 1 ) || needRestore) {
0 commit comments