@@ -10581,7 +10581,7 @@ void Optimizer::doNoMaskWA()
1058110581 // flagVar : emask for this BB.
1058210582 // Note that if 32-bit flag is used, flagVar and this instruction I's condMod
1058310583 // take two flag registers, leaving no flag for temporary. In this case, we
10584- // will do manual spill, ie, save and restore the original flag (case 1 and 3).
10584+ // will do manual spill, i.e., save and restore the original flag (cases 1.2 and 3).
1058510585 //
1058610586 // Before:
1058710587 // I: (W) cmp (16|M16) (ne)P D .... // 32-bit flag
@@ -10590,11 +10590,15 @@ void Optimizer::doNoMaskWA()
1059010590 //
1059110591 // After:
1059210592 // (1) D = null (common)
10593- // I0: (W) mov (1|M0) save:ud P<0;1,0>:ud
10594- // I: (W) cmp (16|M16) (ne)P ....
10595- // I1: (W&-flagVar) mov (1|M0) P save:ud
10593+ // 1.1) Not SIMD32 and P is a 16-bit modifier (less chance of a flag spill)
10594+ // I: (W) cmp (16|M0) (ne)nP ....
10595+ // I0: (W&flagVar) mov (1|M0) P nP
10596+ // 1.2) General case (save flag into GRF to avoid flag spill)
10597+ // I0: (W) mov (1|M0) save:ud P<0;1,0>:ud
10598+ // I: (W) cmp (16|M16) (ne)P ....
10599+ // I1: (W&-flagVar) mov (1|M0) P save:ud
1059610600 // (2) 'I' uses 16-bit flag (common)
10597- // I0: (W) mov (1) nP<1>:uw flagVar.0 <0;1,0>:uw
10601+ // I0: (W) mov (1) nP<1>:uw flagVar<0;1,0>:uw
1059810602 // I: (W&nP) cmp (16|M0) (ne)nP ....
1059910603 // I1: (W&flagVar) mov (1|M0) P<1>:uw nP<0;1,0>:uw
1060010604 // (3) otherwise(less common)
@@ -10625,34 +10629,64 @@ void Optimizer::doNoMaskWA()
1062510629 G4_Type Ty = (modDcl->getWordSize () > 1 ) ? Type_UD : Type_UW;
1062610630 if (I->hasNULLDst ())
1062710631 { // case 1
10628- G4_Declare* saveDecl = builder.createTempVar (1 , Ty, Any, " saveTmp" );
10629- G4_RegVar* saveVar = saveDecl->getRegVar ();
10630- G4_SrcRegRegion* I0S0 = builder.createSrc (
10631- modDcl->getRegVar (),
10632- 0 , 0 , builder.getRegionScalar (), Ty);
10633- G4_DstRegRegion* D0 = builder.createDst (saveVar, 0 , 0 , 1 , Ty);
10634- G4_INST* I0 = builder.createMov (g4::SIMD1, D0, I0S0, InstOpt_WriteEnable, false );
10635- currBB->insertBefore (currII, I0);
10632+ if (flagVar->getDeclare ()->getTotalElems () == 1 && Ty == Type_UW)
10633+ { // case 1.1
10634+ assert (I->getExecSize () != g4::SIMD32);
1063610635
10637- auto nextII = currII;
10638- ++nextII;
10639- G4_SrcRegRegion* I1S0 = builder.createSrc (saveVar,
10640- 0 , 0 , builder.getRegionScalar (), Ty);
10641- G4_DstRegRegion* D1 = builder.createDst (
10642- modDcl->getRegVar (), 0 , 0 , 1 , Ty);
10643- G4_INST* I1 = builder.createMov (g4::SIMD1, D1, I1S0, InstOpt_WriteEnable, false );
10644- G4_Predicate* flag = builder.createPredicate (
10645- PredState_Minus, flagVar, 0 , getPredCtrl (useAnyh));
10646- I1->setPredicate (flag);
10647- currBB->insertBefore (nextII, I1);
10636+ // Use 16-bit flag
10637+ G4_Declare* nPDecl = builder.createTempFlag (1 , " nP" );
10638+ G4_RegVar* nPVar = nPDecl->getRegVar ();
1064810639
10649- flagVarDefInst-> addDefUse (I1, Opnd_pred );
10650- I0-> addDefUse (I1, Opnd_src0 );
10640+ G4_CondMod* nM = builder. createCondMod (P-> getMod (), nPVar, 0 );
10641+ I-> setCondMod (nM );
1065110642
10652- if (!condModGlb)
10653- {
10654- // Copy condMod uses to I1.
10655- I->copyUsesTo (I1, false );
10643+ auto nextII = currII;
10644+ ++nextII;
10645+
10646+ G4_SrcRegRegion* I0S0 = builder.createSrc (nPVar,
10647+ 0 , 0 , builder.getRegionScalar (), Ty);
10648+ G4_DstRegRegion* I0D0 = builder.createDst (
10649+ modDcl->getRegVar (), 0 , 0 , 1 , Ty);
10650+ G4_INST* I0 = builder.createMov (g4::SIMD1, I0D0, I0S0, InstOpt_WriteEnable, false );
10651+ G4_Predicate* flag = builder.createPredicate (
10652+ PredState_Plus, flagVar, 0 , getPredCtrl (useAnyh));
10653+ I0->setPredicate (flag);
10654+ currBB->insertBefore (nextII, I0);
10655+
10656+ flagVarDefInst->addDefUse (I0, Opnd_pred);
10657+ I->addDefUse (I0, Opnd_src0);
10658+ }
10659+ else
10660+ { // case 1.2
10661+ G4_Declare* saveDecl = builder.createTempVar (1 , Ty, Any, " saveTmp" );
10662+ G4_RegVar* saveVar = saveDecl->getRegVar ();
10663+ G4_SrcRegRegion* I0S0 = builder.createSrc (
10664+ modDcl->getRegVar (),
10665+ 0 , 0 , builder.getRegionScalar (), Ty);
10666+ G4_DstRegRegion* D0 = builder.createDst (saveVar, 0 , 0 , 1 , Ty);
10667+ G4_INST* I0 = builder.createMov (g4::SIMD1, D0, I0S0, InstOpt_WriteEnable, false );
10668+ currBB->insertBefore (currII, I0);
10669+
10670+ auto nextII = currII;
10671+ ++nextII;
10672+ G4_SrcRegRegion* I1S0 = builder.createSrc (saveVar,
10673+ 0 , 0 , builder.getRegionScalar (), Ty);
10674+ G4_DstRegRegion* D1 = builder.createDst (
10675+ modDcl->getRegVar (), 0 , 0 , 1 , Ty);
10676+ G4_INST* I1 = builder.createMov (g4::SIMD1, D1, I1S0, InstOpt_WriteEnable, false );
10677+ G4_Predicate* flag = builder.createPredicate (
10678+ PredState_Minus, flagVar, 0 , getPredCtrl (useAnyh));
10679+ I1->setPredicate (flag);
10680+ currBB->insertBefore (nextII, I1);
10681+
10682+ flagVarDefInst->addDefUse (I1, Opnd_pred);
10683+ I0->addDefUse (I1, Opnd_src0);
10684+
10685+ if (!condModGlb)
10686+ {
10687+ // Copy condMod uses to I1.
10688+ I->copyUsesTo (I1, false );
10689+ }
1065610690 }
1065710691 return ;
1065810692 }
0 commit comments