@@ -6894,6 +6894,14 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
68946894 // also return the call with jmpi in VISAKernelImpl::compilePostOptimize
68956895 replaceRetWithJmpi ();
68966896 }
6897+
6898+ if (kernel.hasIndirectCall ())
6899+ {
6900+ // If the indirect call has regiser src0, the register must be a
6901+ // ip-based address of the call target. Insert a add before call to
6902+ // calculate the relative offset from call to the target
6903+ expandIndirectCallWithRegTarget ();
6904+ }
68976905 }
68986906
68996907class NSDS {
@@ -8259,6 +8267,152 @@ void genBucket(G4_INST *send, Bucket *bucket, RW rwType) {
82598267 }
82608268 }
82618269
8270+ G4_Declare* Optimizer::createInstsForCallTargetOffset (
8271+ InstListType& insts, G4_INST* fcall, int64_t adjust_off)
8272+ {
8273+ // create instruction sequence:
8274+ // add r2.0 -IP call_target
8275+ // add r2.0 r2.0 adjust_off
8276+
8277+ // call's dst must be r1.0, which is reserved at
8278+ // GlobalRA::setABIForStackCallFunctionCalls. It must not be overlapped with
8279+ // r2.0, that is hardcoded as the new jump target
8280+ assert (fcall->getDst ()->isGreg ());
8281+ assert ((fcall->getDst ()->getLinearizedStart () / GENX_GRF_REG_SIZ) != 2 );
8282+
8283+ // hardcoded add's dst to r2.0
8284+ G4_Declare* add_dst_decl =
8285+ builder.createHardwiredDeclare (1 , fcall->getDst ()->getType (), 2 , 0 );
8286+
8287+ // create the first add instruction
8288+ // add r2.0 -IP call_target
8289+ G4_INST* add_inst = builder.createBinOp (
8290+ G4_add, 1 ,
8291+ builder.Create_Dst_Opnd_From_Dcl (add_dst_decl, 1 ),
8292+ builder.createSrcRegRegion (
8293+ Mod_Minus, Direct, builder.phyregpool .getIpReg (), 0 , 0 ,
8294+ builder.getRegionScalar (), Type_UD),
8295+ fcall->getSrc (0 ), InstOpt_WriteEnable | InstOpt_NoCompact, false );
8296+
8297+ // create the second add to add the -ip to adjust_off, adjust_off dependes
8298+ // on how many instructions from the fist add to the jmp instruction, and
8299+ // if it's post-increment (jmpi) or pre-increment (call)
8300+ // add r2.0 r2.0 adjust_off
8301+ G4_INST* add_inst2 = builder.createBinOp (
8302+ G4_add, 1 ,
8303+ builder.Create_Dst_Opnd_From_Dcl (add_dst_decl, 1 ),
8304+ builder.Create_Src_Opnd_From_Dcl (
8305+ add_dst_decl, builder.getRegionScalar ()),
8306+ builder.createImm (adjust_off, Type_D),
8307+ InstOpt_WriteEnable | InstOpt_NoCompact, false );
8308+
8309+ insts.push_back (add_inst);
8310+ insts.push_back (add_inst2);
8311+
8312+ return add_dst_decl;
8313+ }
8314+
8315+ void Optimizer::createInstForJmpiSequence (InstListType& insts, G4_INST* fcall)
8316+ {
8317+ // SKL workaround for indirect call
8318+ // r1.0 is the return IP (the instruction right after jmpi)
8319+ // r1.1 is the return mask. While we'll replace the ret in calee to jmpi as well,
8320+ // we do not need to consider the return mask here.
8321+
8322+ // Do not allow predicate call on jmpi WA
8323+ assert (fcall->getPredicate () == nullptr );
8324+
8325+ // calculate the reserved register's num from fcall's dst register (shoud be r1)
8326+ assert (fcall->getDst ()->isGreg ());
8327+ uint32_t reg_num = fcall->getDst ()->getLinearizedStart () / GENX_GRF_REG_SIZ;
8328+
8329+ G4_Declare* new_target_decl = createInstsForCallTargetOffset (insts, fcall, -64 );
8330+
8331+ // add r1.0 IP 32
8332+ G4_Declare* r1_0_decl =
8333+ builder.createHardwiredDeclare (1 , fcall->getDst ()->getType (), reg_num, 0 );
8334+ insts.push_back (builder.createBinOp (
8335+ G4_add, 1 ,
8336+ builder.Create_Dst_Opnd_From_Dcl (r1_0_decl, 1 ),
8337+ builder.createSrcRegRegion (
8338+ Mod_src_undef, Direct, builder.phyregpool .getIpReg (), 0 , 0 ,
8339+ builder.getRegionScalar (), Type_UD),
8340+ builder.createImm (32 , Type_UD),
8341+ InstOpt_WriteEnable | InstOpt_NoCompact, false ));
8342+
8343+ // jmpi r2.0
8344+ // update jump target (src0) to add's dst
8345+ G4_SrcRegRegion* jump_target = builder.Create_Src_Opnd_From_Dcl (
8346+ new_target_decl, builder.getRegionScalar ());
8347+ jump_target->setType (Type_D);
8348+ insts.push_back (builder.createJmp (nullptr , jump_target, InstOpt_NoCompact, false ));
8349+ }
8350+
8351+ void Optimizer::expandIndirectCallWithRegTarget ()
8352+ {
8353+ // check every fcall
8354+ for (auto bb : kernel.fg )
8355+ {
8356+ // At this point G4_pseudo_fcall may be converted to G4_call,
8357+ // check all call (???)
8358+ if (bb->back ()->isFCall ()) // || bb->back()->isCall())
8359+ {
8360+ G4_INST* fcall = bb->back ();
8361+ if (fcall->getSrc (0 )->isGreg () || fcall->getSrc (0 )->isA0 ()) {
8362+ // at this point the call instruction's src0 has the target_address
8363+ // and the call dst is the reserved register for ret
8364+ // All the caller save register should be saved. We usd r2.0 directly
8365+ // here to calculate the new call's target. We picked r2.0 due to the
8366+ // HW's limitation that call/calla's src and dst offset (the subreg num)
8367+ // must be 0.
8368+ //
8369+ // expand call
8370+ // From:
8371+ // call r1.0 call_target
8372+ // To:
8373+ // add r2.0 -IP call_target
8374+ // add r2.0 r2.0 -32
8375+ // call r1.0 r2.0
8376+
8377+ // For SKL workaround, expand call
8378+ // From:
8379+ // call r1.0 call_target
8380+ // To:
8381+ // add r2.0 -IP call_target
8382+ // add r2.0 r2.0 -64
8383+ // add r1.0 IP 32 // set the return IP
8384+ // jmpi r2.0
8385+
8386+ InstListType expanded_insts;
8387+ if (builder.needReplaceIndirectCallWithJmpi ()) {
8388+ createInstForJmpiSequence (expanded_insts, fcall);
8389+ }
8390+ else {
8391+ G4_Declare* jmp_target_decl =
8392+ createInstsForCallTargetOffset (expanded_insts, fcall, -32 );
8393+ // Updated call's target to the new target
8394+ G4_SrcRegRegion* jump_target = builder.Create_Src_Opnd_From_Dcl (
8395+ jmp_target_decl, builder.getRegionScalar ());
8396+ fcall->setSrc (jump_target, 0 );
8397+ fcall->setNoCompacted ();
8398+ }
8399+ // then insert the expaneded instructions right before the call
8400+ INST_LIST_ITER insert_point = bb->end ();
8401+ --insert_point;
8402+ for (auto inst_to_add : expanded_insts) {
8403+ bb->getInstList ().insert (insert_point, inst_to_add);
8404+ inst_to_add->setCISAOff (fcall->getCISAOff ());
8405+ }
8406+
8407+ // remove call from the instlist for Jmpi WA
8408+ if (builder.needReplaceIndirectCallWithJmpi ())
8409+ bb->getInstList ().erase (--bb->end ());
8410+ }
8411+ }
8412+ }
8413+
8414+ }
8415+
82628416 // Replace ret with jmpi, must be single return
82638417 void Optimizer::replaceRetWithJmpi ()
82648418 {
0 commit comments