@@ -135,8 +135,11 @@ class GenXPrologEpilogInsertion
135135 // caller side argument layout
136136 void generateStackCall (CallInst *CI);
137137
138- // generateStackCall subroutine
138+ // generateStackCall subroutines: writing args, extracting args
139139 unsigned writeArgs (CallInst *CI, Value *SpArgs, IRBuilder<> &IRB);
140+ std::vector<std::pair<Instruction *, Instruction *>>
141+ buildWorkList (CallInst *CI, Value *OrigSp, bool UseMemForRet);
142+ void extractResults (CallInst *CI, Value *OrigSp, IRBuilder<> &IRB);
140143
141144 void generateAlloca (CallInst *CI);
142145
@@ -464,15 +467,15 @@ void GenXPrologEpilogInsertion::generateFunctionEpilog(Function &F,
464467unsigned GenXPrologEpilogInsertion::writeArgs (CallInst *CI, Value *SpArgs,
465468 IRBuilder<> &IRB) {
466469 unsigned Offset = 0 ;
467- std::map<Value *, Value *> ReplaceArgs;
470+ std::vector<std::pair<int , Value *>> ReplaceArgs; // ArgNo, Arg
471+ ReplaceArgs.reserve (CI->getNumArgOperands ());
468472
469473 for (auto &Arg : CI->arg_operands ()) {
470474 // it is tempting to skip here if Arg already is in ReplaceArgs map
471475 // but it will be wrong to do so, because consider:
472476 // foo(x, x, y, y, x, y)
473477 // on callee side we are expecting 6 positions in predef args
474478 // we can not optimize these out on caller side
475-
476479 auto *OrigTy = Arg->getType ();
477480 if (OrigTy->getScalarType ()->isIntegerTy (1 )) {
478481 if (!HandleMaskArgs)
@@ -502,51 +505,28 @@ unsigned GenXPrologEpilogInsertion::writeArgs(CallInst *CI, Value *SpArgs,
502505 if (OrigTy->getScalarType ()->isIntegerTy (1 ))
503506 ArgRegWrite = cast<Instruction>(
504507 IRB.CreateBitOrPointerCast (ArgRegWrite,OrigTy));
505- ReplaceArgs[ Arg] = ArgRegWrite;
508+ ReplaceArgs. emplace_back ( Arg. getOperandNo (), ArgRegWrite) ;
506509 Offset += ArgSize;
507510 }
508511 }
509512
510- for (auto &&Pair : ReplaceArgs)
511- CI->replaceUsesOfWith (Pair.first , Pair.second );
513+ // ">=" rather than "==" is used here to account for the argument tail being passed through memory
514+ IGC_ASSERT_MESSAGE (CI->getNumArgOperands () >= ReplaceArgs.size (),
515+ " ReplaceArgs too large" );
516+ for (auto &&NewArg : ReplaceArgs)
517+ CI->setArgOperand (NewArg.first , NewArg.second );
512518 return Offset;
513519}
514520
515- // generate caller site of stack call
516- void GenXPrologEpilogInsertion::generateStackCall (CallInst *CI) {
517- LLVM_DEBUG (dbgs () << " Generating stack call for:\n " );
518- LLVM_DEBUG (CI->dump ());
519- LLVM_DEBUG (dbgs () << " \n " );
520- IRBuilder<> IRB (CI);
521- Value *OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
522- IRB.getInt64Ty (), true );
523- // write args, return total offset in arg register
524- unsigned Offset = writeArgs (CI, OrigSp, IRB);
525-
526- CI->setMetadata (
527- InstMD::FuncArgSize,
528- MDNode::get (CI->getContext (),
529- ConstantAsMetadata::get (IRB.getInt32 (
530- (Offset + ST->getGRFWidth () - 1 ) / ST->getGRFWidth ()))));
531- bool isVoidCall = CI->getType ()->isVoidTy ();
532- CI->setMetadata (
533- InstMD::FuncRetSize,
534- MDNode::get (CI->getContext (),
535- ConstantAsMetadata::get (IRB.getInt32 (divideCeil (
536- (isVoidCall ? 0
537- : (DL->getTypeSizeInBits (CI->getType ())) /
538- genx::ByteBits),
539- ST->getGRFWidth ())))));
540- if (isVoidCall)
541- return ;
542- IRB.SetInsertPoint (CI->getNextNode ());
543- bool UseMemForRet =
544- ForceRetMemPassing ||
545- DL->getTypeSizeInBits (CI->getType ()) / genx::ByteBits > RetRegSize;
546- if (UseMemForRet)
547- OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
548- IRB.getInt64Ty (), CI, true );
549- // read retvalue
521+ // build worklist for extraction
522+ // worklist entry format:
523+ // first: actual return
524+ // second: return insertion point
525+ // the distinction might be critical for structure returns because of the
526+ // unusual convention used for returning structures
527+ std::vector<std::pair<Instruction *, Instruction *>>
528+ GenXPrologEpilogInsertion::buildWorkList (CallInst *CI, Value *OrigSp,
529+ bool UseMemForRet) {
550530 std::vector<std::pair<Instruction *, Instruction *>> Worklist;
551531 if (isa<StructType>(CI->getType ())) {
552532 for (auto *U : CI->users ()) {
@@ -556,7 +536,26 @@ void GenXPrologEpilogInsertion::generateStackCall(CallInst *CI) {
556536 Worklist.push_back ({cast<Instruction>(U), cast<Instruction>(U)});
557537 }
558538 } else
539+ // OrigSP as instruction is read.predef.reg
559540 Worklist.push_back ({CI, UseMemForRet ? cast<Instruction>(OrigSp) : CI});
541+ return Worklist;
542+ }
543+
544+ // extract results from stack call return
545+ void GenXPrologEpilogInsertion::extractResults (CallInst *CI, Value *OrigSp,
546+ IRBuilder<> &IRB) {
547+ IRB.SetInsertPoint (CI->getNextNode ());
548+ bool UseMemForRet =
549+ ForceRetMemPassing ||
550+ DL->getTypeSizeInBits (CI->getType ()) / genx::ByteBits > RetRegSize;
551+ if (UseMemForRet)
552+ OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
553+ IRB.getInt64Ty (), CI, true );
554+
555+ // collect return slots
556+ auto Worklist = buildWorkList (CI, OrigSp, UseMemForRet);
557+
558+ // process return slots
560559 for (auto &I : Worklist) {
561560 auto *ActualRet = I.first ;
562561 IRB.SetInsertPoint (I.second ->getNextNode ());
@@ -628,6 +627,38 @@ void GenXPrologEpilogInsertion::generateStackCall(CallInst *CI) {
628627 }
629628}
630629
630+ // generate caller site of stack call: write the arguments via writeArgs, attach FuncArgSize / FuncRetSize metadata to CI, and for non-void calls extract the results via extractResults
631+ void GenXPrologEpilogInsertion::generateStackCall (CallInst *CI) {
632+ LLVM_DEBUG (dbgs () << " Generating stack call for:\n " );
633+ LLVM_DEBUG (CI->dump ());
634+ LLVM_DEBUG (dbgs () << " \n " );
635+ IRBuilder<> IRB (CI);
636+ Value *OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
637+ IRB.getInt64Ty (), true );
638+ // write args (builder is positioned at CI, i.e. before the call); writeArgs returns the total offset used in the arg register
639+ unsigned Offset = writeArgs (CI, OrigSp, IRB);
640+
641+ CI->setMetadata (
642+ InstMD::FuncArgSize,
643+ MDNode::get (CI->getContext (),
644+ ConstantAsMetadata::get (IRB.getInt32 (
645+ (Offset + ST->getGRFWidth () - 1 ) / ST->getGRFWidth ())))); // Offset divided by GRF width, rounded up
646+ bool isVoidCall = CI->getType ()->isVoidTy ();
647+ CI->setMetadata (
648+ InstMD::FuncRetSize,
649+ MDNode::get (CI->getContext (),
650+ ConstantAsMetadata::get (IRB.getInt32 (divideCeil (
651+ (isVoidCall ? 0
652+ : (DL->getTypeSizeInBits (CI->getType ())) /
653+ genx::ByteBits),
654+ ST->getGRFWidth ()))))); // return type size (bits / ByteBits) divided by GRF width, rounded up; 0 for void calls
655+ if (isVoidCall)
656+ return ; // nothing to extract for a void call
657+
658+ // read retvalue: extractResults repositions IRB after the call itself
659+ extractResults (CI, OrigSp, IRB);
660+ }
661+
631662// alloca_base = FE_SP
632663// FE_SP += sizeof(alloca)
633664void GenXPrologEpilogInsertion::generateAlloca (CallInst *CI) {
0 commit comments