@@ -9224,6 +9224,59 @@ void GlobalRA::addCalleeSavePseudoCode() {
92249224 builder.instList .clear ();
92259225}
92269226
// Saves CE0 to the stack frame in the function prolog.
//
// Does nothing unless the vISA_storeCE option is set. Otherwise two
// instructions are placed at the entry block's first insert position: a SIMD1
// WriteEnable mov of CE0 into a fresh GRF-sized temporary, followed by a
// spill-style store of that temporary at BE_FP + one GRF. The byte offset of
// the slot is always recorded in the kernel debug info; the store instruction
// itself is recorded only when vISA_GenerateDebugInfo is set.
//
// NOTE(review): this text comes from a diff view; the leading space inside the
// two string literals below looks like an extraction artifact - confirm
// against the upstream file before relying on the exact literal contents.
9227+ void GlobalRA::storeCEInProlog () {
9228+ if (!kernel.getOption (vISA_storeCE))
9229+ return ;
9230+
9231+ // When CE has to be stored in the prolog, we emit:
9232+ // TmpReg (GRF_Aligned) = CE0.0
9233+ // Store TmpReg @ FP+Offset
9234+ //
9235+ // where Offset = 1 GRF size in bytes.
9236+
9237+ // Create a new variable exactly one GRF in size so it is always GRF aligned.
9238+ // It is transitory, so it should not impact register pressure. CE0.0 is
9239+ // written to the 0th location of this variable so that the variable can be
9240+ // used as the send payload.
9241+ auto TmpReg = builder.createDeclare (
9242+ " TmpCEReg" , G4_GRF, builder.numEltPerGRF <Type_UD>(), 1 , Type_UD);
9243+ auto *DstRgn = builder.createDstRegRegion (TmpReg, 1 );
9244+ auto *CEReg = regPool.getMask0Reg ();
9245+ auto *SrcOpnd = builder.createSrc (
9246+ CEReg, 0 , 0 , kernel.fg .builder ->getRegionScalar (), Type_UD);
9247+ auto Mov = builder.createMov (g4::SIMD1, DstRgn, SrcOpnd,
9248+ G4_InstOption::InstOpt_WriteEnable, false );
// Insert the mov at the entry block's first insert position and keep the
// returned position so the store can be placed directly after the mov.
9249+ auto nextPos = kernel.fg .getEntryBB ()->insertBefore (
9250+ kernel.fg .getEntryBB ()->getFirstInsertPos (), Mov);
9251+
// The whole temporary is the payload of the store; the store produces no
// result, hence the null destination.
9252+ auto payloadSrc =
9253+ builder.createSrcRegRegion (TmpReg, builder.getRegionStride1 ());
// NOTE(review): execSize is fixed at 8 while the temporary is sized from
// numEltPerGRF<Type_UD>() - confirm 8 is intended for every GRF width.
9254+ const unsigned execSize = 8 ;
9255+ G4_DstRegRegion *postDst = builder.createNullDst (Type_UD);
9256+ G4_INST *store = nullptr ;
// One GRF expressed in units of getHWordByteSize() bytes (integer division).
9257+ unsigned int HWOffset = builder.numEltPerGRF <Type_UB>() / getHWordByteSize ();
// The CE slot begins one GRF past FP, so a frame descriptor larger than one
// GRF would overlap it.
9258+ vISA_ASSERT (kernel.stackCall .getFrameDescriptorByteSize () <=
9259+ builder.numEltPerGRF <Type_UB>(),
9260+ " ce0 overwrote FDE" );
// Record the slot's byte offset in the debug info. Unlike setSaveCEInst below,
// this call is not guarded by vISA_GenerateDebugInfo.
9261+ kernel.getKernelDebugInfo ()->setCESaveOffset (HWOffset * getHWordByteSize ());
9262+
// When LSC is supported, the createSpill overload that takes an explicit
// header operand (from getSpillFillHeader) is used; otherwise the header-less
// overload is used. All other arguments are the same in both branches.
9263+ if (builder.supportsLSC ()) {
9264+ auto headerOpnd = getSpillFillHeader (*kernel.fg .builder , nullptr );
9265+ store = builder.createSpill (postDst, headerOpnd, payloadSrc,
9266+ G4_ExecSize (execSize), 1 , HWOffset,
9267+ builder.getBEFP (), InstOpt_WriteEnable, false );
9268+ } else {
9269+ store = builder.createSpill (postDst, payloadSrc, G4_ExecSize (execSize), 1 ,
9270+ HWOffset, builder.getBEFP (),
9271+ InstOpt_WriteEnable, false );
9272+ }
// Place the store immediately after the mov inserted above.
9273+ kernel.fg .getEntryBB ()->insertAfter (nextPos, store);
9274+
// Hand the store instruction to the debug info only when debug info is
// being generated.
9275+ if (builder.kernel .getOption (vISA_GenerateDebugInfo)) {
9276+ builder.kernel .getKernelDebugInfo ()->setSaveCEInst (store);
9277+ }
9278+ }
9279+
92279280//
92289281// Insert store r125.[0-4] at entry and restore before return.
92299282// Dst of store will be a hardwired temp at upper end of caller save area.
@@ -10622,6 +10675,7 @@ void GlobalRA::stackCallSaveRestore(bool hasStackCall) {
1062210675 // Only GENX sub-graphs require callee-save code.
1062310676
1062410677 if (builder.getIsKernel () == false ) {
10678+ storeCEInProlog ();
1062510679 addCalleeSavePseudoCode ();
1062610680 addStoreRestoreToReturn ();
1062710681 }
@@ -11205,8 +11259,13 @@ int GlobalRA::coloringRegAlloc() {
1120511259
1120611260 if (kernel.fg .getIsStackCallFunc ()) {
1120711261 // Allocate space to store Frame Descriptor
11208- nextSpillOffset += 32 ;
11209- scratchOffset += 32 ;
11262+ nextSpillOffset += builder.numEltPerGRF <Type_UB>();
11263+ scratchOffset += builder.numEltPerGRF <Type_UB>();
11264+
11265+ if (kernel.getOption (vISA_storeCE)) {
11266+ nextSpillOffset += builder.numEltPerGRF <Type_UB>();
11267+ scratchOffset += builder.numEltPerGRF <Type_UB>();
11268+ }
1121011269 }
1121111270
1121211271 // Global linear scan RA
0 commit comments