@@ -3744,8 +3744,18 @@ void CodeGen::genStructReturn(GenTree* treeNode)
3744
3744
// genAllocLclFrame: Probe the stack and allocate the local stack frame: subtract from SP.
3745
3745
//
3746
3746
// Notes:
3747
- // On ARM64, this only does the probing; allocating the frame is done when
3748
- // callee-saved registers are saved.
3747
+ // On ARM64, this only does the probing; allocating the frame is done when callee-saved registers are saved.
3748
+ // This is done before anything has been pushed. The previous frame might have a large outgoing argument
3749
+ // space that has been allocated, but the lowest addresses have not been touched. Our frame setup might
3750
+ // not touch up to the first 504 bytes. This means we could miss a guard page. On Windows, however,
3751
+ // there are always three guard pages, so we will not miss them all.
3752
+ //
3753
+ // On ARM32, the first instruction of the prolog is always a push (which touches the lowest address
3754
+ // of the stack), either of the LR register or of some argument registers, e.g., in the case of
3755
+ // pre-spilling. The LR register is always pushed because we require it to allow for GC return
3756
+ // address hijacking (see the comment in CodeGen::genPushCalleeSavedRegisters()). These pushes
3757
+ // happen immediately before calling this function, so the SP at the current location has already
3758
+ // been touched.
3749
3759
//
3750
3760
void CodeGen::genAllocLclFrame (unsigned frameSize, regNumber initReg, bool * pInitRegZeroed, regMaskTP maskArgRegsLiveIn)
3751
3761
{
@@ -3763,24 +3773,28 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni
3763
3773
if (frameSize < pageSize)
3764
3774
{
3765
3775
#ifdef _TARGET_ARM_
3766
- // Frame size is (0x0008 ..0x1000)
3776
+ // Frame size is (0x0000..0x1000). No probing necessary.
3767
3777
inst_RV_IV (INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
3768
3778
#endif // _TARGET_ARM_
3769
3779
}
3770
3780
else if (frameSize < compiler->getVeryLargeFrameSize ())
3771
3781
{
3772
- // Frame size is (0x1000..0x3000)
3773
-
3774
- instGen_Set_Reg_To_Imm (EA_PTRSIZE, initReg, -(ssize_t )pageSize);
3775
- getEmitter ()->emitIns_R_R_R (INS_ldr, EA_4BYTE, initReg, REG_SPBASE, initReg);
3776
- regSet.verifyRegUsed (initReg);
3777
- *pInitRegZeroed = false ; // The initReg does not contain zero
3782
+ #if defined(_TARGET_ARM64_)
3783
+ regNumber rTemp = REG_ZR; // We don't need a register for the target of the dummy load
3784
+ #else
3785
+ regNumber rTemp = initReg;
3786
+ #endif
3778
3787
3779
- if ( frameSize >= 0x2000 )
3788
+ for ( target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize )
3780
3789
{
3781
- instGen_Set_Reg_To_Imm (EA_PTRSIZE, initReg, -2 * (ssize_t )pageSize);
3782
- getEmitter ()->emitIns_R_R_R (INS_ldr, EA_4BYTE, initReg, REG_SPBASE, initReg);
3790
+ // Generate:
3791
+ // movw initReg, -probeOffset
3792
+ // ldr rTemp, [SP + initReg] // load into initReg on ARM32, wzr on ARM64
3793
+
3794
+ instGen_Set_Reg_To_Imm (EA_PTRSIZE, initReg, -(ssize_t )probeOffset);
3795
+ getEmitter ()->emitIns_R_R_R (INS_ldr, EA_4BYTE, rTemp, REG_SPBASE, initReg);
3783
3796
regSet.verifyRegUsed (initReg);
3797
+ *pInitRegZeroed = false ; // The initReg does not contain zero
3784
3798
}
3785
3799
3786
3800
#ifdef _TARGET_ARM64_
@@ -3796,63 +3810,77 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni
3796
3810
// Frame size >= 0x3000
3797
3811
assert (frameSize >= compiler->getVeryLargeFrameSize ());
3798
3812
3799
- // Emit the following sequence to 'tickle' the pages.
3800
- // Note it is important that stack pointer not change until this is
3801
- // complete since the tickles could cause a stack overflow, and we
3802
- // need to be able to crawl the stack afterward (which means the
3803
- // stack pointer needs to be known).
3804
-
3805
- instGen_Set_Reg_To_Zero (EA_PTRSIZE, initReg);
3806
-
3807
- //
3808
- // Can't have a label inside the ReJIT padding area
3813
+ // Emit the following sequence to 'tickle' the pages. Note it is important that stack pointer not change
3814
+ // until this is complete since the tickles could cause a stack overflow, and we need to be able to crawl
3815
+ // the stack afterward (which means the stack pointer needs to be known).
3809
3816
//
3810
- genPrologPadForReJit ();
3817
+ // ARM64 needs 2 registers. ARM32 needs 3 registers. See VERY_LARGE_FRAME_SIZE_REG_MASK for how these
3818
+ // are reserved.
3811
3819
3812
- // TODO-ARM64-Bug?: set the availMask properly!
3813
- regMaskTP availMask =
3814
- (regSet.rsGetModifiedRegsMask () & RBM_ALLINT) | RBM_R12 | RBM_LR; // Set of available registers
3820
+ regMaskTP availMask = RBM_ALLINT & (regSet.rsGetModifiedRegsMask () | ~RBM_INT_CALLEE_SAVED);
3815
3821
availMask &= ~maskArgRegsLiveIn; // Remove all of the incoming argument registers as they are currently live
3816
3822
availMask &= ~genRegMask (initReg); // Remove the pre-calculated initReg
3817
3823
3818
3824
regNumber rOffset = initReg;
3819
3825
regNumber rLimit;
3820
- regNumber rTemp;
3821
3826
regMaskTP tempMask;
3822
3827
3828
+ #if defined(_TARGET_ARM64_)
3829
+
3830
+ regNumber rTemp = REG_ZR; // We don't need a register for the target of the dummy load
3831
+
3832
+ #else // _TARGET_ARM_
3833
+
3834
+ regNumber rTemp;
3835
+
3823
3836
// We pick the next lowest register number for rTemp
3824
3837
noway_assert (availMask != RBM_NONE);
3825
3838
tempMask = genFindLowestBit (availMask);
3826
3839
rTemp = genRegNumFromMask (tempMask);
3827
3840
availMask &= ~tempMask;
3828
3841
3842
+ #endif // _TARGET_ARM_
3843
+
3829
3844
// We pick the next lowest register number for rLimit
3830
3845
noway_assert (availMask != RBM_NONE);
3831
3846
tempMask = genFindLowestBit (availMask);
3832
3847
rLimit = genRegNumFromMask (tempMask);
3833
3848
availMask &= ~tempMask;
3834
3849
3835
- // TODO-LdStArch-Bug?: review this. The first time we load from [sp+0] which will always succeed. That doesn't
3836
- // make sense.
3837
- // TODO-ARM64-CQ: we could probably use ZR on ARM64 instead of rTemp.
3850
+ // Generate:
3838
3851
//
3852
+ // mov rOffset, -pageSize
3839
3853
// mov rLimit, -frameSize
3840
3854
// loop:
3841
- // ldr rTemp, [sp+rOffset]
3842
- // sub rOffset, 0x1000 // Note that 0x1000 on ARM32 uses the funky Thumb immediate encoding
3843
- // cmp rOffset, rLimit
3844
- // jge loop
3855
+ // ldr rTemp, [sp + rOffset] // rTemp = wzr on ARM64
3856
+ // sub rOffset, pageSize // Note that 0x1000 (normal ARM32 pageSize) on ARM32 uses the funky
3857
+ // // Thumb immediate encoding
3858
+ // cmp rLimit, rOffset
3859
+ // b.ls loop // If rLimit is lower or same, we need to probe this rOffset. Note
3860
+ // // especially that if it is the same, we haven't probed this page.
3861
+
3845
3862
noway_assert ((ssize_t )(int )frameSize == (ssize_t )frameSize); // make sure framesize safely fits within an int
3846
- instGen_Set_Reg_To_Imm (EA_PTRSIZE, rLimit, -(int )frameSize);
3863
+
3864
+ instGen_Set_Reg_To_Imm (EA_PTRSIZE, rOffset, -(ssize_t )pageSize);
3865
+ instGen_Set_Reg_To_Imm (EA_PTRSIZE, rLimit, -(ssize_t )frameSize);
3866
+
3867
+ //
3868
+ // Can't have a label inside the ReJIT padding area
3869
+ //
3870
+ genPrologPadForReJit ();
3871
+
3872
+ // There's a "virtual" label here. But we can't create a label in the prolog, so we use the magic
3873
+ // `emitIns_J` with a negative `instrCount` to branch back a specific number of instructions.
3874
+
3847
3875
getEmitter ()->emitIns_R_R_R (INS_ldr, EA_4BYTE, rTemp, REG_SPBASE, rOffset);
3848
- regSet.verifyRegUsed (rTemp);
3849
3876
#if defined(_TARGET_ARM_)
3877
+ regSet.verifyRegUsed (rTemp);
3850
3878
getEmitter ()->emitIns_R_I (INS_sub, EA_PTRSIZE, rOffset, pageSize);
3851
3879
#elif defined(_TARGET_ARM64_)
3852
3880
getEmitter ()->emitIns_R_R_I (INS_sub, EA_PTRSIZE, rOffset, rOffset, pageSize);
3853
- #endif // _TARGET_ARM64_
3854
- getEmitter ()->emitIns_R_R (INS_cmp, EA_PTRSIZE, rOffset, rLimit);
3855
- getEmitter ()->emitIns_J (INS_bhi , NULL , -4 );
3881
+ #endif
3882
+ getEmitter ()->emitIns_R_R (INS_cmp, EA_PTRSIZE, rLimit, rOffset); // If equal, we need to probe again
3883
+ getEmitter ()->emitIns_J (INS_bls , NULL , -4 );
3856
3884
3857
3885
*pInitRegZeroed = false ; // The initReg does not contain zero
3858
3886
0 commit comments