Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 5ecae61

Browse files
authored
Merge pull request #23817 from wtgodbe/LateMerge
Catch-up recent changes in Master into release/3.0
2 parents 4f09845 + 8826bef commit 5ecae61

File tree

12 files changed

+666
-53
lines changed

12 files changed

+666
-53
lines changed

src/jit/codegenarm64.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2236,15 +2236,13 @@ void CodeGen::genLclHeap(GenTree* tree)
22362236
// case SP is on the last byte of the guard page. Thus you must
22372237
// touch SP-0 first not SP-0x1000.
22382238
//
2239-
// Another subtlety is that you don't want SP to be exactly on the
2240-
// boundary of the guard page because PUSH is predecrement, thus
2241-
// call setup would not touch the guard page but just beyond it
2239+
// This is similar to the prolog code in CodeGen::genAllocLclFrame().
22422240
//
22432241
// Note that we go through a few hoops so that SP never points to
2244-
// illegal pages at any time during the tickling process
2242+
// illegal pages at any time during the tickling process.
22452243
//
22462244
// subs regCnt, SP, regCnt // regCnt now holds ultimate SP
2247-
// bvc Loop // result is smaller than orignial SP (no wrap around)
2245+
// bvc Loop // result is smaller than original SP (no wrap around)
22482246
// mov regCnt, #0 // Overflow, pick lowest possible value
22492247
//
22502248
// Loop:

src/jit/codegenarmarch.cpp

Lines changed: 67 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3744,8 +3744,18 @@ void CodeGen::genStructReturn(GenTree* treeNode)
37443744
// genAllocLclFrame: Probe the stack and allocate the local stack frame: subtract from SP.
37453745
//
37463746
// Notes:
3747-
// On ARM64, this only does the probing; allocating the frame is done when
3748-
// callee-saved registers are saved.
3747+
// On ARM64, this only does the probing; allocating the frame is done when callee-saved registers are saved.
3748+
// This is done before anything has been pushed. The previous frame might have a large outgoing argument
3749+
// space that has been allocated, but the lowest addresses have not been touched. Our frame setup might
3750+
// not touch up to the first 504 bytes. This means we could miss a guard page. On Windows, however,
3751+
// there are always three guard pages, so we will not miss them all.
3752+
//
3753+
// On ARM32, the first instruction of the prolog is always a push (which touches the lowest address
3754+
// of the stack), either of the LR register or of some argument registers, e.g., in the case of
3755+
// pre-spilling. The LR register is always pushed because we require it to allow for GC return
3756+
// address hijacking (see the comment in CodeGen::genPushCalleeSavedRegisters()). These pushes
3757+
// happen immediately before calling this function, so the SP at the current location has already
3758+
// been touched.
37493759
//
37503760
void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn)
37513761
{
@@ -3763,24 +3773,28 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni
37633773
if (frameSize < pageSize)
37643774
{
37653775
#ifdef _TARGET_ARM_
3766-
// Frame size is (0x0008..0x1000)
3776+
// Frame size is (0x0000..0x1000). No probing necessary.
37673777
inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
37683778
#endif // _TARGET_ARM_
37693779
}
37703780
else if (frameSize < compiler->getVeryLargeFrameSize())
37713781
{
3772-
// Frame size is (0x1000..0x3000)
3773-
3774-
instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)pageSize);
3775-
getEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, initReg, REG_SPBASE, initReg);
3776-
regSet.verifyRegUsed(initReg);
3777-
*pInitRegZeroed = false; // The initReg does not contain zero
3782+
#if defined(_TARGET_ARM64_)
3783+
regNumber rTemp = REG_ZR; // We don't need a register for the target of the dummy load
3784+
#else
3785+
regNumber rTemp = initReg;
3786+
#endif
37783787

3779-
if (frameSize >= 0x2000)
3788+
for (target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize)
37803789
{
3781-
instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -2 * (ssize_t)pageSize);
3782-
getEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, initReg, REG_SPBASE, initReg);
3790+
// Generate:
3791+
// movw initReg, -probeOffset
3792+
// ldr rTemp, [SP + initReg] // load into initReg on arm32, wzr on ARM64
3793+
3794+
instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)probeOffset);
3795+
getEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, rTemp, REG_SPBASE, initReg);
37833796
regSet.verifyRegUsed(initReg);
3797+
*pInitRegZeroed = false; // The initReg does not contain zero
37843798
}
37853799

37863800
#ifdef _TARGET_ARM64_
@@ -3796,63 +3810,77 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni
37963810
// Frame size >= 0x3000
37973811
assert(frameSize >= compiler->getVeryLargeFrameSize());
37983812

3799-
// Emit the following sequence to 'tickle' the pages.
3800-
// Note it is important that stack pointer not change until this is
3801-
// complete since the tickles could cause a stack overflow, and we
3802-
// need to be able to crawl the stack afterward (which means the
3803-
// stack pointer needs to be known).
3804-
3805-
instGen_Set_Reg_To_Zero(EA_PTRSIZE, initReg);
3806-
3807-
//
3808-
// Can't have a label inside the ReJIT padding area
3813+
// Emit the following sequence to 'tickle' the pages. Note it is important that stack pointer not change
3814+
// until this is complete since the tickles could cause a stack overflow, and we need to be able to crawl
3815+
// the stack afterward (which means the stack pointer needs to be known).
38093816
//
3810-
genPrologPadForReJit();
3817+
// ARM64 needs 2 registers. ARM32 needs 3 registers. See VERY_LARGE_FRAME_SIZE_REG_MASK for how these
3818+
// are reserved.
38113819

3812-
// TODO-ARM64-Bug?: set the availMask properly!
3813-
regMaskTP availMask =
3814-
(regSet.rsGetModifiedRegsMask() & RBM_ALLINT) | RBM_R12 | RBM_LR; // Set of available registers
3820+
regMaskTP availMask = RBM_ALLINT & (regSet.rsGetModifiedRegsMask() | ~RBM_INT_CALLEE_SAVED);
38153821
availMask &= ~maskArgRegsLiveIn; // Remove all of the incoming argument registers as they are currently live
38163822
availMask &= ~genRegMask(initReg); // Remove the pre-calculated initReg
38173823

38183824
regNumber rOffset = initReg;
38193825
regNumber rLimit;
3820-
regNumber rTemp;
38213826
regMaskTP tempMask;
38223827

3828+
#if defined(_TARGET_ARM64_)
3829+
3830+
regNumber rTemp = REG_ZR; // We don't need a register for the target of the dummy load
3831+
3832+
#else // _TARGET_ARM_
3833+
3834+
regNumber rTemp;
3835+
38233836
// We pick the next lowest register number for rTemp
38243837
noway_assert(availMask != RBM_NONE);
38253838
tempMask = genFindLowestBit(availMask);
38263839
rTemp = genRegNumFromMask(tempMask);
38273840
availMask &= ~tempMask;
38283841

3842+
#endif // _TARGET_ARM_
3843+
38293844
// We pick the next lowest register number for rLimit
38303845
noway_assert(availMask != RBM_NONE);
38313846
tempMask = genFindLowestBit(availMask);
38323847
rLimit = genRegNumFromMask(tempMask);
38333848
availMask &= ~tempMask;
38343849

3835-
// TODO-LdStArch-Bug?: review this. The first time we load from [sp+0] which will always succeed. That doesn't
3836-
// make sense.
3837-
// TODO-ARM64-CQ: we could probably use ZR on ARM64 instead of rTemp.
3850+
// Generate:
38383851
//
3852+
// mov rOffset, -pageSize
38393853
// mov rLimit, -frameSize
38403854
// loop:
3841-
// ldr rTemp, [sp+rOffset]
3842-
// sub rOffset, 0x1000 // Note that 0x1000 on ARM32 uses the funky Thumb immediate encoding
3843-
// cmp rOffset, rLimit
3844-
// jge loop
3855+
// ldr rTemp, [sp + rOffset] // rTemp = wzr on ARM64
3856+
// sub rOffset, pageSize // Note that 0x1000 (normal ARM32 pageSize) on ARM32 uses the funky
3857+
// // Thumb immediate encoding
3858+
// cmp rLimit, rOffset
3859+
// b.ls loop // If rLimit is lower or same, we need to probe this rOffset. Note
3860+
// // especially that if it is the same, we haven't probed this page.
3861+
38453862
noway_assert((ssize_t)(int)frameSize == (ssize_t)frameSize); // make sure framesize safely fits within an int
3846-
instGen_Set_Reg_To_Imm(EA_PTRSIZE, rLimit, -(int)frameSize);
3863+
3864+
instGen_Set_Reg_To_Imm(EA_PTRSIZE, rOffset, -(ssize_t)pageSize);
3865+
instGen_Set_Reg_To_Imm(EA_PTRSIZE, rLimit, -(ssize_t)frameSize);
3866+
3867+
//
3868+
// Can't have a label inside the ReJIT padding area
3869+
//
3870+
genPrologPadForReJit();
3871+
3872+
// There's a "virtual" label here. But we can't create a label in the prolog, so we use the magic
3873+
// `emitIns_J` with a negative `instrCount` to branch back a specific number of instructions.
3874+
38473875
getEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, rTemp, REG_SPBASE, rOffset);
3848-
regSet.verifyRegUsed(rTemp);
38493876
#if defined(_TARGET_ARM_)
3877+
regSet.verifyRegUsed(rTemp);
38503878
getEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, rOffset, pageSize);
38513879
#elif defined(_TARGET_ARM64_)
38523880
getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, rOffset, rOffset, pageSize);
3853-
#endif // _TARGET_ARM64_
3854-
getEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, rOffset, rLimit);
3855-
getEmitter()->emitIns_J(INS_bhi, NULL, -4);
3881+
#endif
3882+
getEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, rLimit, rOffset); // If equal, we need to probe again
3883+
getEmitter()->emitIns_J(INS_bls, NULL, -4);
38563884

38573885
*pInitRegZeroed = false; // The initReg does not contain zero
38583886

src/jit/codegencommon.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7385,17 +7385,15 @@ void CodeGen::genFinalizeFrame()
73857385
}
73867386
#endif // _TARGET_X86_
73877387

7388-
#if defined(_TARGET_ARMARCH_)
7388+
#if defined(_TARGET_ARM_)
73897389
// We need to determine if we will change SP larger than a specific amount to determine if we want to use a loop
73907390
// to touch stack pages, that will require multiple registers. See genAllocLclFrame() for details.
73917391
if (compiler->compLclFrameSize >= compiler->getVeryLargeFrameSize())
73927392
{
73937393
regSet.rsSetRegsModified(VERY_LARGE_FRAME_SIZE_REG_MASK);
73947394
}
7395-
#endif // defined(_TARGET_ARMARCH_)
73967395

7397-
#if defined(_TARGET_ARM_)
7398-
// If there are any reserved registers, add them to the
7396+
// If there are any reserved registers, add them to the modified set.
73997397
if (regSet.rsMaskResvd != RBM_NONE)
74007398
{
74017399
regSet.rsSetRegsModified(regSet.rsMaskResvd);

src/jit/codegenxarch.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2611,7 +2611,7 @@ void CodeGen::genLclHeap(GenTree* tree)
26112611
// loop:
26122612
// test ESP, [ESP+0] // tickle the page
26132613
// mov REGTMP, ESP
2614-
// sub REGTMP, GetOsPageSize()
2614+
// sub REGTMP, eeGetPageSize()
26152615
// mov ESP, REGTMP
26162616
// cmp ESP, REGCNT
26172617
// jae loop

src/jit/compiler.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10491,11 +10491,14 @@ extern const BYTE genActualTypes[];
1049110491

1049210492
// VERY_LARGE_FRAME_SIZE_REG_MASK is the set of registers we need to use for
1049310493
// the probing loop generated for very large stack frames (see `getVeryLargeFrameSize`).
10494+
// We only use this to ensure that if we need to reserve a callee-saved register,
10495+
// it will be reserved. For ARM32, only R12 and LR are non-callee-saved, non-argument
10496+
// registers, so we save at least one more callee-saved register. For ARM64, however,
10497+
// we already know we have at least three non-callee-saved, non-argument integer registers,
10498+
// so we don't need to save any more.
1049410499

1049510500
#ifdef _TARGET_ARM_
10496-
#define VERY_LARGE_FRAME_SIZE_REG_MASK (RBM_R4 | RBM_R5 | RBM_R6)
10497-
#elif defined(_TARGET_ARM64_)
10498-
#define VERY_LARGE_FRAME_SIZE_REG_MASK (RBM_R9 | RBM_R10 | RBM_R11)
10501+
#define VERY_LARGE_FRAME_SIZE_REG_MASK (RBM_R4)
1049910502
#endif
1050010503

1050110504
/*****************************************************************************/

src/jit/lclvars.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4553,7 +4553,6 @@ unsigned Compiler::lvaGetMaxSpillTempSize()
45534553
*
45544554
*
45554555
* The frame is laid out as follows for ARM64 (this is a general picture; details may differ for different conditions):
4556-
* TODO-ARM64-NYI: this is preliminary (copied from ARM and modified), and needs to be reviewed.
45574556
* NOTE: SP must be 16-byte aligned, so there may be alignment slots in the frame.
45584557
* We will often save and establish a frame pointer to create better ETW stack walks.
45594558
*

0 commit comments

Comments
 (0)