Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit e8bdf3e

Browse files
Merge pull request #6276 from BruceForstall/LocAllocCleanup
Clean up localloc implementation
2 parents 59b0355 + 7a516a2 commit e8bdf3e

File tree

3 files changed

+65
-50
lines changed

3 files changed

+65
-50
lines changed

src/jit/codegenxarch.cpp

Lines changed: 58 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -3046,9 +3046,22 @@ CodeGen::genMultiRegCallStoreToLocal(GenTreePtr treeNode)
30463046
}
30473047

30483048

3049-
/***********************************************************************************************
3050-
* Generate code for localloc
3051-
*/
3049+
//------------------------------------------------------------------------
3050+
// genLclHeap: Generate code for localloc.
3051+
//
3052+
// Arguments:
3053+
// tree - the localloc tree to generate.
3054+
//
3055+
// Notes:
3056+
// Note that for x86, we don't track ESP movements while generating the localloc code.
3057+
// The ESP tracking is used to report stack pointer-relative GC info, which is not
3058+
// interesting while doing the localloc construction. Also, for functions with localloc,
3059+
// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function
3060+
// call arguments. We store the ESP after the localloc is complete in the LocAllocSP
3061+
// variable. This variable is implicitly reported to the VM in the GC info (its position
3062+
// is defined by convention relative to other items), and is used by the GC to find the
3063+
// "base" stack pointer in functions with localloc.
3064+
//
30523065
void
30533066
CodeGen::genLclHeap(GenTreePtr tree)
30543067
{
@@ -3106,7 +3119,9 @@ CodeGen::genLclHeap(GenTreePtr tree)
31063119
}
31073120
else
31083121
{
3109-
// If 0 bail out by returning null in targetReg
3122+
// The localloc requested memory size is non-constant.
3123+
3124+
// Put the size value in targetReg. If it is zero, bail out by returning null in targetReg.
31103125
genConsumeRegAndCopy(size, targetReg);
31113126
endLabel = genCreateTempLabel();
31123127
getEmitter()->emitIns_R_R(INS_test, easz, targetReg, targetReg);
@@ -3127,33 +3142,40 @@ CodeGen::genLclHeap(GenTreePtr tree)
31273142
tmpRegsMask &= ~regCntMask;
31283143
regCnt = genRegNumFromMask(regCntMask);
31293144
if (regCnt != targetReg)
3145+
{
3146+
// Above, we put the size in targetReg. Now, copy it to our new temp register if necessary.
31303147
inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet());
3148+
}
31313149
}
31323150

3133-
// Align to STACK_ALIGN
3134-
// regCnt will be the total number of bytes to localloc
3135-
inst_RV_IV(INS_add, regCnt, (STACK_ALIGN - 1), emitActualTypeSize(type));
3151+
// Round up the number of bytes to allocate to a STACK_ALIGN boundary. This is done
3152+
// by code like:
3153+
// add reg, 15
3154+
// and reg, -16
3155+
// However, in the initialized memory case, we need the count of STACK_ALIGN-sized
3156+
// elements, not a byte count, after the alignment. So instead of the "and", which
3157+
// becomes unnecessary, generate a shift, e.g.:
3158+
// add reg, 15
3159+
// shr reg, 4
3160+
3161+
inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type));
31363162

3137-
#if defined(_TARGET_X86_)
3138-
// TODO-Cleanup: change amd64 to use the same code path as x86 to reduce #ifdefs
3139-
// and improve amd64 CQ (use a dec loop instead of sub rsp loop).
31403163
if (compiler->info.compInitMem)
31413164
{
3142-
// Convert the count from a count of bytes to a count of pointer-sized words.
3143-
// We don't need the 'and' because we'll shift off those bits anyway. That's
3144-
// asserted by the following.
3145-
C_ASSERT((STACK_ALIGN >> STACK_ALIGN_SHIFT) <= 1);
3165+
// Convert the count from a count of bytes to a loop count. We will loop once per
3166+
// stack alignment size, so each loop will zero 4 bytes on x86 and 16 bytes on x64.
3167+
// Note that we zero a single reg-size word per iteration on x86, and 2 reg-size
3168+
// words per iteration on x64. We will shift off all the stack alignment bits
3169+
// added above, so there is no need for an 'and' instruction.
31463170

3147-
// --- shr regCnt, 2 ---
3148-
inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT);
3171+
// --- shr regCnt, 2 (or 4) ---
3172+
inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT_ALL);
31493173
}
31503174
else
31513175
{
3176+
// Otherwise, mask off the low bits to align the byte count.
31523177
inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
31533178
}
3154-
#else // !_TARGET_X86_
3155-
inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
3156-
#endif // !_TARGET_X86_
31573179
}
31583180

31593181
#if FEATURE_FIXED_OUT_ARGS
@@ -3179,18 +3201,17 @@ CodeGen::genLclHeap(GenTreePtr tree)
31793201
{
31803202
// We should reach here only for non-zero, constant size allocations.
31813203
assert(amount > 0);
3204+
assert((amount % STACK_ALIGN) == 0);
3205+
assert((amount % REGSIZE_BYTES) == 0);
31823206

31833207
// For small allocations we will generate up to six push 0 inline
3184-
size_t cntPtrSizedWords = (amount >> STACK_ALIGN_SHIFT);
3185-
if (cntPtrSizedWords <= 6)
3208+
size_t cntRegSizedWords = amount / REGSIZE_BYTES;
3209+
if (cntRegSizedWords <= 6)
31863210
{
3187-
while (cntPtrSizedWords != 0)
3211+
for (; cntRegSizedWords != 0; cntRegSizedWords--)
31883212
{
3189-
// push_hide means don't track the stack
3190-
inst_IV(INS_push_hide, 0);
3191-
cntPtrSizedWords--;
3213+
inst_IV(INS_push_hide, 0); // push_hide means don't track the stack
31923214
}
3193-
31943215
goto ALLOC_DONE;
31953216
}
31963217

@@ -3246,51 +3267,42 @@ CodeGen::genLclHeap(GenTreePtr tree)
32463267

32473268
// else, "mov regCnt, amount"
32483269

3249-
#if defined(_TARGET_X86_)
32503270
if (compiler->info.compInitMem)
32513271
{
3252-
// For x86, when initializing memory with a constant count, we want 'amount' to be the
3253-
// count of pointer-sized words, not bytes.
3254-
amount = cntPtrSizedWords;
3272+
// When initializing memory, we want 'amount' to be the loop count.
3273+
assert((amount % STACK_ALIGN) == 0);
3274+
amount /= STACK_ALIGN;
32553275
}
3256-
#endif // _TARGET_X86_
32573276

32583277
genSetRegToIcon(regCnt, amount, ((int)amount == amount)? TYP_INT : TYP_LONG);
32593278
}
32603279

32613280
loop = genCreateTempLabel();
32623281
if (compiler->info.compInitMem)
32633282
{
3264-
// At this point 'regCnt' is set to the total number of bytes (or words, for constant counts) to locAlloc.
3283+
// At this point 'regCnt' is set to the number of loop iterations for this loop, if each
3284+
// iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes.
32653285
// Since we have to zero out the allocated memory AND ensure that RSP is always valid
32663286
// by tickling the pages, we will just push 0's on the stack.
3267-
//
3268-
// Note: regCnt is guaranteed to be even on Amd64 since STACK_ALIGN/TARGET_POINTER_SIZE = 2
3269-
// and localloc size is a multiple of STACK_ALIGN.
32703287

32713288
assert(genIsValidIntReg(regCnt));
32723289

32733290
// Loop:
32743291
genDefineTempLabel(loop);
32753292

32763293
#if defined(_TARGET_AMD64_)
3277-
// dec is a 2 byte instruction, but sub is 4 (could be 3 if
3278-
// we know size is TYP_INT instead of TYP_I_IMPL)
3279-
// Also we know that we can only push 8 bytes at a time, but
3280-
// alignment is 16 bytes, so we can push twice and do a sub
3281-
// for just a little bit of loop unrolling
3294+
// Push two 8-byte zeros. This matches the 16-byte STACK_ALIGN value.
3295+
static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
32823296
inst_IV(INS_push_hide, 0); // --- push 8-byte 0
32833297
inst_IV(INS_push_hide, 0); // --- push 8-byte 0
3284-
3285-
// Note that regCnt is the number of bytes to stack allocate.
3286-
// Therefore we need to subtract 16 from regcnt here.
3287-
inst_RV_IV(INS_sub, regCnt, 16, emitActualTypeSize(type));
32883298
#elif defined(_TARGET_X86_)
3299+
// Push a single 4-byte zero. This matches the 4-byte STACK_ALIGN value.
3300+
static_assert_no_msg(STACK_ALIGN == REGSIZE_BYTES);
32893301
inst_IV(INS_push_hide, 0); // --- push 4-byte 0
3290-
inst_RV(INS_dec, regCnt, TYP_I_IMPL);
32913302
#endif // _TARGET_X86_
32923303

3293-
// If not done, loop
3304+
// Decrement the loop counter and loop if not done.
3305+
inst_RV(INS_dec, regCnt, TYP_I_IMPL);
32943306
inst_JMP(EJ_jne, loop);
32953307
}
32963308
else

src/jit/lowerxarch.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1901,8 +1901,8 @@ Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
19011901
//
19021902
// Size? Init Memory? # temp regs
19031903
// 0 - 0
1904-
// const and <=6 ptr words - 0
1905-
// const and >6 ptr words Yes 0
1904+
// const and <=6 reg words - 0
1905+
// const and >6 reg words Yes 0
19061906
// const and <PageSize No 0 (amd64) 1 (x86)
19071907
// const and >=PageSize No 2
19081908
// Non-const Yes 0
@@ -1925,11 +1925,12 @@ Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
19251925
// Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
19261926
// This should also help in debugging as we can examine the original size specified with localloc.
19271927
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1928-
size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT);
19291928

19301929
// For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
19311930
// we will generate 'push 0'.
1932-
if (cntStackAlignedWidthItems <= 6)
1931+
assert((sizeVal % REGSIZE_BYTES) == 0);
1932+
size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
1933+
if (cntRegSizedWords <= 6)
19331934
{
19341935
info->internalIntCount = 0;
19351936
}

src/jit/target.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,7 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
485485
#define CODE_ALIGN 1 // code alignment requirement
486486
#define STACK_ALIGN 4 // stack alignment requirement
487487
#define STACK_ALIGN_SHIFT 2 // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs
488+
#define STACK_ALIGN_SHIFT_ALL 2 // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units
488489

489490
#define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI)
490491
#define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX)
@@ -780,6 +781,7 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
780781
#define CODE_ALIGN 1 // code alignment requirement
781782
#define STACK_ALIGN 16 // stack alignment requirement
782783
#define STACK_ALIGN_SHIFT 3 // Shift-right amount to convert stack size in bytes to size in pointer sized words
784+
#define STACK_ALIGN_SHIFT_ALL 4 // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units
783785

784786
#if ETW_EBP_FRAMED
785787
#define RBM_ETW_FRAMED_EBP RBM_NONE

0 commit comments

Comments
 (0)