@@ -3046,9 +3046,22 @@ CodeGen::genMultiRegCallStoreToLocal(GenTreePtr treeNode)
3046
3046
}
3047
3047
3048
3048
3049
- /* **********************************************************************************************
3050
- * Generate code for localloc
3051
- */
3049
+ // ------------------------------------------------------------------------
3050
+ // genLclHeap: Generate code for localloc.
3051
+ //
3052
+ // Arguments:
3053
+ // tree - the localloc tree to generate.
3054
+ //
3055
+ // Notes:
3056
+ // On x86, we don't track ESP movements while generating the localloc code.
3057
+ // The ESP tracking is used to report stack pointer-relative GC info, which is not
3058
+ // interesting while doing the localloc construction. Also, for functions with localloc,
3059
+ // we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function
3060
+ // call arguments. We store the ESP after the localloc is complete in the LocAllocSP
3061
+ // variable. This variable is implicitly reported to the VM in the GC info (its position
3062
+ // is defined by convention relative to other items), and is used by the GC to find the
3063
+ // "base" stack pointer in functions with localloc.
3064
+ //
3052
3065
void
3053
3066
CodeGen::genLclHeap (GenTreePtr tree)
3054
3067
{
@@ -3106,7 +3119,9 @@ CodeGen::genLclHeap(GenTreePtr tree)
3106
3119
}
3107
3120
else
3108
3121
{
3109
- // If 0 bail out by returning null in targetReg
3122
+ // The localloc requested memory size is non-constant.
3123
+
3124
+ // Put the size value in targetReg. If it is zero, bail out by returning null in targetReg.
3110
3125
genConsumeRegAndCopy (size, targetReg);
3111
3126
endLabel = genCreateTempLabel ();
3112
3127
getEmitter ()->emitIns_R_R (INS_test, easz, targetReg, targetReg);
@@ -3127,33 +3142,40 @@ CodeGen::genLclHeap(GenTreePtr tree)
3127
3142
tmpRegsMask &= ~regCntMask;
3128
3143
regCnt = genRegNumFromMask (regCntMask);
3129
3144
if (regCnt != targetReg)
3145
+ {
3146
+ // Above, we put the size in targetReg. Now, copy it to our new temp register if necessary.
3130
3147
inst_RV_RV (INS_mov, regCnt, targetReg, size->TypeGet ());
3148
+ }
3131
3149
}
3132
3150
3133
- // Align to STACK_ALIGN
3134
- // regCnt will be the total number of bytes to localloc
3135
- inst_RV_IV (INS_add, regCnt, (STACK_ALIGN - 1 ), emitActualTypeSize (type));
3151
+ // Round up the number of bytes to allocate to a STACK_ALIGN boundary. This is done
3152
+ // by code like:
3153
+ // add reg, 15
3154
+ // and reg, -16
3155
+ // However, in the initialized memory case, we need the count of STACK_ALIGN-sized
3156
+ // elements, not a byte count, after the alignment. So instead of the "and", which
3157
+ // becomes unnecessary, generate a shift, e.g.:
3158
+ // add reg, 15
3159
+ // shr reg, 4
3160
+
3161
+ inst_RV_IV (INS_add, regCnt, STACK_ALIGN - 1 , emitActualTypeSize (type));
3136
3162
3137
- #if defined(_TARGET_X86_)
3138
- // TODO-Cleanup: change amd64 to use the same code path as x86 to reduce #ifdefs
3139
- // and improve amd64 CQ (use a dec loop instead of sub rsp loop).
3140
3163
if (compiler->info .compInitMem )
3141
3164
{
3142
- // Convert the count from a count of bytes to a count of pointer-sized words.
3143
- // We don't need the 'and' because we'll shift off those bits anyway. That's
3144
- // asserted by the following.
3145
- C_ASSERT ((STACK_ALIGN >> STACK_ALIGN_SHIFT) <= 1 );
3165
+ // Convert the count from a count of bytes to a loop count. We will loop once per
3166
+ // stack alignment size, so each iteration will zero 4 bytes on x86 and 16 bytes on x64.
3167
+ // Note that we zero a single reg-size word per iteration on x86, and 2 reg-size
3168
+ // words per iteration on x64. We will shift off all the stack alignment bits
3169
+ // added above, so there is no need for an 'and' instruction.
3146
3170
3147
- // --- shr regCnt, 2 ---
3148
- inst_RV_SH (INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT );
3171
+ // --- shr regCnt, 2 (or 4) ---
3172
+ inst_RV_SH (INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT_ALL );
3149
3173
}
3150
3174
else
3151
3175
{
3176
+ // Otherwise, mask off the low bits to align the byte count.
3152
3177
inst_RV_IV (INS_AND, regCnt, ~(STACK_ALIGN - 1 ), emitActualTypeSize (type));
3153
3178
}
3154
- #else // !_TARGET_X86_
3155
- inst_RV_IV (INS_AND, regCnt, ~(STACK_ALIGN - 1 ), emitActualTypeSize (type));
3156
- #endif // !_TARGET_X86_
3157
3179
}
3158
3180
3159
3181
#if FEATURE_FIXED_OUT_ARGS
@@ -3179,18 +3201,17 @@ CodeGen::genLclHeap(GenTreePtr tree)
3179
3201
{
3180
3202
// We should reach here only for non-zero, constant size allocations.
3181
3203
assert (amount > 0 );
3204
+ assert ((amount % STACK_ALIGN) == 0 );
3205
+ assert ((amount % REGSIZE_BYTES) == 0 );
3182
3206
3183
3207
// For small allocations we will generate up to six push 0 inline
3184
- size_t cntPtrSizedWords = ( amount >> STACK_ALIGN_SHIFT) ;
3185
- if (cntPtrSizedWords <= 6 )
3208
+ size_t cntRegSizedWords = amount / REGSIZE_BYTES ;
3209
+ if (cntRegSizedWords <= 6 )
3186
3210
{
3187
- while (cntPtrSizedWords != 0 )
3211
+ for (; cntRegSizedWords != 0 ; cntRegSizedWords-- )
3188
3212
{
3189
- // push_hide means don't track the stack
3190
- inst_IV (INS_push_hide, 0 );
3191
- cntPtrSizedWords--;
3213
+ inst_IV (INS_push_hide, 0 ); // push_hide means don't track the stack
3192
3214
}
3193
-
3194
3215
goto ALLOC_DONE;
3195
3216
}
3196
3217
@@ -3246,51 +3267,42 @@ CodeGen::genLclHeap(GenTreePtr tree)
3246
3267
3247
3268
// else, "mov regCnt, amount"
3248
3269
3249
- #if defined(_TARGET_X86_)
3250
3270
if (compiler->info .compInitMem )
3251
3271
{
3252
- // For x86, when initializing memory with a constant count , we want 'amount' to be the
3253
- // count of pointer-sized words, not bytes.
3254
- amount = cntPtrSizedWords ;
3272
+ // When initializing memory, we want 'amount' to be the loop count.
3273
+ assert ((amount % STACK_ALIGN) == 0 );
3274
+ amount /= STACK_ALIGN ;
3255
3275
}
3256
- #endif // _TARGET_X86_
3257
3276
3258
3277
genSetRegToIcon (regCnt, amount, ((int )amount == amount)? TYP_INT : TYP_LONG);
3259
3278
}
3260
3279
3261
3280
loop = genCreateTempLabel ();
3262
3281
if (compiler->info .compInitMem )
3263
3282
{
3264
- // At this point 'regCnt' is set to the total number of bytes (or words, for constant counts) to locAlloc.
3283
+ // At this point 'regCnt' is set to the number of iterations for this loop, where each
3284
+ // iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes.
3265
3285
// Since we have to zero out the allocated memory AND ensure that RSP is always valid
3266
3286
// by tickling the pages, we will just push 0's on the stack.
3267
- //
3268
- // Note: regCnt is guaranteed to be even on Amd64 since STACK_ALIGN/TARGET_POINTER_SIZE = 2
3269
- // and localloc size is a multiple of STACK_ALIGN.
3270
3287
3271
3288
assert (genIsValidIntReg (regCnt));
3272
3289
3273
3290
// Loop:
3274
3291
genDefineTempLabel (loop);
3275
3292
3276
3293
#if defined(_TARGET_AMD64_)
3277
- // dec is a 2 byte instruction, but sub is 4 (could be 3 if
3278
- // we know size is TYP_INT instead of TYP_I_IMPL)
3279
- // Also we know that we can only push 8 bytes at a time, but
3280
- // alignment is 16 bytes, so we can push twice and do a sub
3281
- // for just a little bit of loop unrolling
3294
+ // Push two 8-byte zeros. This matches the 16-byte STACK_ALIGN value.
3295
+ static_assert_no_msg (STACK_ALIGN == (REGSIZE_BYTES * 2 ));
3282
3296
inst_IV (INS_push_hide, 0 ); // --- push 8-byte 0
3283
3297
inst_IV (INS_push_hide, 0 ); // --- push 8-byte 0
3284
-
3285
- // Note that regCnt is the number of bytes to stack allocate.
3286
- // Therefore we need to subtract 16 from regcnt here.
3287
- inst_RV_IV (INS_sub, regCnt, 16 , emitActualTypeSize (type));
3288
3298
#elif defined(_TARGET_X86_)
3299
+ // Push a single 4-byte zero. This matches the 4-byte STACK_ALIGN value.
3300
+ static_assert_no_msg (STACK_ALIGN == REGSIZE_BYTES);
3289
3301
inst_IV (INS_push_hide, 0 ); // --- push 4-byte 0
3290
- inst_RV (INS_dec, regCnt, TYP_I_IMPL);
3291
3302
#endif // _TARGET_X86_
3292
3303
3293
- // If not done, loop
3304
+ // Decrement the loop counter and loop if not done.
3305
+ inst_RV (INS_dec, regCnt, TYP_I_IMPL);
3294
3306
inst_JMP (EJ_jne, loop);
3295
3307
}
3296
3308
else
0 commit comments