Skip to content

Commit ffb52e9

Browse files
EgorBo and jakobbotsch authored
Optimize stackalloc zeroing on arm64 via STORE_BLK (#121986)
Enable X64's optimization where we clear LCLHEAP via STORE_BLK inserted in Lower on arm64.

```cs
static void Test128() => Consume(stackalloc char[128]);
```

was:

```asm
stp xzr, xzr, [sp, #-0x10]!
stp xzr, xzr, [sp, #-0xF0]!
stp xzr, xzr, [sp, #0x10]
stp xzr, xzr, [sp, #0x20]
stp xzr, xzr, [sp, #0x30]
stp xzr, xzr, [sp, #0x40]
stp xzr, xzr, [sp, #0x50]
stp xzr, xzr, [sp, #0x60]
stp xzr, xzr, [sp, #0x70]
stp xzr, xzr, [sp, #0x80]
stp xzr, xzr, [sp, #0x90]
stp xzr, xzr, [sp, #0xA0]
stp xzr, xzr, [sp, #0xB0]
stp xzr, xzr, [sp, #0xC0]
stp xzr, xzr, [sp, #0xD0]
stp xzr, xzr, [sp, #0xE0]
```

now:

```asm
movi v16.16b, #0
stp q16, q16, [x0]
stp q16, q16, [x0, #0x20]
stp q16, q16, [x0, #0x40]
stp q16, q16, [x0, #0x60]
stp q16, q16, [x0, #0x80]
stp q16, q16, [x0, #0xA0]
stp q16, q16, [x0, #0xC0]
stp q16, q16, [x0, #0xE0]
```

Also, for larger sizes the previous logic used to emit a slow loop (e.g. 1024 bytes):

```asm
mov w0, #0x400
G_M30953_IG03:
stp xzr, xzr, [sp, #-0x10]!
subs x0, x0, #16
bne G_M30953_IG03
```

Now it will emit a call to `CORINFO_HELP_MEMZERO`.

[Benchmarks.](EgorBot/runtime-utils#553)

```cs
using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;

public class Benchmarks
{
    [Benchmark] public void Stackalloc64() => Consume(stackalloc byte[64]);
    [Benchmark] public void Stackalloc128() => Consume(stackalloc byte[128]);
    [Benchmark] public void Stackalloc256() => Consume(stackalloc byte[256]);
    [Benchmark] public void Stackalloc512() => Consume(stackalloc byte[512]);
    [Benchmark] public void Stackalloc1024() => Consume(stackalloc byte[1024]);
    [Benchmark] public void Stackalloc16384() => Consume(stackalloc byte[16384]);

    [MethodImpl(MethodImplOptions.NoInlining)]
    static void Consume(Span<byte> x){}
}
```

| Method          | Toolchain | Mean       | Error     | Ratio |
|---------------- |---------- |-----------:|----------:|------:|
| Stackalloc64    | Main      |   3.425 ns | 0.0004 ns |  1.00 |
| Stackalloc64    | PR        |   2.559 ns | 0.0008 ns |  0.75 |
|                 |           |            |           |       |
| Stackalloc128   | Main      |   3.999 ns | 0.0002 ns |  1.00 |
| Stackalloc128   | PR        |   2.404 ns | 0.0003 ns |  0.60 |
|                 |           |            |           |       |
| Stackalloc256   | Main      |   5.431 ns | 0.0005 ns |  1.00 |
| Stackalloc256   | PR        |   2.754 ns | 0.0003 ns |  0.51 |
|                 |           |            |           |       |
| Stackalloc512   | Main      |  12.661 ns | 0.2744 ns |  1.00 |
| Stackalloc512   | PR        |   7.423 ns | 0.0008 ns |  0.59 |
|                 |           |            |           |       |
| Stackalloc1024  | Main      |  24.958 ns | 0.5326 ns |  1.00 |
| Stackalloc1024  | PR        |  14.031 ns | 0.0040 ns |  0.56 |
|                 |           |            |           |       |
| Stackalloc16384 | Main      | 374.899 ns | 0.0130 ns |  1.00 |
| Stackalloc16384 | PR        | 111.029 ns | 1.2123 ns |  0.30 |

---------

Co-authored-by: Jakob Botsch Nielsen <[email protected]>
1 parent 80d3434 commit ffb52e9

File tree

3 files changed

+21
-82
lines changed

3 files changed

+21
-82
lines changed

src/coreclr/jit/codegenarm64.cpp

Lines changed: 11 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3018,7 +3018,6 @@ void CodeGen::genLclHeap(GenTree* tree)
30183018
var_types type = genActualType(size->gtType);
30193019
emitAttr easz = emitTypeSize(type);
30203020
BasicBlock* endLabel = nullptr;
3021-
BasicBlock* loop = nullptr;
30223021
unsigned stackAdjustment = 0;
30233022
const target_ssize_t ILLEGAL_LAST_TOUCH_DELTA = (target_ssize_t)-1;
30243023
target_ssize_t lastTouchDelta =
@@ -3027,12 +3026,15 @@ void CodeGen::genLclHeap(GenTree* tree)
30273026
noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes
30283027
noway_assert(genStackLevel == 0); // Can't have anything on the stack
30293028

3029+
bool needsZeroing = compiler->info.compInitMem;
3030+
30303031
// compute the amount of memory to allocate to properly STACK_ALIGN.
30313032
size_t amount = 0;
3032-
if (size->IsCnsIntOrI())
3033+
if (size->isContainedIntOrIImmed())
30333034
{
3034-
// If size is a constant, then it must be contained.
3035-
assert(size->isContained());
3035+
// The size node being a contained constant means that Lower has taken care of
3036+
// zeroing the memory if compInitMem is true.
3037+
needsZeroing = false;
30363038

30373039
// If amount is zero then return null in targetReg
30383040
amount = size->AsIntCon()->gtIconVal;
@@ -3056,7 +3058,7 @@ void CodeGen::genLclHeap(GenTree* tree)
30563058
// Compute the size of the block to allocate and perform alignment.
30573059
// If compInitMem=true, we can reuse targetReg as regcnt,
30583060
// since we don't need any internal registers.
3059-
if (compiler->info.compInitMem)
3061+
if (needsZeroing)
30603062
{
30613063
assert(internalRegisters.Count(tree) == 0);
30623064
regCnt = targetReg;
@@ -3093,7 +3095,7 @@ void CodeGen::genLclHeap(GenTree* tree)
30933095
stackAdjustment += compiler->lvaOutgoingArgSpaceSize;
30943096
}
30953097

3096-
if (size->IsCnsIntOrI())
3098+
if (size->isContainedIntOrIImmed())
30973099
{
30983100
// We should reach here only for non-zero, constant size allocations.
30993101
assert(amount > 0);
@@ -3104,39 +3106,7 @@ void CodeGen::genLclHeap(GenTree* tree)
31043106
static_assert(STACK_ALIGN == storePairRegsWritesBytes);
31053107
assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time
31063108

3107-
if (compiler->info.compInitMem)
3108-
{
3109-
if (amount <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
3110-
{
3111-
// The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
3112-
// stp xzr, xzr, [sp, #-16]!
3113-
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -storePairRegsWritesBytes,
3114-
INS_OPTS_PRE_INDEX);
3115-
3116-
if (amount > storePairRegsWritesBytes)
3117-
{
3118-
// The following sets SP to its final value and zeroes the first 16 bytes of the allocated space.
3119-
// stp xzr, xzr, [sp, #-amount+16]!
3120-
const ssize_t finalSpDelta = (ssize_t)amount - storePairRegsWritesBytes;
3121-
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -finalSpDelta,
3122-
INS_OPTS_PRE_INDEX);
3123-
3124-
// The following zeroes the remaining space in [finalSp+16, initialSp-16) interval
3125-
// using a sequence of stp instruction with unsigned offset.
3126-
for (ssize_t offset = storePairRegsWritesBytes; offset < finalSpDelta;
3127-
offset += storePairRegsWritesBytes)
3128-
{
3129-
// stp xzr, xzr, [sp, #offset]
3130-
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, offset);
3131-
}
3132-
}
3133-
3134-
lastTouchDelta = 0;
3135-
3136-
goto ALLOC_DONE;
3137-
}
3138-
}
3139-
else if (amount < compiler->eeGetPageSize()) // must be < not <=
3109+
if (amount < compiler->eeGetPageSize()) // must be < not <=
31403110
{
31413111
// Since the size is less than a page, simply adjust the SP value.
31423112
// The SP might already be in the guard page, so we must touch it BEFORE
@@ -3178,7 +3148,7 @@ void CodeGen::genLclHeap(GenTree* tree)
31783148
// If compInitMem=true, we can reuse targetReg as regcnt.
31793149
// Since size is a constant, regCnt is not yet initialized.
31803150
assert(regCnt == REG_NA);
3181-
if (compiler->info.compInitMem)
3151+
if (needsZeroing)
31823152
{
31833153
assert(internalRegisters.Count(tree) == 0);
31843154
regCnt = targetReg;
@@ -3190,7 +3160,7 @@ void CodeGen::genLclHeap(GenTree* tree)
31903160
instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount);
31913161
}
31923162

3193-
if (compiler->info.compInitMem)
3163+
if (needsZeroing)
31943164
{
31953165
BasicBlock* loop = genCreateTempLabel();
31963166

src/coreclr/jit/lower.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11418,7 +11418,7 @@ void Lowering::LowerLclHeap(GenTree* node)
1141811418
{
1141911419
assert(node->OperIs(GT_LCLHEAP));
1142011420

11421-
#if defined(TARGET_XARCH)
11421+
#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
1142211422
if (node->gtGetOp1()->IsCnsIntOrI())
1142311423
{
1142411424
GenTreeIntCon* sizeNode = node->gtGetOp1()->AsIntCon();

src/coreclr/jit/lsraarm64.cpp

Lines changed: 9 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1129,52 +1129,21 @@ int LinearScan::BuildNode(GenTree* tree)
11291129
{
11301130
assert(dstCount == 1);
11311131

1132-
// Need a variable number of temp regs (see genLclHeap() in codegenarm64.cpp):
1133-
// Here '-' means don't care.
1134-
//
1135-
// Size? Init Memory? # temp regs
1136-
// 0 - 0
1137-
// const and <=UnrollLimit - 0
1138-
// const and <PageSize No 0
1139-
// >UnrollLimit Yes 0
1140-
// Non-const Yes 0
1141-
// Non-const No 2
1142-
//
1143-
11441132
GenTree* size = tree->gtGetOp1();
1145-
if (size->IsCnsIntOrI())
1133+
if (size->isContainedIntOrIImmed())
11461134
{
1147-
assert(size->isContained());
11481135
srcCount = 0;
11491136

1137+
// We won't have to clear the memory regardless info.compInitMem is set or not
1138+
// (if it's set, Lower will emit a STORE_BLK for this LCLHEAP), but we still need to
1139+
// do stack-probing for large allocations.
1140+
//
11501141
size_t sizeVal = size->AsIntCon()->gtIconVal;
1151-
1152-
if (sizeVal != 0)
1142+
if (AlignUp(sizeVal, STACK_ALIGN) >= compiler->eeGetPageSize())
11531143
{
1154-
// Compute the amount of memory to properly STACK_ALIGN.
1155-
// Note: The GenTree node is not updated here as it is cheap to recompute stack aligned size.
1156-
// This should also help in debugging as we can examine the original size specified with
1157-
// localloc.
1158-
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1159-
1160-
if (sizeVal <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
1161-
{
1162-
// Need no internal registers
1163-
}
1164-
else if (!compiler->info.compInitMem)
1165-
{
1166-
// No need to initialize allocated stack space.
1167-
if (sizeVal < compiler->eeGetPageSize())
1168-
{
1169-
// Need no internal registers
1170-
}
1171-
else
1172-
{
1173-
// We need two registers: regCnt and RegTmp
1174-
buildInternalIntRegisterDefForNode(tree);
1175-
buildInternalIntRegisterDefForNode(tree);
1176-
}
1177-
}
1144+
// We need two registers: regCnt and RegTmp
1145+
buildInternalIntRegisterDefForNode(tree);
1146+
buildInternalIntRegisterDefForNode(tree);
11781147
}
11791148
}
11801149
else

0 commit comments

Comments
 (0)