@@ -11253,29 +11253,45 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1125311253 // movups xmmword ptr [ebp/esp-OFFS], xmm4
1125411254 // ...
1125511255 // movups xmmword ptr [ebp/esp-OFFS], xmm4
11256- // mov qword ptr [ebp/esp-OFFS], rax
11257- //
11256+
1125811257 // NOTE: it implicitly zeroes YMM4 and ZMM4 as well.
1125911258 emit->emitIns_SIMD_R_R_R (INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg, zeroSIMDReg, INS_OPTS_NONE);
1126011259
11261- int i = 0 ;
11262- if (maxSimdSize > XMM_REGSIZE_BYTES)
11260+ assert ((blkSize % XMM_REGSIZE_BYTES) == 0 );
11261+
11262+ int regSize = (int )compiler->roundDownSIMDSize (blkSize);
11263+ int lenRemaining = blkSize;
11264+ while (lenRemaining > 0 )
1126311265 {
11264- for (; i <= blkSize - maxSimdSize; i += maxSimdSize)
11266+ // Overlap with the previously zeroed memory if we can clear the remainder
11267+ // with just a single store. Example: say we have 112 bytes to clear:
11268+ //
11269+ // Option 1 (no overlapping):
11270+ // movups zmmword ptr [+0]
11271+ // movups ymmword ptr [+64]
11272+ // movups xmmword ptr [+96]
11273+ //
11274+ // Option 2 (overlapping):
11275+ // movups zmmword ptr [+0]
11276+ // movups zmmword ptr [+48]
11277+ //
11278+ if ((regSize > lenRemaining) && !isPow2 (lenRemaining))
1126511279 {
11266- // We previously aligned data to 16 bytes which might not be aligned to maxSimdSize
11267- emit->emitIns_AR_R (simdUnalignedMovIns (), EA_ATTR (maxSimdSize), zeroSIMDReg, frameReg,
11268- alignedLclLo + i);
11280+ lenRemaining = regSize;
1126911281 }
11270- // Remainder will be handled by the xmm loop below
11271- }
1127211282
11273- for (; i < blkSize; i += XMM_REGSIZE_BYTES)
11274- {
11275- emit->emitIns_AR_R (simdMov, EA_ATTR (XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, alignedLclLo + i);
11276- }
11283+ // Use the largest SIMD register size that fits in the remaining length
11284+ regSize = (int )compiler->roundDownSIMDSize (lenRemaining);
11285+ assert (regSize >= XMM_REGSIZE_BYTES);
11286+
11287+ // frameReg is definitely not known to be 32B/64B aligned -> switch to unaligned movs
11288+ instruction ins = regSize > XMM_REGSIZE_BYTES ? simdUnalignedMovIns () : simdMov;
11289+ const int offset = blkSize - lenRemaining;
11290+ emit->emitIns_AR_R (ins, EA_ATTR (regSize), zeroSIMDReg, frameReg, alignedLclLo + offset);
1127711291
11278- assert (i == blkSize);
11292+ lenRemaining -= regSize;
11293+ }
11294+ assert (lenRemaining == 0 );
1127911295 }
1128011296 else
1128111297 {
0 commit comments