Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit e47ea2c

Browse files
author
Koundinya Veluri
committed
Add allocation fast path for arrays of value type elements outside Windows
- A microbenchmark involving byte array allocation was about 200% faster on Windows than on Linux.
- On Windows, using the portable version of the fast path is about 5% slower on the microbenchmark than using the asm version.
- On Linux, using the portable fast path improves the microbenchmark's performance by 160%.
- With the fast path enabled on Linux, the microbenchmark on Windows (with the asm fast path) is now about 17% faster than on Linux.
1 parent f7461fe commit e47ea2c

File tree

6 files changed

+108
-10
lines changed

6 files changed

+108
-10
lines changed

src/gc/gc.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -454,10 +454,11 @@ class GCHeap {
454454
// SIMPLIFY: only use allocation contexts
455455
return true;
456456
#else
457-
#ifdef _TARGET_ARM_
458-
return TRUE;
459-
#endif
457+
#if defined(_TARGET_ARM_) || defined(FEATURE_PAL)
458+
return true;
459+
#else
460460
return ((IsServerHeap() ? true : (g_SystemInfo.dwNumberOfProcessors >= 2)));
461+
#endif
461462
#endif
462463
}
463464

src/inc/sbuffer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
(( (size^(size-1)) >> 1) +1)
5151

5252
#define ALIGN(size, align) \
53-
(((size)+(align)-1) & ~((align)-1))
53+
(((size)+((align)-1)) & ~((align)-1))
5454

5555
#define PAD(size, align) \
5656
(ALIGN((size), (align)) - (size))

src/vm/jithelpers.cpp

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3021,6 +3021,93 @@ HCIMPL2(Object *, JIT_StrCns, unsigned rid, CORINFO_MODULE_HANDLE scopeHnd)
30213021
HCIMPLEND
30223022

30233023

3024+
//========================================================================
3025+
//
3026+
// ARRAY FAST PATHS
3027+
//
3028+
//========================================================================
3029+
3030+
#include <optsmallperfcritical.h>
3031+
3032+
//*************************************************************
3033+
// Array allocation fast path for arrays of value type elements
3034+
//
3035+
3036+
// Portable (C++) fast-path JIT helper that allocates an array of value-type
// elements directly from the current thread's allocation context.
//
// Parameters:
//   arrayTypeHnd_ - class handle of the array type to instantiate
//   size          - requested number of elements
//
// Returns the newly initialized array object when the request fits in the
// space remaining in the thread's allocation context; otherwise returns
// whatever the slow helper JIT_NewArr1 returns for the same arguments.
HCIMPL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size)
3037+
{
3038+
FCALL_CONTRACT;
3039+
3040+
// Single-pass loop: any 'break' below abandons the fast path and falls through to the slow helper call after the loop.
do
3041+
{
3042+
_ASSERTE(GCHeap::UseAllocationContexts());
3043+
3044+
// Do a conservative check here. This is to avoid overflow while doing the calculations. We don't
3045+
// have to worry about "large" objects, since the allocation quantum is never big enough for
3046+
// LARGE_OBJECT_SIZE.
3047+
//
3048+
// For Value Classes, this needs to be 2^16 - slack (2^32 / max component size).
3049+
// The slack includes the size for the array header and the round-up for alignment. Use 256 for the
3050+
// slack value out of laziness.
3051+
// 'size' is signed; a negative value becomes a very large SIZE_T after the cast and is rejected by the check below.
SIZE_T componentCount = static_cast<SIZE_T>(size);
3052+
if (componentCount >= static_cast<SIZE_T>(65535 - 256))
3053+
{
3054+
break;
3055+
}
3056+
3057+
// This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler
3058+
// to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates
3059+
// some reshuffling of intermediate values into nonvolatile registers around the call.
3060+
Thread *thread = GetThread();
3061+
3062+
TypeHandle arrayTypeHandle(arrayTypeHnd_);
3063+
ArrayTypeDesc *arrayTypeDesc = arrayTypeHandle.AsArray();
3064+
MethodTable *arrayMethodTable = arrayTypeDesc->GetTemplateMethodTable();
3065+
3066+
// Payload size = element count * per-element size.
_ASSERTE(arrayMethodTable->HasComponentSize());
3067+
SIZE_T componentSize = arrayMethodTable->RawGetComponentSize();
3068+
SIZE_T totalSize = componentCount * componentSize;
3069+
// Sanity check that the multiplication did not overflow.
_ASSERTE(totalSize / componentSize == componentCount);
3070+
3071+
// Add the method table's base size to the payload size.
SIZE_T baseSize = arrayMethodTable->GetBaseSize();
3072+
totalSize += baseSize;
3073+
_ASSERTE(totalSize >= baseSize);
3074+
3075+
// Round the total up to DATA_ALIGNMENT.
SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT);
3076+
_ASSERTE(alignedTotalSize >= totalSize);
3077+
totalSize = alignedTotalSize;
3078+
3079+
// Carve the object out of the thread's allocation context by advancing alloc_ptr, provided the
// space remaining between alloc_ptr and alloc_limit is large enough; otherwise take the slow path.
alloc_context *allocContext = thread->GetAllocContext();
3080+
BYTE *allocPtr = allocContext->alloc_ptr;
3081+
_ASSERTE(allocPtr <= allocContext->alloc_limit);
3082+
if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
3083+
{
3084+
break;
3085+
}
3086+
allocContext->alloc_ptr = allocPtr + totalSize;
3087+
3088+
// Initialize the array header: method table and element count.
// NOTE(review): the element storage is not explicitly zeroed here - presumably memory handed out
// through the allocation context is already cleared; confirm against the GC.
_ASSERTE(allocPtr != nullptr);
3089+
ArrayBase *array = reinterpret_cast<ArrayBase *>(allocPtr);
3090+
array->SetMethodTable(arrayMethodTable);
3091+
_ASSERTE(static_cast<DWORD>(componentCount) == componentCount);
3092+
array->m_NumComponents = static_cast<DWORD>(componentCount);
3093+
3094+
#if CHECK_APP_DOMAIN_LEAKS
3095+
if (g_pConfig->AppDomainLeaks())
3096+
{
3097+
array->SetAppDomain();
3098+
}
3099+
#endif // CHECK_APP_DOMAIN_LEAKS
3100+
3101+
return array;
3102+
} while (false);
3103+
3104+
// Tail call to the slow helper
// NOTE(review): ENDFORBIDGC() presumably closes the forbid-GC region set up for this helper so that
// the slow helper is free to trigger a GC - confirm against the FCALL/HCIMPL macro definitions.
3105+
ENDFORBIDGC();
3106+
return HCCALL2(JIT_NewArr1, arrayTypeHnd_, size);
3107+
}
3108+
HCIMPLEND
3109+
3110+
#include <optdefault.h>
30243111

30253112
//========================================================================
30263113
//
@@ -3068,7 +3155,8 @@ HCIMPL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size)
30683155
&& (elemType != ELEMENT_TYPE_U8)
30693156
&& (elemType != ELEMENT_TYPE_R8)
30703157
#endif
3071-
) {
3158+
)
3159+
{
30723160
#ifdef _DEBUG
30733161
if (g_pConfig->FastGCStressLevel()) {
30743162
GetThread()->DisableStressHeap();

src/vm/jitinterface.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,9 @@ EXTERN_C FCDECL2(Object*, JIT_IsInstanceOfInterface_Portable, MethodTable* pMT,
200200
EXTERN_C FCDECL1(Object*, JIT_NewCrossContext, CORINFO_CLASS_HANDLE typeHnd_);
201201
EXTERN_C FCDECL1(Object*, JIT_NewCrossContext_Portable, CORINFO_CLASS_HANDLE typeHnd_);
202202

203+
extern FCDECL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size);
204+
extern FCDECL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size);
205+
203206
#ifndef JIT_Stelem_Ref
204207
#define JIT_Stelem_Ref JIT_Stelem_Ref_Portable
205208
#endif
@@ -1614,7 +1617,6 @@ FCDECL1(StringObject*, UnframedAllocateString, DWORD stringLength);
16141617
OBJECTHANDLE ConstructStringLiteral(CORINFO_MODULE_HANDLE scopeHnd, mdToken metaTok);
16151618

16161619
FCDECL1(Object*, JIT_New, CORINFO_CLASS_HANDLE typeHnd_);
1617-
FCDECL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size);
16181620
FCDECL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* data);
16191621
FCDECL0(VOID, JIT_PollGC);
16201622
#ifdef ENABLE_FAST_GCPOLL_HELPER

src/vm/jitinterfacegen.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -199,11 +199,10 @@ void InitJITHelpers1()
199199

200200
#if defined(_TARGET_AMD64_)
201201

202-
g_WriteBarrierManager.Initialize();
202+
g_WriteBarrierManager.Initialize();
203203

204204
#ifndef FEATURE_IMPLICIT_TLS
205-
206-
if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE)
205+
if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE)
207206
{
208207
FixupInlineGetters(gThreadTLSIndex, InlineGetThreadLocations, COUNTOF(InlineGetThreadLocations));
209208
}
@@ -212,6 +211,7 @@ void InitJITHelpers1()
212211
{
213212
FixupInlineGetters(gAppDomainTLSIndex, InlineGetAppDomainLocations, COUNTOF(InlineGetAppDomainLocations));
214213
}
214+
#endif // !FEATURE_IMPLICIT_TLS
215215

216216
// Allocation helpers, faster but non-logging
217217
if (!((TrackAllocationsEnabled()) ||
@@ -221,10 +221,12 @@ void InitJITHelpers1()
221221
#endif // _DEBUG
222222
))
223223
{
224-
225224
// if (multi-proc || server GC)
226225
if (GCHeap::UseAllocationContexts())
227226
{
227+
#ifdef FEATURE_IMPLICIT_TLS
228+
SetJitHelperFunction(CORINFO_HELP_NEWARR_1_VC, JIT_NewArr1VC_MP_FastPortable);
229+
#else // !FEATURE_IMPLICIT_TLS
228230
// If the TLS for Thread is low enough use the super-fast helpers
229231
if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE)
230232
{
@@ -246,9 +248,11 @@ void InitJITHelpers1()
246248

247249
ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastMP), ECall::FastAllocateString);
248250
}
251+
#endif // FEATURE_IMPLICIT_TLS
249252
}
250253
else
251254
{
255+
#ifndef FEATURE_PAL
252256
// Replace the 1p slow allocation helpers with faster version
253257
//
254258
// When we're running Workstation GC on a single proc box we don't have
@@ -260,9 +264,11 @@ void InitJITHelpers1()
260264
SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_UP);
261265

262266
ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastUP), ECall::FastAllocateString);
267+
#endif // !FEATURE_PAL
263268
}
264269
}
265270

271+
#ifndef FEATURE_IMPLICIT_TLS
266272
if (gThreadTLSIndex >= TLS_MINIMUM_AVAILABLE)
267273
{
268274
// We need to patch the helpers for FCalls

src/vm/object.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,7 @@ class ArrayBase : public Object
745745
friend class Object;
746746
friend OBJECTREF AllocateArrayEx(TypeHandle arrayClass, INT32 *pArgs, DWORD dwNumArgs, BOOL bAllocateInLargeHeap DEBUG_ARG(BOOL bDontSetAppDomain));
747747
friend OBJECTREF FastAllocatePrimitiveArray(MethodTable* arrayType, DWORD cElements, BOOL bAllocateInLargeHeap);
748+
friend Object *JIT_NewArr1VC_MP_FastPortable(CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size);
748749
friend class JIT_TrialAlloc;
749750
friend class CheckAsmOffsets;
750751
friend struct _DacGlobals;

0 commit comments

Comments
 (0)