From 4a97f04fbabe583a529cbd6ddf5bc9c8a2f5b222 Mon Sep 17 00:00:00 2001
From: SingleAccretion
Date: Tue, 23 Sep 2025 01:17:56 +0300
Subject: [PATCH] Optimize PI and RPI transitions

This changes the managed ABI in the following ways:

1) The PI transition frame becomes simply the shadow stack top. This
   "frame" is zero-sized - we don't store the current thread in it, since
   the WASM TLS model allows us to elide it.

   The obvious benefit from this is that the PI path is now almost 100%
   optimal: two stores and one load. We can get rid of the load in an ST
   build as well, but that's left for a future change.

2) The RPI transition frame is now always allocated at a zero offset and
   returned directly from the RPI helper. We thus elide the intermediate
   state where we already have the shadow stack, but haven't yet attached
   the thread. This brings us in line with other targets.

3) The sparse virtual unwind frame is now allocated right after the RPI
   frame, and "combined" RPI helpers are introduced that both effect the
   transition and push the EH frame.

The RPI changes reduce the number of helper calls that any RPI method
needs to make from 3 to 1 (for epilogs - from 2 to 1) in the sparse
virtual unwinding model, and reduce the number of instructions on the
critical path.

Benchmarks:

Node base:
Bench_PInvoke took : 86 ms (8.64 ns / op)
Bench_ReversePInvoke_Empty took : 113 ms (11.30 ns / op)
Bench_ReversePInvoke_WithEH took : 172 ms (17.24 ns / op)

Node diff:
Bench_PInvoke took : 81 ms (8.06 ns / op)
Bench_ReversePInvoke_Empty took : 58 ms (5.81 ns / op)
Bench_ReversePInvoke_WithEH took : 108 ms (10.79 ns / op)

Wasmtime base:
Bench_PInvoke took : 99 ms (9.86 ns / op)
Bench_ReversePInvoke_Empty took : 73 ms (7.28 ns / op)
Bench_ReversePInvoke_WithEH took : 77 ms (7.71 ns / op)

Wasmtime diff:
Bench_PInvoke took : 82 ms (8.16 ns / op)
Bench_ReversePInvoke_Empty took : 31 ms (3.06 ns / op)
Bench_ReversePInvoke_WithEH took : 50 ms (4.98 ns / op)
---
 src/coreclr/inc/corinfo.h                     |   4 +-
 src/coreclr/inc/jithelpers.h                  |   4 +-
 src/coreclr/jit/flowgraph.cpp                 |   2 +
 src/coreclr/jit/llvm.cpp                      |  16 +-
 src/coreclr/jit/llvm.h                        |   1 +
 src/coreclr/jit/llvmlower.cpp                 |  55 ++++---
 src/coreclr/jit/llvmlssa.cpp                  | 102 ++++++++++---
 src/coreclr/jit/utils.cpp                     |  15 +-
 src/coreclr/nativeaot/Runtime/GCHelpers.cpp   |   2 +-
 .../nativeaot/Runtime/StackFrameIterator.cpp  |   2 +
 src/coreclr/nativeaot/Runtime/inc/rhbinder.h  |   6 +-
 src/coreclr/nativeaot/Runtime/thread.cpp      |  34 ++++-
 src/coreclr/nativeaot/Runtime/thread.h        |  28 ++--
 src/coreclr/nativeaot/Runtime/thread.inl      |  29 +---
 src/coreclr/nativeaot/Runtime/threadstore.cpp |   2 +
 .../nativeaot/Runtime/wasm/AllocFast.cpp      |  11 +-
 .../ExceptionHandling/ExceptionHandling.cpp   |  20 +--
 .../nativeaot/Runtime/wasm/GcStress.cpp       |  16 +-
 .../nativeaot/Runtime/wasm/PInvoke.cpp        | 142 ++++++++++++++----
 src/coreclr/nativeaot/Runtime/wasm/wasm.h     |  26 ++++
 .../Common/JitInterface/CorInfoHelpFunc.cs    |   4 +-
 .../JitInterface/CorInfoImpl.RyuJit.cs        |  14 +-
 22 files changed, 360 insertions(+), 175 deletions(-)

diff --git a/src/coreclr/inc/corinfo.h b/src/coreclr/inc/corinfo.h
index 6854c4cf6b97..c3c6890aeee6 100644
--- a/src/coreclr/inc/corinfo.h
+++ b/src/coreclr/inc/corinfo.h
@@ -595,14 +595,14 @@ enum CorInfoHelpFunc
     CORINFO_HELP_VALIDATE_INDIRECT_CALL,  // CFG: Validate function pointer
     CORINFO_HELP_DISPATCH_INDIRECT_CALL,  // CFG: Validate and dispatch to pointer
 
-    CORINFO_HELP_LLVM_GET_OR_INIT_SHADOW_STACK_TOP,
     CORINFO_HELP_LLVM_EH_CATCH,
     CORINFO_HELP_LLVM_EH_POP_UNWOUND_VIRTUAL_FRAMES,
     CORINFO_HELP_LLVM_EH_PUSH_VIRTUAL_UNWIND_FRAME,
+    CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_ENTER_AND_PUSH_VIRTUAL_UNWIND_FRAME,
     CORINFO_HELP_LLVM_EH_POP_VIRTUAL_UNWIND_FRAME,
+    CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_EXIT_AND_POP_VIRTUAL_UNWIND_FRAME,
     CORINFO_HELP_LLVM_EH_UNHANDLED_EXCEPTION,
     CORINFO_HELP_LLVM_RESOLVE_INTERFACE_CALL_TARGET,
-    CORINFO_HELP_LLVM_GET_EXTERNAL_CALL_TARGET,
     CORINFO_HELP_LLVM_STRESS_GC,
 
     CORINFO_HELP_COUNT,

diff --git a/src/coreclr/inc/jithelpers.h b/src/coreclr/inc/jithelpers.h
index e7ae43ac2b8e..27166b2c8046 100644
--- a/src/coreclr/inc/jithelpers.h
+++ b/src/coreclr/inc/jithelpers.h
@@ -343,14 +343,14 @@ JITHELPER(CORINFO_HELP_DISPATCH_INDIRECT_CALL, NULL, METHOD__NIL)
 #endif
 
-    JITHELPER(CORINFO_HELP_LLVM_GET_OR_INIT_SHADOW_STACK_TOP, NULL, METHOD__NIL)
     JITHELPER(CORINFO_HELP_LLVM_EH_CATCH, NULL, METHOD__NIL)
     JITHELPER(CORINFO_HELP_LLVM_EH_POP_UNWOUND_VIRTUAL_FRAMES, NULL, METHOD__NIL)
     JITHELPER(CORINFO_HELP_LLVM_EH_PUSH_VIRTUAL_UNWIND_FRAME, NULL, METHOD__NIL)
+    JITHELPER(CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_ENTER_AND_PUSH_VIRTUAL_UNWIND_FRAME, NULL, METHOD__NIL)
     JITHELPER(CORINFO_HELP_LLVM_EH_POP_VIRTUAL_UNWIND_FRAME, NULL, METHOD__NIL)
+    JITHELPER(CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_EXIT_AND_POP_VIRTUAL_UNWIND_FRAME, NULL, METHOD__NIL)
     JITHELPER(CORINFO_HELP_LLVM_EH_UNHANDLED_EXCEPTION, NULL, METHOD__NIL)
     JITHELPER(CORINFO_HELP_LLVM_RESOLVE_INTERFACE_CALL_TARGET, NULL, METHOD__NIL)
-    JITHELPER(CORINFO_HELP_LLVM_GET_EXTERNAL_CALL_TARGET, NULL, METHOD__NIL)
     JITHELPER(CORINFO_HELP_LLVM_STRESS_GC, JIT_StressGC, METHOD__NIL)
 
 #undef JITHELPER
 #undef DYNAMICJITHELPER

diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index bb836a6f61ac..5a348d69c312 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -1740,6 +1740,7 @@ void Compiler::fgAddReversePInvokeEnterExit()
     LclVarDsc* varDsc = lvaGetDesc(lvaReversePInvokeFrameVar);
     lvaSetStruct(lvaReversePInvokeFrameVar, typGetBlkLayout(eeGetEEInfo()->sizeOfReversePInvokeFrame), false);
 
+#ifndef TARGET_WASM // WASM RPI helpers have special ABI and are inserted in lowering.
     // Add enter pinvoke exit callout at the start of prolog
     GenTree* pInvokeFrameVar = gtNewLclVarAddrNode(lvaReversePInvokeFrameVar);
@@ -1804,6 +1805,7 @@ void Compiler::fgAddReversePInvokeEnterExit()
         printf("\n");
     }
 #endif
+#endif // !TARGET_WASM
 }
 
 /*****************************************************************************

diff --git a/src/coreclr/jit/llvm.cpp b/src/coreclr/jit/llvm.cpp
index 7a2982ca4bcf..fe78d39da9d0 100644
--- a/src/coreclr/jit/llvm.cpp
+++ b/src/coreclr/jit/llvm.cpp
@@ -517,11 +517,11 @@ bool Llvm::helperCallMayVirtuallyUnwind(CorInfoHelpFunc helperFunc) const
     { FUNC(CORINFO_HELP_THROW_ENTRYPOINT_NOT_FOUND_EXCEPTION) },
 
     // [R]PI helpers, implemented in "Runtime\thread.cpp".
-    { FUNC(CORINFO_HELP_JIT_PINVOKE_BEGIN) CORINFO_TYPE_VOID, { CORINFO_TYPE_PTR }, HFIF_SS_ARG | HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
-    { FUNC(CORINFO_HELP_JIT_PINVOKE_END) CORINFO_TYPE_VOID, { CORINFO_TYPE_PTR }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
-    { FUNC(CORINFO_HELP_JIT_REVERSE_PINVOKE_ENTER) CORINFO_TYPE_VOID, { CORINFO_TYPE_PTR }, HFIF_SS_ARG },
+    { FUNC(CORINFO_HELP_JIT_PINVOKE_BEGIN) CORINFO_TYPE_VOID, { }, HFIF_SS_ARG | HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
+    { FUNC(CORINFO_HELP_JIT_PINVOKE_END) CORINFO_TYPE_VOID, { }, HFIF_SS_ARG | HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
+    { FUNC(CORINFO_HELP_JIT_REVERSE_PINVOKE_ENTER) CORINFO_TYPE_PTR, { CORINFO_TYPE_NATIVEUINT }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
     { FUNC(CORINFO_HELP_JIT_REVERSE_PINVOKE_ENTER_TRACK_TRANSITIONS) },
-    { FUNC(CORINFO_HELP_JIT_REVERSE_PINVOKE_EXIT) CORINFO_TYPE_VOID, { CORINFO_TYPE_PTR, CORINFO_TYPE_PTR }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
+    { FUNC(CORINFO_HELP_JIT_REVERSE_PINVOKE_EXIT) CORINFO_TYPE_VOID, { CORINFO_TYPE_PTR }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
     { FUNC(CORINFO_HELP_JIT_REVERSE_PINVOKE_EXIT_TRACK_TRANSITIONS) },
 
     // Implemented in "CoreLib\src\System\Runtime\TypeLoaderExports.cs".
@@ -544,14 +544,14 @@ bool Llvm::helperCallMayVirtuallyUnwind(CorInfoHelpFunc helperFunc) const
     { FUNC(CORINFO_HELP_VALIDATE_INDIRECT_CALL) },
     { FUNC(CORINFO_HELP_DISPATCH_INDIRECT_CALL) },
 
-    { FUNC(CORINFO_HELP_LLVM_GET_OR_INIT_SHADOW_STACK_TOP) CORINFO_TYPE_PTR, { }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
     { FUNC(CORINFO_HELP_LLVM_EH_CATCH) CORINFO_TYPE_CLASS, { CORINFO_TYPE_NATIVEUINT }, HFIF_SS_ARG },
     { FUNC(CORINFO_HELP_LLVM_EH_POP_UNWOUND_VIRTUAL_FRAMES) CORINFO_TYPE_VOID, { }, HFIF_SS_ARG },
-    { FUNC(CORINFO_HELP_LLVM_EH_PUSH_VIRTUAL_UNWIND_FRAME) CORINFO_TYPE_VOID, { CORINFO_TYPE_PTR, CORINFO_TYPE_PTR, CORINFO_TYPE_NATIVEUINT }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND},
-    { FUNC(CORINFO_HELP_LLVM_EH_POP_VIRTUAL_UNWIND_FRAME) CORINFO_TYPE_VOID, { }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND},
+    { FUNC(CORINFO_HELP_LLVM_EH_PUSH_VIRTUAL_UNWIND_FRAME) CORINFO_TYPE_VOID, { CORINFO_TYPE_PTR, CORINFO_TYPE_PTR, CORINFO_TYPE_NATIVEUINT }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
+    { FUNC(CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_ENTER_AND_PUSH_VIRTUAL_UNWIND_FRAME) CORINFO_TYPE_PTR, { CORINFO_TYPE_NATIVEUINT, CORINFO_TYPE_PTR, CORINFO_TYPE_NATIVEUINT }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
+    { FUNC(CORINFO_HELP_LLVM_EH_POP_VIRTUAL_UNWIND_FRAME) CORINFO_TYPE_VOID, { }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
+    { FUNC(CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_EXIT_AND_POP_VIRTUAL_UNWIND_FRAME) CORINFO_TYPE_VOID, { CORINFO_TYPE_PTR }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
     { FUNC(CORINFO_HELP_LLVM_EH_UNHANDLED_EXCEPTION) CORINFO_TYPE_VOID, { CORINFO_TYPE_CLASS }, HFIF_SS_ARG },
     { FUNC(CORINFO_HELP_LLVM_RESOLVE_INTERFACE_CALL_TARGET) CORINFO_TYPE_PTR, { CORINFO_TYPE_CLASS, CORINFO_TYPE_PTR }, HFIF_SS_ARG },
-    { FUNC(CORINFO_HELP_LLVM_GET_EXTERNAL_CALL_TARGET) CORINFO_TYPE_PTR, { }, HFIF_NO_RPI_OR_GC | HFIF_NO_VIRTUAL_UNWIND },
     { FUNC(CORINFO_HELP_LLVM_STRESS_GC) CORINFO_TYPE_BYREF, { CORINFO_TYPE_BYREF, CORINFO_TYPE_PTR }, HFIF_SS_ARG },
 };
 // clang-format on

diff --git a/src/coreclr/jit/llvm.h b/src/coreclr/jit/llvm.h
index cd944165431e..8a43fd9f12cf 100644
--- a/src/coreclr/jit/llvm.h
+++ b/src/coreclr/jit/llvm.h
@@ -308,6 +308,7 @@ class Llvm
     // Shared between virtual unwind frame insertion and LSSA.
     unsigned m_initialUnwindIndex = UNWIND_INDEX_NONE;
+    CORINFO_GENERIC_HANDLE m_ehInfoSymbol = nullptr;
 
     // Shared between unwind index insertion and EH codegen.
     ArrayStack* m_unwindIndexMap = nullptr;

diff --git a/src/coreclr/jit/llvmlower.cpp b/src/coreclr/jit/llvmlower.cpp
index dbb15fad9f7c..daabed2fc0dc 100644
--- a/src/coreclr/jit/llvmlower.cpp
+++ b/src/coreclr/jit/llvmlower.cpp
@@ -243,6 +243,8 @@ void Llvm::lowerBlock(BasicBlock* block)
 
 void Llvm::lowerRange(BasicBlock* block, LIR::Range& range)
 {
+    BasicBlock* savedBlock = m_currentBlock;
+    LIR::Range* savedRange = m_currentRange;
     m_currentBlock = block;
     m_currentRange = &range;
@@ -253,8 +255,8 @@ void Llvm::lowerRange(BasicBlock* block, LIR::Range& range)
 
     INDEBUG(range.CheckLIR(_compiler, /* checkUnusedValues */ true));
 
-    m_currentBlock = nullptr;
-    m_currentRange = nullptr;
+    m_currentBlock = savedBlock;
+    m_currentRange = savedRange;
 }
 
 void Llvm::lowerNode(GenTree* node)
@@ -795,21 +797,19 @@ void Llvm::lowerUnmanagedCall(GenTreeCall* callNode)
     // two or more consecutive PI calls.
     if (!callNode->IsSuppressGCTransition())
     {
+        // TODO-LLVM-Upstream: don't allocate lvaInlinedPInvokeFrameVar (its size is zero).
        assert(_compiler->opts.ShouldUsePInvokeHelpers()); // No inline transition support yet.
         assert(_compiler->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
 
         // Insert CORINFO_HELP_JIT_PINVOKE_BEGIN.
-        GenTreeLclFld* frameAddr = _compiler->gtNewLclVarAddrNode(_compiler->lvaInlinedPInvokeFrameVar);
-        GenTreeCall* helperCall = _compiler->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_BEGIN, TYP_VOID, frameAddr);
-        CurrentRange().InsertBefore(callNode, frameAddr, helperCall);
-        lowerNode(frameAddr);
+        GenTreeCall* helperCall = _compiler->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_BEGIN, TYP_VOID);
+        CurrentRange().InsertBefore(callNode, helperCall);
         lowerNode(helperCall);
 
         // Insert CORINFO_HELP_JIT_PINVOKE_END. No need to explicitly lower the call/local address as the
         // normal lowering loop will pick them up.
-        frameAddr = _compiler->gtNewLclVarAddrNode(_compiler->lvaInlinedPInvokeFrameVar);
-        helperCall = _compiler->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_END, TYP_VOID, frameAddr);
-        CurrentRange().InsertAfter(callNode, frameAddr, helperCall);
+        helperCall = _compiler->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_END, TYP_VOID);
+        CurrentRange().InsertAfter(callNode, helperCall);
     }
 
     if (callNode->gtCallType != CT_INDIRECT)
@@ -1598,20 +1598,27 @@ bool Llvm::addVirtualUnwindFrameForExceptionHandling()
         CORINFO_GENERIC_HANDLE ehInfoSymbol =
             m_llvm->GetSparseVirtualUnwindInfo(&clauses.BottomRef(), clauses.Height());
 
-        GenTree* ehInfoNode =
-            m_compiler->gtNewIconHandleNode(reinterpret_cast(ehInfoSymbol), GTF_ICON_CONST_PTR);
-        GenTree* unwindFrameLclAddr = m_compiler->gtNewLclVarAddrNode(unwindFrameLclNum);
-        GenTreeIntCon* initialUnwindIndexNode = m_compiler->gtNewIconNode(m_initialIndexValue, TYP_I_IMPL);
-        GenTreeCall* initializeCall =
-            m_compiler->gtNewHelperCallNode(CORINFO_HELP_LLVM_EH_PUSH_VIRTUAL_UNWIND_FRAME, TYP_VOID,
-                                            unwindFrameLclAddr, ehInfoNode, initialUnwindIndexNode);
-
-        LIR::Range initRange;
-        initRange.InsertAtEnd(unwindFrameLclAddr);
-        initRange.InsertAtEnd(ehInfoNode);
-        initRange.InsertAtEnd(initialUnwindIndexNode);
-        initRange.InsertAtEnd(initializeCall);
-        m_llvm->lowerAndInsertIntoFirstBlock(std::move(initRange));
+        // For frames with an RPI transition, we will use RPI helpers that combine the transitions with unwind
+        // frame linking.
+        if (!m_compiler->opts.IsReversePInvoke())
+        {
+            GenTree* ehInfoNode =
+                m_compiler->gtNewIconHandleNode(reinterpret_cast(ehInfoSymbol), GTF_ICON_CONST_PTR);
+            GenTree* unwindFrameLclAddr = m_compiler->gtNewLclVarAddrNode(unwindFrameLclNum);
+            GenTreeIntCon* initialUnwindIndexNode = m_compiler->gtNewIconNode(m_initialIndexValue, TYP_I_IMPL);
+            GenTreeCall* initializeCall =
+                m_compiler->gtNewHelperCallNode(CORINFO_HELP_LLVM_EH_PUSH_VIRTUAL_UNWIND_FRAME, TYP_VOID,
+                                                unwindFrameLclAddr, ehInfoNode, initialUnwindIndexNode);
+
+            LIR::Range initRange;
+            initRange.InsertAtEnd(unwindFrameLclAddr);
+            initRange.InsertAtEnd(ehInfoNode);
+            initRange.InsertAtEnd(initialUnwindIndexNode);
+            initRange.InsertAtEnd(initializeCall);
+            m_llvm->lowerAndInsertIntoFirstBlock(std::move(initRange));
+        }
+
+        m_llvm->m_ehInfoSymbol = ehInfoSymbol;
         m_llvm->m_sparseVirtualUnwindFrameLclNum = unwindFrameLclNum;
     }
@@ -1630,7 +1637,7 @@ bool Llvm::addVirtualUnwindFrameForExceptionHandling()
     }
 
     // Explicit pops are only needed for explicitly linked (via TLS) sparse frames.
-    if (m_llvm->m_sparseVirtualUnwindFrameLclNum != BAD_VAR_NUM)
+    if ((m_llvm->m_sparseVirtualUnwindFrameLclNum != BAD_VAR_NUM) && !m_compiler->opts.IsReversePInvoke())
     {
         for (BasicBlock* block : m_compiler->Blocks())
         {

diff --git a/src/coreclr/jit/llvmlssa.cpp b/src/coreclr/jit/llvmlssa.cpp
index 771353f2767c..5901d25a0c39 100644
--- a/src/coreclr/jit/llvmlssa.cpp
+++ b/src/coreclr/jit/llvmlssa.cpp
@@ -80,6 +80,14 @@ class ShadowStackAllocator
 private:
     void IdentifyCandidatesAndInitializeLocals()
     {
+        if (m_compiler->lvaReversePInvokeFrameVar != BAD_VAR_NUM)
+        {
+            // Expose this explicitly since we delay inserting the RPI helpers until after allocation.
+            m_compiler->lvaSetVarAddrExposed(
+                m_compiler->lvaReversePInvokeFrameVar DEBUGARG(AddressExposedReason::ESCAPE_ADDRESS));
+            m_compiler->lvaGetDesc(m_compiler->lvaReversePInvokeFrameVar)->lvHasExplicitInit = true;
+        }
+
         // Initialize independently promoted parameter field locals.
         //
         for (unsigned lclNum = 0; lclNum < m_compiler->lvaCount; lclNum++)
@@ -163,6 +171,12 @@ class ShadowStackAllocator
                 allocLocation = REG_STK_CANDIDATE_UNCONDITIONAL;
                 INDEBUG(reason = "sparse virtual unwind frame");
             }
+            // RPI frame being on the shadow stack allows us to combine it with the sparse virtual unwind frame.
+            else if (lclNum == m_compiler->lvaReversePInvokeFrameVar)
+            {
+                allocLocation = REG_STK_CANDIDATE_UNCONDITIONAL;
+                INDEBUG(reason = "RPI frame");
+            }
             // Precise virtual unwind frames work by being at known offsets from each other on the shadow stack.
             else if (lclNum == m_llvm->m_preciseVirtualUnwindFrameLclNum)
             {
@@ -1572,8 +1586,15 @@ class ShadowStackAllocator
             m_compiler->lvaGetDesc(preciseVirtualUnwindFrameLclNum)->SetRegNum(REG_STK);
         }
 
-        // The shadow frame must be allocated at a zero offset; the runtime uses its value as the original
-        // shadow frame parameter to filter funclets.
+        // As an optimization, the RPI frame is hardcoded to be at offset zero so that we don't
+        // need to pass its offset to the RPI helper.
+        if (m_compiler->lvaReversePInvokeFrameVar != BAD_VAR_NUM)
+        {
+            assignOffset(m_compiler->lvaGetDesc(m_compiler->lvaReversePInvokeFrameVar));
+        }
+
+        // As another optimization, the sparse virtual unwind frame is allocated right after the RPI frame
+        // so that we can use the RPI helpers which combine the transition itself with the EH frame push/pop.
         if (m_llvm->m_sparseVirtualUnwindFrameLclNum != BAD_VAR_NUM)
         {
             assignOffset(m_compiler->lvaGetDesc(m_llvm->m_sparseVirtualUnwindFrameLclNum));
@@ -1635,7 +1656,7 @@ class ShadowStackAllocator
         m_llvm->m_currentBlock = m_compiler->fgFirstBB;
         m_llvm->m_currentRange = &initRange;
 
-        InitializeShadowStackValue();
+        InitializeShadowStackValueAndInsertReversePInvokeTransitions();
         m_llvm->initializePreciseVirtualUnwindFrame();
 
         unsigned zeroingSize = m_prologZeroingSize;
@@ -1664,8 +1685,11 @@ class ShadowStackAllocator
         m_llvm->m_prologEnd = zeroILOffsetNode;
     }
 
-    void InitializeShadowStackValue()
+    void InitializeShadowStackValueAndInsertReversePInvokeTransitions()
     {
+        unsigned alignment = m_shadowFrameAlignment;
+        bool explicitAlignNeeded = alignment != DEFAULT_SHADOW_STACK_ALIGNMENT;
+
         unsigned lclNum = m_llvm->m_shadowStackLclNum;
         LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclNum);
 
         // The liveness of our shadow stack local that has been computed before LSSA is not correct since we haven't
@@ -1673,21 +1697,64 @@ class ShadowStackAllocator
         varDsc->lvTracked = 0;
 
         GenTreeLclVar* def = nullptr;
-        if (!varDsc->lvIsParam)
+        assert(!varDsc->lvIsParam == m_compiler->opts.IsReversePInvoke());
+        if (m_compiler->opts.IsReversePInvoke())
         {
-            GenTree* call = m_compiler->gtNewHelperCallNode(CORINFO_HELP_LLVM_GET_OR_INIT_SHADOW_STACK_TOP, TYP_I_IMPL);
+            assert(!m_compiler->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TRACK_TRANSITIONS));
+
+            // We optimize the case where the transition can be combined with the virtual unwind frame push/pop.
+            GenTree* call;
+            GenTree* alignValueNode = m_compiler->gtNewIconNode(explicitAlignNeeded ? alignment : 0, TYP_I_IMPL);
+            m_llvm->CurrentRange().InsertAtEnd(alignValueNode);
+            if (m_llvm->m_sparseVirtualUnwindFrameLclNum != BAD_VAR_NUM)
+            {
+                GenTree* ehInfoNode = m_compiler->gtNewIconHandleNode(
+                    reinterpret_cast(m_llvm->m_ehInfoSymbol), GTF_ICON_CONST_PTR);
+                m_llvm->CurrentRange().InsertAtEnd(ehInfoNode);
+
+                GenTreeIntCon* initialUnwindIndexNode =
+                    m_compiler->gtNewIconNode(m_llvm->m_initialUnwindIndex, TYP_I_IMPL);
+                m_llvm->CurrentRange().InsertAtEnd(initialUnwindIndexNode);
+
+                call = m_compiler->gtNewHelperCallNode(
+                    CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_ENTER_AND_PUSH_VIRTUAL_UNWIND_FRAME,
+                    TYP_I_IMPL, alignValueNode, ehInfoNode, initialUnwindIndexNode);
+            }
+            else
+            {
+                call = m_compiler->gtNewHelperCallNode(
+                    CORINFO_HELP_JIT_REVERSE_PINVOKE_ENTER, TYP_I_IMPL, alignValueNode);
+            }
             def = m_compiler->gtNewStoreLclVarNode(lclNum, call);
             m_llvm->CurrentRange().InsertAtEnd(call);
             m_llvm->CurrentRange().InsertAtEnd(def);
             varDsc->lvHasExplicitInit = 1;
+            explicitAlignNeeded = false; // The helper will align the shadow stack as necessary.
 
             JITDUMP("ReversePInvoke: initialized the shadow stack:\n");
             DISPTREERANGE(m_llvm->CurrentRange(), def);
+
+            for (BasicBlock* block : m_compiler->Blocks())
+            {
+                if (block->KindIs(BBJ_RETURN))
+                {
+                    LIR::Range callRange;
+                    CorInfoHelpFunc helperFunc = m_llvm->m_sparseVirtualUnwindFrameLclNum != BAD_VAR_NUM
+                        ? CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_EXIT_AND_POP_VIRTUAL_UNWIND_FRAME
+                        : CORINFO_HELP_JIT_REVERSE_PINVOKE_EXIT;
+                    GenTree* addr = m_compiler->gtNewLclVarAddrNode(m_compiler->lvaReversePInvokeFrameVar);
+                    GenTree* call = m_compiler->gtNewHelperCallNode(helperFunc, TYP_VOID, addr);
+                    callRange.InsertAtEnd(addr);
+                    callRange.InsertAtEnd(call);
+
+                    m_llvm->lowerRange(block, callRange);
+                    LIR::InsertBeforeTerminator(block, std::move(callRange));
+                }
+            }
         }
 
         m_llvm->m_shadowStackSsaNum = AddUntrackedSsaDef(def, lclNum);
 
-        unsigned alignment = m_shadowFrameAlignment;
-        if (alignment != DEFAULT_SHADOW_STACK_ALIGNMENT)
+        if (explicitAlignNeeded)
         {
             // Zero the padding that may be introduced by the code below. This serves two purposes:
             // 1. We don't leave "random" pointers on the shadow stack.
@@ -1827,9 +1894,14 @@ class ShadowStackAllocator
         // Filters will be called by the first pass while live state still exists on shadow frames above (in the
         // traditional sense, where stacks grow down) them. For this reason, filters will access state from the
         // original frame via a dedicated shadow stack pointer, and use the actual shadow stack for calls.
-        regNumber shadowStackArgReg =
-            m_llvm->isBlockInFilter(m_llvm->CurrentBlock()) ? REG_ORIGINAL_SHADOW_STACK_ARG : REG_NA;
+        bool isFilter = m_llvm->isBlockInFilter(m_llvm->CurrentBlock());
+        regNumber shadowStackArgReg = isFilter ? REG_ORIGINAL_SHADOW_STACK_ARG : REG_NA;
         unsigned lclOffset = lclBaseOffset + lclNode->GetLclOffs();
+        if (isFilter && (m_llvm->m_sparseVirtualUnwindFrameLclNum != BAD_VAR_NUM))
+        {
+            // In the sparse model, the original shadow stack pointer is the address of the virtual unwind frame.
+            lclOffset -= m_compiler->lvaGetDesc(m_llvm->m_sparseVirtualUnwindFrameLclNum)->GetStackOffset();
+        }
 
         GenTree* lclAddress = InsertShadowStackAddr(lclNode, lclOffset, shadowStackArgReg);
 
         ClassLayout* layout = lclNode->TypeIs(TYP_STRUCT) ? lclNode->GetLayout(m_compiler) : nullptr;
@@ -1891,16 +1963,6 @@ class ShadowStackAllocator
 
     void RewriteCall(GenTreeCall* call)
     {
-        if (call->IsHelperCall(m_compiler, CORINFO_HELP_JIT_REVERSE_PINVOKE_EXIT))
-        {
-            // The RPI exit call has an additional argument - the shadow stack top on entry to this RPI method.
-            GenTree* previousShadowStackTop = InsertShadowStackAddr(call, 0);
-            CallArg* callArg =
-                call->gtArgs.PushFront(m_compiler, NewCallArg::Primitive(previousShadowStackTop, CORINFO_TYPE_PTR));
-            callArg->AbiInfo.IsPointer = true;
-            callArg->AbiInfo.ArgType = TYP_I_IMPL;
-        }
-
         // Add in the shadow stack argument now that we know the shadow frame size.
         if (m_llvm->callHasManagedCallingConvention(call))
         {

diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp
index 3b16eec3c7cc..6de527fd9c01 100644
--- a/src/coreclr/jit/utils.cpp
+++ b/src/coreclr/jit/utils.cpp
@@ -1810,7 +1810,6 @@ void HelperCallProperties::init()
 
             // This is a debugging aid; it simply returns a constant address.
             case CORINFO_HELP_LOOP_CLONE_CHOICE_ADDR:
-            case CORINFO_HELP_LLVM_GET_EXTERNAL_CALL_TARGET:
                 isPure  = true;
                 noThrow = true;
                 break;
@@ -1841,22 +1840,16 @@ void HelperCallProperties::init()
                 mutatesHeap = true; // Conservatively.
                 break;
 
-            case CORINFO_HELP_LLVM_GET_OR_INIT_SHADOW_STACK_TOP:
+            case CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_ENTER_AND_PUSH_VIRTUAL_UNWIND_FRAME:
+            case CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_EXIT_AND_POP_VIRTUAL_UNWIND_FRAME:
+                isNoGC = true;
+                FALLTHROUGH;
             case CORINFO_HELP_LLVM_EH_CATCH:
             case CORINFO_HELP_LLVM_EH_POP_UNWOUND_VIRTUAL_FRAMES:
             case CORINFO_HELP_LLVM_EH_PUSH_VIRTUAL_UNWIND_FRAME:
             case CORINFO_HELP_LLVM_EH_POP_VIRTUAL_UNWIND_FRAME:
                 noThrow     = true;
                 mutatesHeap = true;
-                switch (helper)
-                {
-                    case CORINFO_HELP_LLVM_GET_OR_INIT_SHADOW_STACK_TOP:
-                        nonNullReturn = true;
-                        break;
-
-                    default:
-                        break;
-                }
                 break;
 
             default:

diff --git a/src/coreclr/nativeaot/Runtime/GCHelpers.cpp b/src/coreclr/nativeaot/Runtime/GCHelpers.cpp
index c50d2c854c24..91805266d6ba 100644
--- a/src/coreclr/nativeaot/Runtime/GCHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/GCHelpers.cpp
@@ -672,7 +672,7 @@ EXTERN_C void* F_CALL_CONV RhpGcAlloc(MethodTable* pEEType, uint32_t uFlags, uin
         ASSERT(pThread->IsHijacked());
         pTransitionFrame->m_RIP = pThread->GetHijackedReturnAddress();
     }
-#else
+#elif !defined(HOST_WASM)
 
     // NOTE: The x64 fixup above would not be sufficient on ARM64 and similar architectures since
     // m_RIP is used to restore LR in POP_COOP_PINVOKE_FRAME.

diff --git a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp
index 762cde5c9bfb..171b2e25c916 100644
--- a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp
+++ b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp
@@ -70,6 +70,7 @@ EXTERN_C CODE_LOCATION RhpRethrow2;
 #define FAILFAST_OR_DAC_FAIL_UNCONDITIONALLY(msg) { ASSERT_UNCONDITIONALLY(msg); RhFailFast(); }
 #endif
 
+#ifndef HOST_WASM // TODO-LLVM: consider excluding this whole file from the portable runtime build...
 StackFrameIterator::StackFrameIterator(Thread * pThreadToWalk, PInvokeTransitionFrame* pInitialTransitionFrame)
 {
     STRESS_LOG0(LF_STACKWALK, LL_INFO10000, "----Init---- [ GC ]\n");
@@ -94,6 +95,7 @@ StackFrameIterator::StackFrameIterator(Thread * pThreadToWalk, PInvokeTransition
 
     PrepareToYieldFrame();
 }
+#endif // !HOST_WASM
 
 StackFrameIterator::StackFrameIterator(Thread * pThreadToWalk, PTR_PAL_LIMITED_CONTEXT pCtx)
 {

diff --git a/src/coreclr/nativeaot/Runtime/inc/rhbinder.h b/src/coreclr/nativeaot/Runtime/inc/rhbinder.h
index 4451b9225d5f..16db0e813093 100644
--- a/src/coreclr/nativeaot/Runtime/inc/rhbinder.h
+++ b/src/coreclr/nativeaot/Runtime/inc/rhbinder.h
@@ -491,9 +491,11 @@ class Thread;
 #if defined(USE_PORTABLE_HELPERS)
 struct PInvokeTransitionFrame
 {
+#ifndef HOST_WASM
     Thread* m_pThread; // Cached so that GetThread is only called once per method
-    uint32_t m_Flags;  // PInvokeTransitionFrameFlags. TODO-LLVM-CQ: Remove. Only needed for Thread.Abort "support".
-    TgtPTR_Void m_RIP; // PInvokeTransitionFrameFlags. TODO-LLVM-CQ: Remove.
+    uint32_t m_Flags;  // PInvokeTransitionFrameFlags.
+    TgtPTR_Void m_RIP; // PInvokeTransitionFrameFlags.
+#endif // !HOST_WASM
 };
 #else // USE_PORTABLE_HELPERS
 struct PInvokeTransitionFrame

diff --git a/src/coreclr/nativeaot/Runtime/thread.cpp b/src/coreclr/nativeaot/Runtime/thread.cpp
index 3c471751f244..e738a32a8fdb 100644
--- a/src/coreclr/nativeaot/Runtime/thread.cpp
+++ b/src/coreclr/nativeaot/Runtime/thread.cpp
@@ -146,9 +146,7 @@ void Thread::ResetCachedTransitionFrame()
 void Thread::EnablePreemptiveMode()
 {
     ASSERT(ThreadStore::GetCurrentThread() == this);
-#if !defined(HOST_WASM)
     ASSERT(m_pDeferredTransitionFrame != NULL);
-#endif
 
     // set preemptive mode
     VolatileStoreWithoutBarrier(&m_pTransitionFrame, m_pDeferredTransitionFrame);
@@ -317,6 +315,13 @@ void Thread::Construct()
 
     ASSERT(m_threadAbortException == NULL);
 
+#ifdef HOST_WASM
+    // TODO-LLVM: make this configurable, e.g. dependent on native stack size.
+    m_pShadowStackBottom = new (nothrow) uint8_t[1 * 1024 * 1024];
+    if (m_pShadowStackBottom == nullptr)
+        RhFailFast();
+#endif // HOST_WASM
+
 #ifdef FEATURE_SUSPEND_REDIRECTION
     ASSERT(m_redirectionContextBuffer == NULL);
 #endif //FEATURE_SUSPEND_REDIRECTION
@@ -387,6 +392,13 @@ void Thread::Destroy()
         StressLog::ThreadDetach(ptsl);
 #endif // STRESS_LOG
 
+#ifdef HOST_WASM
+    if (m_pShadowStackBottom != nullptr)
+    {
+        delete[] m_pShadowStackBottom;
+    }
+#endif // HOST_WASM
+
 #ifdef FEATURE_SUSPEND_REDIRECTION
     if (m_redirectionContextBuffer != NULL)
     {
@@ -398,10 +410,12 @@ void Thread::Destroy()
 }
 
 #ifdef HOST_WASM
-void Thread::GcScanWasmShadowStack(ScanFunc * pfnEnumCallback, ScanContext * pvCallbackData)
+void Thread::GcScanRootsWorker_Wasm(ScanFunc * pfnEnumCallback, ScanContext * pvCallbackData)
 {
     // Wasm does not permit iteration of stack frames so it uses a shadow stack instead
-    EnumGcRefsInRegionConservatively((PTR_OBJECTREF)m_pShadowStackBottom, (PTR_OBJECTREF)m_pShadowStackTop, pfnEnumCallback, pvCallbackData);
+    PTR_OBJECTREF pShadowStackBottom = (PTR_OBJECTREF)GetShadowStackBottom();
+    PTR_OBJECTREF pShadowStackTop = (PTR_OBJECTREF)GetShadowStackTop(GetTransitionFrame());
+    EnumGcRefsInRegionConservatively(pShadowStackBottom, pShadowStackTop, pfnEnumCallback, pvCallbackData);
 
     // TODO-LLVM-Upstream: unify this method with the general "GcScanRootsWorker" below.
     for (GCFrameRegistration* pCurGCFrame = m_pGCFrameRegistrations; pCurGCFrame != NULL; pCurGCFrame = pCurGCFrame->m_pNext)
@@ -422,7 +436,7 @@ void Thread::GcScanRoots(ScanFunc * pfnEnumCallback, ScanContext * pvCallbackDat
     this->CrossThreadUnhijack();
 
 #ifdef HOST_WASM
-    GcScanWasmShadowStack(pfnEnumCallback, pvCallbackData);
+    GcScanRootsWorker_Wasm(pfnEnumCallback, pvCallbackData);
 #else
     StackFrameIterator frameIterator(this, GetTransitionFrame());
     GcScanRootsWorker(pfnEnumCallback, pvCallbackData, frameIterator);
@@ -1037,7 +1051,11 @@ EXTERN_C void FASTCALL RhpUnsuppressGcStress()
 // Standard calling convention variant and actual implementation for RhpWaitForGC
 EXTERN_C NOINLINE void FASTCALL RhpWaitForGC2(PInvokeTransitionFrame * pFrame)
 {
-    Thread * pThread = pFrame->m_pThread;
+#ifdef HOST_WASM
+    Thread* pThread = ThreadStore::GetCurrentThread();
+#else
+    Thread* pThread = pFrame->m_pThread;
+#endif
     if (pThread->IsDoNotTriggerGcSet())
         return;
@@ -1047,10 +1065,12 @@ EXTERN_C NOINLINE void FASTCALL RhpWaitForGC2(PInvokeTransitionFrame * pFrame)
 // Standard calling convention variant and actual implementation for RhpGcPoll
 EXTERN_C NOINLINE void FASTCALL RhpGcPoll2(PInvokeTransitionFrame* pFrame)
 {
+#ifndef HOST_WASM
     ASSERT(!Thread::IsHijackTarget(pFrame->m_RIP));
 
     Thread* pThread = ThreadStore::GetCurrentThread();
     pFrame->m_pThread = pThread;
+#endif // !HOST_WASM
 
     RhpWaitForGC2(pFrame);
 }
@@ -1319,6 +1339,7 @@ FCIMPL0(uint64_t, RhCurrentOSThreadId)
 }
 FCIMPLEND
 
+#ifndef HOST_WASM
 // Standard calling convention variant and actual implementation for RhpReversePInvokeAttachOrTrapThread
 EXTERN_C NOINLINE void FASTCALL RhpReversePInvokeAttachOrTrapThread2(ReversePInvokeFrame* pFrame)
 {
@@ -1330,7 +1351,6 @@ EXTERN_C NOINLINE void FASTCALL RhpReversePInvokeAttachOrTrapThread2(ReversePInv
 // PInvoke
 //
 
-#ifndef HOST_WASM
 FCIMPL1(void, RhpReversePInvoke, ReversePInvokeFrame * pFrame)
 {
     Thread * pCurThread = ThreadStore::RawGetCurrentThread();

diff --git a/src/coreclr/nativeaot/Runtime/thread.h b/src/coreclr/nativeaot/Runtime/thread.h
index 47e28c6321ac..02f244fb449c 100644
--- a/src/coreclr/nativeaot/Runtime/thread.h
+++ b/src/coreclr/nativeaot/Runtime/thread.h
@@ -166,19 +166,21 @@ struct RuntimeThreadLocals
     uint8_t* m_redirectionContextBuffer;  // storage for redirection context, allocated on demand
 #endif //FEATURE_SUSPEND_REDIRECTION
 
+#ifdef HOST_WASM
+    uint8_t* m_pShadowStackBottom;
+#endif // HOST_WASM
+
 #ifdef FEATURE_GC_STRESS
     uint32_t m_uRand; // current per-thread random number
 #endif // FEATURE_GC_STRESS
-
-#ifdef HOST_WASM
-    void* m_pShadowStackBottom;
-    void* m_pShadowStackTop;
-#endif // HOST_WASM
 };
 
 struct ReversePInvokeFrame
 {
     PInvokeTransitionFrame* m_savedPInvokeTransitionFrame;
+#ifndef HOST_WASM
     Thread* m_savedThread;
+#endif // !HOST_WASM
 };
 
 class Thread : private RuntimeThreadLocals
@@ -249,8 +251,8 @@ class Thread : private RuntimeThreadLocals
     PInvokeTransitionFrame* GetTransitionFrame();
 
 #ifdef HOST_WASM
-    void GcScanWasmShadowStack(ScanFunc* pfnEnumCallback, ScanContext* pvCallbackData);
-#endif
+    void GcScanRootsWorker_Wasm(ScanFunc* pfnEnumCallback, ScanContext* pvCallbackData);
+#endif // HOST_WASM
 
     void GcScanRootsWorker(ScanFunc* pfnEnumCallback, ScanContext* pvCallbackData, StackFrameIterator & sfIter);
@@ -391,17 +393,17 @@ class Thread : private RuntimeThreadLocals
     pthread_t GetOSThreadHandle() { return m_hOSThread; }
 #endif
 
+#ifdef HOST_WASM
+    void* GetShadowStackBottom();
+    void* GetShadowStackTop(PInvokeTransitionFrame* pTransitionFrame);
+    void* InlineTryFastReversePInvoke_Wasm(size_t alignment);
+    void* ReversePInvokeAttachOrTrapThread_Wasm(size_t alignment);
+#endif // HOST_WASM
+
 #ifdef TARGET_X86
     void SetPendingRedirect(PCODE eip);
     bool CheckPendingRedirect(PCODE eip);
 #endif
-
-#ifdef HOST_WASM
-    void* GetShadowStackBottom();
-    void SetShadowStackBottom(void* pShadowStack);
-    void* GetShadowStackTop();
-    void SetShadowStackTop(void* pShadowStack);
-#endif
 };
 
 #ifndef __GCENV_BASE_INCLUDED__

diff --git a/src/coreclr/nativeaot/Runtime/thread.inl b/src/coreclr/nativeaot/Runtime/thread.inl
index 6b8892791405..cfd149898151 100644
--- a/src/coreclr/nativeaot/Runtime/thread.inl
+++ b/src/coreclr/nativeaot/Runtime/thread.inl
@@ -75,7 +75,9 @@ inline void Thread::SetDeferredTransitionFrame(PInvokeTransitionFrame* pTransiti
 {
     ASSERT(ThreadStore::GetCurrentThread() == this);
     ASSERT(Thread::IsCurrentThreadInCooperativeMode());
+#ifndef HOST_WASM
     ASSERT(!Thread::IsHijackTarget(pTransitionFrame->m_RIP));
+#endif // !HOST_WASM
     m_pDeferredTransitionFrame = pTransitionFrame;
 }
@@ -169,7 +171,9 @@ FORCEINLINE void Thread::InlineReversePInvokeReturn(ReversePInvokeFrame* pFrame)
 FORCEINLINE void Thread::InlinePInvoke(PInvokeTransitionFrame* pFrame)
 {
     ASSERT(!IsDoNotTriggerGcSet() || ThreadStore::IsTrapThreadsRequested());
+#ifndef HOST_WASM
     pFrame->m_pThread = this;
+#endif
     // set our mode to preemptive
     VolatileStoreWithoutBarrier(&m_pTransitionFrame, pFrame);
 }
@@ -191,6 +195,7 @@ FORCEINLINE bool Thread::InlineTryFastReversePInvoke(ReversePInvokeFrame* pFrame
     // remember the current transition frame, so it will be restored when we return from reverse pinvoke
     pFrame->m_savedPInvokeTransitionFrame = m_pTransitionFrame;
 
+#ifndef HOST_WASM
     // If the thread is already in cooperative mode, this is a bad transition that will be a fail fast unless we are in
     // a do not trigger mode. The exception to the rule allows us to have [UnmanagedCallersOnly] methods that are called via
     // the "restricted GC callouts" as well as from native, which is necessary because the methods are CCW vtable
@@ -210,6 +215,7 @@ FORCEINLINE bool Thread::InlineTryFastReversePInvoke(ReversePInvokeFrame* pFrame
 
     if (IsCurrentThreadInCooperativeMode())
         return false; // bad transition
+#endif // !HOST_WASM
 
     // this is an ordinary transition to managed code
     // GC threads should not do that
@@ -228,26 +234,3 @@ FORCEINLINE bool Thread::InlineTryFastReversePInvoke(ReversePInvokeFrame* pFrame
 
     return true;
 }
-
-#ifdef HOST_WASM
-FORCEINLINE void* Thread::GetShadowStackBottom()
-{
-    return m_pShadowStackBottom;
-}
-
-FORCEINLINE void Thread::SetShadowStackBottom(void *pShadowStack)
-{
-    ASSERT(m_pShadowStackBottom == nullptr);
-    m_pShadowStackBottom = pShadowStack;
-}
-
-FORCEINLINE void* Thread::GetShadowStackTop()
-{
-    return m_pShadowStackTop;
-}
-
-FORCEINLINE void Thread::SetShadowStackTop(void* pShadowStack)
-{
-    m_pShadowStackTop = pShadowStack;
-}
-#endif

diff --git a/src/coreclr/nativeaot/Runtime/threadstore.cpp b/src/coreclr/nativeaot/Runtime/threadstore.cpp
index 33d882f340fe..83a04b847f22 100644
--- a/src/coreclr/nativeaot/Runtime/threadstore.cpp
+++ b/src/coreclr/nativeaot/Runtime/threadstore.cpp
@@ -339,6 +339,7 @@ void ThreadStore::ResumeAllThreads(bool waitForGCEvent)
     }
 } // ResumeAllThreads
 
+#ifndef HOST_WASM
 void ThreadStore::InitiateThreadAbort(Thread* targetThread, Object * threadAbortException, bool doRudeAbort)
 {
     SuspendAllThreads(/* waitForGCEvent = */ false);
@@ -395,6 +396,7 @@ void ThreadStore::CancelThreadAbort(Thread* targetThread)
 
     ResumeAllThreads(/* waitForGCEvent = */ false);
 }
+#endif // !HOST_WASM
 
 EXTERN_C void* QCALLTYPE RhpGetCurrentThread()
 {

diff --git a/src/coreclr/nativeaot/Runtime/wasm/AllocFast.cpp b/src/coreclr/nativeaot/Runtime/wasm/AllocFast.cpp
index ccf690d80d13..6f1cae1f9e8c 100644
--- a/src/coreclr/nativeaot/Runtime/wasm/AllocFast.cpp
+++ b/src/coreclr/nativeaot/Runtime/wasm/AllocFast.cpp
@@ -34,7 +34,7 @@ extern "C" void* RhpGcAlloc(MethodTable* pEEType, uint32_t uFlags, uintptr_t num
 extern "C" void RhExceptionHandling_FailedAllocation(void* pShadowStack, MethodTable* pEEType, bool isOverflow);
 
 // Automatic finalization.
-extern "C" void RhpPInvoke(void* pShadowStack, PInvokeTransitionFrame* pFrame);
+extern "C" void RhpPInvoke(PInvokeTransitionFrame* pFrame);
 extern "C" void RhpPInvokeReturn(PInvokeTransitionFrame* pFrame);
 extern bool g_FinalizationRequestPending;
 void FinalizeFinalizableObjects();
@@ -42,10 +42,9 @@ void FinalizeFinalizableObjects();
 static Object* AllocateObject(void* pShadowStack, MethodTable* pEEType, uint32_t uFlags, uintptr_t numElements)
 {
     // Save the current shadow stack before calling into GC; we may need to scan it for live references.
-    PInvokeTransitionFrame frame;
+    PInvokeTransitionFrame* pFrame = (PInvokeTransitionFrame*)pShadowStack;
     Thread* pThread = ThreadStore::GetCurrentThread();
-    pThread->SetShadowStackTop(pShadowStack);
-    Object* obj = (Object*)RhpGcAlloc(pEEType, uFlags, numElements, &frame);
+    Object* obj = (Object*)RhpGcAlloc(pEEType, uFlags, numElements, pFrame);
 
 #ifndef FEATURE_WASM_MANAGED_THREADS
     if (g_FinalizationRequestPending
@@ -61,9 +60,9 @@ static Object* AllocateObject(void* pShadowStack, MethodTable* pEEType, uint32_t
         }
 
         // "FinalizeFinalizableObjects" runs in preemptive mode.
-        RhpPInvoke(pShadowStack, &frame);
+        RhpPInvoke(pFrame);
         FinalizeFinalizableObjects();
-        RhpPInvokeReturn(&frame);
+        RhpPInvokeReturn(pFrame);
 
         if (obj != nullptr)
         {

diff --git a/src/coreclr/nativeaot/Runtime/wasm/ExceptionHandling/ExceptionHandling.cpp b/src/coreclr/nativeaot/Runtime/wasm/ExceptionHandling/ExceptionHandling.cpp
index 5f16b68e02f9..189724d3e7ae 100644
--- a/src/coreclr/nativeaot/Runtime/wasm/ExceptionHandling/ExceptionHandling.cpp
+++ b/src/coreclr/nativeaot/Runtime/wasm/ExceptionHandling/ExceptionHandling.cpp
@@ -6,34 +6,22 @@
 
 #include "../wasm.h"
 
-struct SparseVirtualUnwindFrame
-{
-    SparseVirtualUnwindFrame* Prev;
-    void* UnwindTable;
-    size_t UnwindIndex;
-};
-
 // This variable is defined here in native code because:
 //  1) Unmanaged thread locals are currently much more efficient than managed ones.
 //  2) Push/pop functions do not need the shadow stack argument.
 //
-thread_local SparseVirtualUnwindFrame* t_pLastSparseVirtualUnwindFrame = nullptr;
+// TODO-LLVM-Cleanup: replace with PLATFORM_THREAD_LOCAL after merge.
+__thread SparseVirtualUnwindFrame* t_pLastSparseVirtualUnwindFrame = nullptr;
 
 FCIMPL_NO_SS(void, RhpPushSparseVirtualUnwindFrame, SparseVirtualUnwindFrame* pFrame, void* pUnwindTable, size_t unwindIndex)
 {
-    ASSERT(t_pLastSparseVirtualUnwindFrame < pFrame);
-    pFrame->Prev = t_pLastSparseVirtualUnwindFrame;
-    pFrame->UnwindTable = pUnwindTable;
-    pFrame->UnwindIndex = unwindIndex;
-
-    t_pLastSparseVirtualUnwindFrame = pFrame;
+    InlinePushSparseVirtualUnwindFrame(pFrame, pUnwindTable, unwindIndex);
 }
 FCIMPLEND
 
 FCIMPL_NO_SS(void, RhpPopSparseVirtualUnwindFrame)
 {
-    ASSERT(t_pLastSparseVirtualUnwindFrame != nullptr);
-    t_pLastSparseVirtualUnwindFrame = t_pLastSparseVirtualUnwindFrame->Prev;
+    InlinePopSparseVirtualUnwindFrame(t_pLastSparseVirtualUnwindFrame);
 }
 FCIMPLEND

diff --git a/src/coreclr/nativeaot/Runtime/wasm/GcStress.cpp b/src/coreclr/nativeaot/Runtime/wasm/GcStress.cpp
index 1581695bb3ef..16a065f8bde5 100644
--- a/src/coreclr/nativeaot/Runtime/wasm/GcStress.cpp
+++ b/src/coreclr/nativeaot/Runtime/wasm/GcStress.cpp
@@ -42,8 +42,22 @@ FCIMPL2(void*, RhpGcStressOnce, void* obj, uint8_t* pFlag)
         pThread->PushGCFrameRegistration(&gc);
     }
 
-    pThread->SetShadowStackTop(pShadowStack);
+    bool isCooperative = pThread->IsCurrentThreadInCooperativeMode();
+    if (isCooperative)
+    {
+        pThread->SetDeferredTransitionFrame((PInvokeTransitionFrame*)pShadowStack);
+    }
+    else // We can be called in preemptive mode - on an exit from a PInvoke.
+    {
+        ASSERT(obj == nullptr);
+        pThread->DeferTransitionFrame();
+        pThread->DisablePreemptiveMode();
+    }
+
     GCHeapUtilities::GetGCHeap()->GarbageCollect();
 
+    if (!isCooperative)
+    {
+        pThread->EnablePreemptiveMode();
+    }
+
     if (obj != nullptr)
     {

diff --git a/src/coreclr/nativeaot/Runtime/wasm/PInvoke.cpp b/src/coreclr/nativeaot/Runtime/wasm/PInvoke.cpp
index 3620ccd809a7..7de449d34189 100644
--- a/src/coreclr/nativeaot/Runtime/wasm/PInvoke.cpp
+++ b/src/coreclr/nativeaot/Runtime/wasm/PInvoke.cpp
@@ -4,9 +4,10 @@
 #include
 
 #include "common.h"
+#include "daccess.h"
 #include "CommonTypes.h"
 #include "CommonMacros.h"
-#include "daccess.h"
+#include "CommonMacros.inl"
 #include "PalRedhawkCommon.h"
 #include "PalRedhawk.h"
 #include "thread.h"
@@ -16,61 +17,140 @@
 
 #include "wasm.h"
 
-FCIMPL_NO_SS(void*, RhpGetOrInitShadowStackTop)
+void* Thread::GetShadowStackBottom()
 {
-    Thread* pCurThread = ThreadStore::RawGetCurrentThread();
+    ASSERT(m_pShadowStackBottom != nullptr);
+    return m_pShadowStackBottom;
+}
 
-    void* pShadowStack = pCurThread->GetShadowStackTop();
+void* Thread::GetShadowStackTop(PInvokeTransitionFrame* pTransitionFrame)
+{
+    void* pShadowStack;
+    if (pTransitionFrame == TOP_OF_STACK_MARKER) // TODO-LLVM: remove this check by replacing TOP_OF_STACK_MARKER with m_pShadowStackBottom.
+    {
+        pShadowStack = GetShadowStackBottom();
+    }
+    else
+    {
+        pShadowStack = pTransitionFrame;
+    }
+    ASSERT(pShadowStack != nullptr);
+    return pShadowStack;
+}
 
-    if (pShadowStack == nullptr)
+FORCEINLINE static void* GetAlignedShadowStackTop(Thread* pThread, PInvokeTransitionFrame* pTransitionFrame, size_t alignment)
+{
+    void* pShadowStack = pThread->GetShadowStackTop(pTransitionFrame);
+
+    // Note how this aligning means that the transition frame on exit (saved back into the current Thread) may differ
+    // from its value on entry. This is ok since that value will only grow in a bounded manner, such that calling
+    // an RPI method in a loop will never lead to runaway shadow stack usage.
+    if (alignment != 0)
     {
-        pShadowStack = malloc(1000000); // ~1MB.
-        if (pShadowStack == nullptr)
-        {
-            RhFailFast(); // Fatal OOM.
-        }
+        ASSERT(alignment == 8);
+        ZeroMemory(pShadowStack, 4);
+        pShadowStack = ALIGN_UP(pShadowStack, alignment);
+    }
+    return pShadowStack;
+}
+
+FORCEINLINE static ReversePInvokeFrame* GetReversePInvokeFrame(void* pShadowStack)
+{
+    return (ReversePInvokeFrame*)pShadowStack;
+}
+
+FORCEINLINE static SparseVirtualUnwindFrame* GetSparseVirtualUnwindFrame(ReversePInvokeFrame* pFrame)
+{
+    return (SparseVirtualUnwindFrame*)(pFrame + 1);
+}
 
-        pCurThread->SetShadowStackBottom(pShadowStack);
+FORCEINLINE void* Thread::InlineTryFastReversePInvoke_Wasm(size_t alignment)
+{
+    PInvokeTransitionFrame* pTransitionFrame = m_pTransitionFrame;
+    if (pTransitionFrame == nullptr)
+        return nullptr; // Uninitialized thread or illegal transition. Use the slow path.
+
+    ASSERT(!IsCurrentThreadInCooperativeMode());
+    void* pShadowStack = GetAlignedShadowStackTop(this, pTransitionFrame, alignment);
+    ReversePInvokeFrame* pFrame = GetReversePInvokeFrame(pShadowStack);
+    if (!InlineTryFastReversePInvoke(pFrame))
+        return nullptr; // Need to suspend the thread.
+
+    return pShadowStack;
+}
+
+NOINLINE void* Thread::ReversePInvokeAttachOrTrapThread_Wasm(size_t alignment)
+{
+    // This check is necessary to support GC callouts, see "InlineTryFastReversePInvoke".
+    // We move it to the slow path since GC callouts should be very rare on WASM.
+    if (IsDoNotTriggerGcSet())
+    {
+        // We expect this scenario only when the EE is stopped.
+        ASSERT(ThreadStore::IsTrapThreadsRequested());
+        return GetAlignedShadowStackTop(this, GetTransitionFrame(), alignment); // The suspender transition frame.
     }
 
+    // The shadow stack at this point may not have been allocated yet, so we need to use a local RPI frame.
+    ReversePInvokeFrame localFrame;
+    ReversePInvokeAttachOrTrapThread(&localFrame);
+
+    void* pShadowStack = GetAlignedShadowStackTop(this, localFrame.m_savedPInvokeTransitionFrame, alignment);
+    *GetReversePInvokeFrame(pShadowStack) = localFrame;
+    return pShadowStack;
+}
+
+FCIMPL_NO_SS(void*, RhpReversePInvoke, size_t alignment)
+{
+    Thread* pThread = ThreadStore::RawGetCurrentThread();
+    void* pShadowStack = pThread->InlineTryFastReversePInvoke_Wasm(alignment);
+    if (pShadowStack == nullptr)
+        pShadowStack = pThread->ReversePInvokeAttachOrTrapThread_Wasm(alignment);
+
     return pShadowStack;
 }
 FCIMPLEND
 
-EXTERN_C NOINLINE void FASTCALL RhpReversePInvokeAttachOrTrapThread2(ReversePInvokeFrame* pFrame);
+FCIMPL_NO_SS(void*, RhpReversePInvokeAndPushSparseVirtualUnwindFrame, size_t alignment, void* pUnwindTable, size_t unwindIndex)
+{
+    Thread* pThread = ThreadStore::RawGetCurrentThread();
+    void* pShadowStack = pThread->InlineTryFastReversePInvoke_Wasm(alignment);
+    if (pShadowStack == nullptr)
+        pShadowStack = pThread->ReversePInvokeAttachOrTrapThread_Wasm(alignment);
+
+    SparseVirtualUnwindFrame* pSparseVirtualUnwindFrame = GetSparseVirtualUnwindFrame(GetReversePInvokeFrame(pShadowStack));
+    InlinePushSparseVirtualUnwindFrame(pSparseVirtualUnwindFrame, pUnwindTable, unwindIndex);
+
+    return pShadowStack;
+}
+FCIMPLEND
 
-FCIMPL1(void, RhpReversePInvoke, ReversePInvokeFrame* pFrame)
+FCIMPL_NO_SS(void, RhpReversePInvokeReturn, ReversePInvokeFrame* pFrame)
 {
-    Thread* pCurThread = ThreadStore::RawGetCurrentThread();
-    pFrame->m_savedThread = pCurThread;
-    if (pCurThread->InlineTryFastReversePInvoke(pFrame))
-        return;
-
-    // The slow path may invoke runtime initialization, which runs managed code.
-    pCurThread->SetShadowStackTop(pShadowStack);
-    RhpReversePInvokeAttachOrTrapThread2(pFrame);
+    ThreadStore::RawGetCurrentThread()->InlineReversePInvokeReturn(pFrame);
 }
 FCIMPLEND
 
-FCIMPL_NO_SS(void, RhpReversePInvokeReturn, void* pPreviousShadowStackTop, ReversePInvokeFrame* pFrame)
+FCIMPL_NO_SS(void, RhpReversePInvokeReturnAndPopSparseVirtualUnwindFrame, ReversePInvokeFrame* pFrame)
 {
-    pFrame->m_savedThread->InlineReversePInvokeReturn(pFrame);
-    pFrame->m_savedThread->SetShadowStackTop(pPreviousShadowStackTop);
+    InlinePopSparseVirtualUnwindFrame(GetSparseVirtualUnwindFrame(pFrame));
+    ThreadStore::RawGetCurrentThread()->InlineReversePInvokeReturn(pFrame);
 }
 FCIMPLEND
 
-FCIMPL1(void, RhpPInvoke, PInvokeTransitionFrame* pFrame)
+FCIMPL0(void, RhpPInvoke)
 {
-    Thread* pCurThread = ThreadStore::RawGetCurrentThread();
-    pCurThread->InlinePInvoke(pFrame);
-    pCurThread->SetShadowStackTop(pShadowStack);
+    PInvokeTransitionFrame* pFrame = (PInvokeTransitionFrame*)pShadowStack;
+    ThreadStore::RawGetCurrentThread()->InlinePInvoke(pFrame);
 }
 FCIMPLEND
 
-FCIMPL_NO_SS(void, RhpPInvokeReturn, PInvokeTransitionFrame* pFrame)
+FCIMPL0(void, RhpPInvokeReturn)
 {
-    //reenter cooperative mode
-    pFrame->m_pThread->InlinePInvokeReturn(pFrame);
+    // WASM TLS is cheap:
+    //  1. Without threading, it is free.
+    //  2. With threading, it costs a single additional load (of the TLS base global).
+    // So not caching the current thread in the PI frame doesn't cost us anything.
+    PInvokeTransitionFrame* pFrame = (PInvokeTransitionFrame*)pShadowStack;
+    ThreadStore::RawGetCurrentThread()->InlinePInvokeReturn(pFrame);
 }
 FCIMPLEND

diff --git a/src/coreclr/nativeaot/Runtime/wasm/wasm.h b/src/coreclr/nativeaot/Runtime/wasm/wasm.h
index bd86484b1b25..410d8510a4d6 100644
--- a/src/coreclr/nativeaot/Runtime/wasm/wasm.h
+++ b/src/coreclr/nativeaot/Runtime/wasm/wasm.h
@@ -6,3 +6,29 @@
 // serves as simply a marker for such FCalls.
 //
 #define FCIMPL_NO_SS(_rettype, _name, ...) extern "C" _rettype _name(__VA_ARGS__) {
+
+struct SparseVirtualUnwindFrame
+{
+    SparseVirtualUnwindFrame* Prev;
+    void* UnwindTable;
+    size_t UnwindIndex;
+};
+
+// TODO-LLVM-Cleanup: replace with PLATFORM_THREAD_LOCAL after merge.
+extern __thread SparseVirtualUnwindFrame* t_pLastSparseVirtualUnwindFrame;
+
+FORCEINLINE void InlinePushSparseVirtualUnwindFrame(SparseVirtualUnwindFrame* pFrame, void* pUnwindTable, size_t unwindIndex)
+{
+    ASSERT(t_pLastSparseVirtualUnwindFrame < pFrame);
+    pFrame->Prev = t_pLastSparseVirtualUnwindFrame;
+    pFrame->UnwindTable = pUnwindTable;
+    pFrame->UnwindIndex = unwindIndex;
+
+    t_pLastSparseVirtualUnwindFrame = pFrame;
+}
+
+FORCEINLINE void InlinePopSparseVirtualUnwindFrame(SparseVirtualUnwindFrame* pFrame)
+{
+    ASSERT(t_pLastSparseVirtualUnwindFrame != nullptr);
+    t_pLastSparseVirtualUnwindFrame = pFrame->Prev;
+}

diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoHelpFunc.cs b/src/coreclr/tools/Common/JitInterface/CorInfoHelpFunc.cs
index d11a739ae0d1..5a7d6692e5b7 100644
--- a/src/coreclr/tools/Common/JitInterface/CorInfoHelpFunc.cs
+++ b/src/coreclr/tools/Common/JitInterface/CorInfoHelpFunc.cs
@@ -285,14 +285,14 @@ which is the right helper to use to allocate an object of a given type. */
         CORINFO_HELP_VALIDATE_INDIRECT_CALL, // CFG: Validate function pointer
         CORINFO_HELP_DISPATCH_INDIRECT_CALL, // CFG: Validate and dispatch to pointer
 
-        CORINFO_HELP_LLVM_GET_OR_INIT_SHADOW_STACK_TOP,
         CORINFO_HELP_LLVM_EH_CATCH,
         CORINFO_HELP_LLVM_EH_POP_UNWOUND_VIRTUAL_FRAMES,
         CORINFO_HELP_LLVM_EH_PUSH_VIRTUAL_UNWIND_FRAME,
+        CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_ENTER_AND_PUSH_VIRTUAL_UNWIND_FRAME,
         CORINFO_HELP_LLVM_EH_POP_VIRTUAL_UNWIND_FRAME,
+        CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_EXIT_AND_POP_VIRTUAL_UNWIND_FRAME,
         CORINFO_HELP_LLVM_EH_UNHANDLED_EXCEPTION,
         CORINFO_HELP_LLVM_RESOLVE_INTERFACE_CALL_TARGET,
-        CORINFO_HELP_LLVM_GET_EXTERNAL_CALL_TARGET,
         CORINFO_HELP_LLVM_STRESS_GC,
 
         CORINFO_HELP_COUNT,

diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs
index d89f39a66abd..7658d65a7644 100644
--- a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs
+++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs
@@ -29,7 +29,7 @@ internal unsafe partial class CorInfoImpl
         private const CORINFO_RUNTIME_ABI TargetABI = CORINFO_RUNTIME_ABI.CORINFO_NATIVEAOT_ABI;
 
         private uint OffsetOfDelegateFirstTarget => (uint)(4 * PointerSize); // Delegate._functionPointer
-        private int SizeOfReversePInvokeTransitionFrame => 2 * PointerSize;
+        private int SizeOfReversePInvokeTransitionFrame => (_compilation.TypeSystemContext.Target.IsWasm ? 1 : 2) * PointerSize;
 
         private RyuJitCompilation _compilation;
         private MethodDebugInformation _debugInfo;
@@ -789,9 +789,6 @@ private ISymbolNode GetHelperFtnUncached(CorInfoHelpFunc ftnNum)
                 case CorInfoHelpFunc.CORINFO_HELP_DISPATCH_INDIRECT_CALL:
                     return _compilation.NodeFactory.ExternIndirectFunctionSymbol("__guard_dispatch_icall_fptr");
 
-                case CorInfoHelpFunc.CORINFO_HELP_LLVM_GET_OR_INIT_SHADOW_STACK_TOP:
-                    mangledName = "RhpGetOrInitShadowStackTop";
-                    break;
                 case CorInfoHelpFunc.CORINFO_HELP_LLVM_EH_CATCH:
                     mangledName = "RhpHandleExceptionWasmCatch";
                     break;
@@ -804,9 +801,15 @@ private ISymbolNode GetHelperFtnUncached(CorInfoHelpFunc ftnNum)
                 case CorInfoHelpFunc.CORINFO_HELP_LLVM_EH_PUSH_VIRTUAL_UNWIND_FRAME:
                     mangledName = "RhpPushSparseVirtualUnwindFrame";
                     break;
+                case CorInfoHelpFunc.CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_ENTER_AND_PUSH_VIRTUAL_UNWIND_FRAME:
+                    mangledName = "RhpReversePInvokeAndPushSparseVirtualUnwindFrame";
+                    break;
                 case CorInfoHelpFunc.CORINFO_HELP_LLVM_EH_POP_VIRTUAL_UNWIND_FRAME:
                     mangledName = "RhpPopSparseVirtualUnwindFrame";
                     break;
+                case CorInfoHelpFunc.CORINFO_HELP_LLVM_EH_REVERSE_PINVOKE_EXIT_AND_POP_VIRTUAL_UNWIND_FRAME:
+                    mangledName = "RhpReversePInvokeReturnAndPopSparseVirtualUnwindFrame";
+                    break;
                 case CorInfoHelpFunc.CORINFO_HELP_LLVM_EH_UNHANDLED_EXCEPTION:
                     mangledName = "RhpHandleUnhandledException";
                     break;
@@ -2028,8 +2031,7 @@ private int SizeOfPInvokeTransitionFrame
         {
             if (_compilation.TypeSystemContext.Target.IsWasm)
             {
-                // Only m_pThread used.
-                return this.PointerSize;
+                return 0; // Shadow stack top pointer used as the transition frame.
             }
 
             // struct PInvokeTransitionFrame:
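
For reference, the combined-helper call shape this patch produces can be sketched in self-contained C++. The exported helper names (RhpReversePInvokeAndPushSparseVirtualUnwindFrame, RhpReversePInvokeReturnAndPopSparseVirtualUnwindFrame) and the frame layout (RPI frame at shadow stack offset zero, sparse virtual unwind frame immediately after it) come from the patch itself; the stub helper bodies and the names g_shadowStack, g_unwindTable and ManagedCallback below are hypothetical stand-ins so the sketch compiles and runs, not runtime code:

#include <cstdio>
#include <cstddef>
#include <cstdint>

struct ReversePInvokeFrame
{
    void* m_savedPInvokeTransitionFrame; // One pointer on WASM (no m_savedThread).
};

struct SparseVirtualUnwindFrame
{
    SparseVirtualUnwindFrame* Prev;
    void* UnwindTable;
    size_t UnwindIndex;
};

alignas(16) static uint8_t g_shadowStack[1024]; // Stand-in for the thread's shadow stack.
static int g_unwindTable;                       // Stand-in for the method's EH info symbol.

// Stand-in for RhpReversePInvokeAndPushSparseVirtualUnwindFrame: returns the shadow
// stack top with the RPI frame at offset zero and links the sparse virtual unwind
// frame allocated immediately after it.
static void* ReversePInvokeEnterAndPush(size_t alignment, void* pUnwindTable, size_t unwindIndex)
{
    (void)alignment; // The real helper also aligns the shadow stack top when alignment != 0.
    void* pShadowStack = g_shadowStack;
    auto* pUnwindFrame = (SparseVirtualUnwindFrame*)((ReversePInvokeFrame*)pShadowStack + 1);
    *pUnwindFrame = {nullptr, pUnwindTable, unwindIndex};
    std::printf("enter: RPI frame at offset 0, unwind index %zu pushed\n", unwindIndex);
    return pShadowStack;
}

// Stand-in for RhpReversePInvokeReturnAndPopSparseVirtualUnwindFrame.
static void ReversePInvokeExitAndPop(ReversePInvokeFrame* pFrame)
{
    auto* pUnwindFrame = (SparseVirtualUnwindFrame*)(pFrame + 1);
    std::printf("exit: unwind index %zu popped\n", pUnwindFrame->UnwindIndex);
}

// The shape an RPI method with EH now compiles down to: one combined helper call
// in the prolog and one in the epilog (previously: separate get-shadow-stack,
// RPI enter/exit and EH frame push/pop calls).
static int ManagedCallback(int arg)
{
    void* pShadowStack = ReversePInvokeEnterAndPush(/* alignment */ 0, &g_unwindTable, /* unwindIndex */ 0);
    int result = arg * 2; // ...method body; pShadowStack is passed to managed callees...
    ReversePInvokeExitAndPop((ReversePInvokeFrame*)pShadowStack);
    return result;
}

int main()
{
    return ManagedCallback(21) == 42 ? 0 : 1;
}

The real helpers additionally attach the thread (or trap it for GC) and handle shadow stack alignment on the slow path; the point of the sketch is only the one-call prolog/epilog structure described in the commit message.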