diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 86eb405da659cc..a03fe42668f15a 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -123,6 +123,22 @@ _PyEval_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwfl return tstate->interp->eval_frame(tstate, frame, throwflag); } +#ifdef _Py_TIER2 +#ifdef _Py_JIT +_Py_CODEUNIT *_Py_LazyJitTrampoline( + struct _PyExecutorObject *current_executor, _PyInterpreterFrame *frame, + _PyStackRef *stack_pointer, PyThreadState *tstate +); +#else +_Py_CODEUNIT *_PyTier2Interpreter( + struct _PyExecutorObject *current_executor, _PyInterpreterFrame *frame, + _PyStackRef *stack_pointer, PyThreadState *tstate +); +#endif +#endif + +extern _PyJitEntryFuncPtr _Py_jit_entry; + extern PyObject* _PyEval_Vector(PyThreadState *tstate, PyFunctionObject *func, PyObject *locals, diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 2cb1b104681300..fa9568ab4d0e85 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -765,6 +765,7 @@ struct _Py_unique_id_pool { #endif +typedef _Py_CODEUNIT *(*_PyJitEntryFuncPtr)(struct _PyExecutorObject *exec, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate); /* PyInterpreterState holds the global state for one of the runtime's interpreters. Typically the initial (main) interpreter is the only one. 
diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 1571e19a35032e..9f930f2107ed5e 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -82,7 +82,6 @@ typedef struct _PyExecutorObject { uint32_t code_size; size_t jit_size; void *jit_code; - void *jit_side_entry; _PyExitData exits[1]; } _PyExecutorObject; diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-08-19-16-07-07.gh-issue-137959.EWj0RZ.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-08-19-16-07-07.gh-issue-137959.EWj0RZ.rst new file mode 100644 index 00000000000000..d1b2650fee6c9a --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-08-19-16-07-07.gh-issue-137959.EWj0RZ.rst @@ -0,0 +1,2 @@ +Replace the shim code added to every piece of jitted code with a single +trampoline function. diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 21bae689320eae..7f89c312b9a815 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2971,7 +2971,7 @@ dummy_func( assert(tstate->current_executor == NULL); assert(executor != tstate->interp->cold_executor); tstate->jit_exit = NULL; - GOTO_TIER_TWO(executor); + TIER1_TO_TIER2(executor); } } else { @@ -3037,7 +3037,7 @@ dummy_func( } assert(executor != tstate->interp->cold_executor); tstate->jit_exit = NULL; - GOTO_TIER_TWO(executor); + TIER1_TO_TIER2(executor); #else Py_FatalError("ENTER_EXECUTOR is not supported in this build"); #endif /* _Py_TIER2 */ @@ -5257,7 +5257,7 @@ dummy_func( } #endif tstate->jit_exit = exit; - GOTO_TIER_TWO(exit->executor); + TIER2_TO_TIER2(exit->executor); } tier2 op(_CHECK_VALIDITY, (--)) { @@ -5353,7 +5353,7 @@ dummy_func( tier2 op(_START_EXECUTOR, (executor/4 --)) { #ifndef _Py_JIT - current_executor = (_PyExecutorObject*)executor; + assert(current_executor == (_PyExecutorObject*)executor); #endif assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor); tstate->current_executor = (PyObject *)executor; @@ -5434,7 +5434,7 
@@ dummy_func( } assert(tstate->jit_exit == exit); exit->executor = executor; - GOTO_TIER_TWO(exit->executor); + TIER2_TO_TIER2(exit->executor); } label(pop_2_error) { diff --git a/Python/ceval.c b/Python/ceval.c index b8c1dd3e3bf74b..578c5d2a8b1420 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -275,7 +275,8 @@ maybe_lltrace_resume_frame(_PyInterpreterFrame *frame, PyObject *globals) } int r = PyDict_Contains(globals, &_Py_ID(__lltrace__)); if (r < 0) { - return -1; + PyErr_Clear(); + return 0; } int lltrace = r * 5; // Levels 1-4 only trace uops if (!lltrace) { @@ -1109,11 +1110,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #endif } -#if defined(_Py_TIER2) && !defined(_Py_JIT) - /* Tier 2 interpreter state */ - _PyExecutorObject *current_executor = NULL; - const _PyUOpInstruction *next_uop = NULL; -#endif #if Py_TAIL_CALL_INTERP # if Py_STATS return _TAIL_CALL_start_frame(frame, NULL, tstate, NULL, 0, lastopcode); @@ -1126,14 +1122,41 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #endif +early_exit: + assert(_PyErr_Occurred(tstate)); + _Py_LeaveRecursiveCallPy(tstate); + assert(frame->owner != FRAME_OWNED_BY_INTERPRETER); + // GH-99729: We need to unlink the frame *before* clearing it: + _PyInterpreterFrame *dying = frame; + frame = tstate->current_frame = dying->previous; + _PyEval_FrameClearAndPop(tstate, dying); + frame->return_offset = 0; + assert(frame->owner == FRAME_OWNED_BY_INTERPRETER); + /* Restore previous frame and exit */ + tstate->current_frame = frame->previous; + return NULL; +} #ifdef _Py_TIER2 - -// Tier 2 is also here! 
-enter_tier_two: - #ifdef _Py_JIT - assert(0); +_PyJitEntryFuncPtr _Py_jit_entry = _Py_LazyJitTrampoline; #else +_PyJitEntryFuncPtr _Py_jit_entry = _PyTier2Interpreter; +#endif +#endif + +#if defined(_Py_TIER2) && !defined(_Py_JIT) + +_Py_CODEUNIT * +_PyTier2Interpreter( + _PyExecutorObject *current_executor, _PyInterpreterFrame *frame, + _PyStackRef *stack_pointer, PyThreadState *tstate +) { + const _PyUOpInstruction *next_uop; + int oparg; +tier2_start: + + next_uop = current_executor->trace; + assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); #undef LOAD_IP #define LOAD_IP(UNUSED) (void)0 @@ -1151,7 +1174,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #undef ENABLE_SPECIALIZATION_FT #define ENABLE_SPECIALIZATION_FT 0 - ; // dummy statement after a label, before a declaration uint16_t uopcode; #ifdef Py_STATS int lastuop = 0; @@ -1225,24 +1247,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int next_uop = current_executor->trace + target; goto tier2_dispatch; -#endif // _Py_JIT - +} #endif // _Py_TIER2 -early_exit: - assert(_PyErr_Occurred(tstate)); - _Py_LeaveRecursiveCallPy(tstate); - assert(frame->owner != FRAME_OWNED_BY_INTERPRETER); - // GH-99729: We need to unlink the frame *before* clearing it: - _PyInterpreterFrame *dying = frame; - frame = tstate->current_frame = dying->previous; - _PyEval_FrameClearAndPop(tstate, dying); - frame->return_offset = 0; - assert(frame->owner == FRAME_OWNED_BY_INTERPRETER); - /* Restore previous frame and exit */ - tstate->current_frame = frame->previous; - return NULL; -} #ifdef DO_NOT_OPTIMIZE_INTERP_LOOP # pragma optimize("", on) diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index ddbcf2aa1bab02..64ca7716fdbdee 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -133,9 +133,6 @@ do { \ _PyFrame_SetStackPointer(frame, stack_pointer); \ int lltrace = maybe_lltrace_resume_frame(frame, GLOBALS()); \ 
stack_pointer = _PyFrame_GetStackPointer(frame); \ - if (lltrace < 0) { \ - JUMP_TO_LABEL(exit_unwind); \ - } \ frame->lltrace = lltrace; \ } while (0) #else @@ -354,16 +351,10 @@ _PyFrame_SetStackPointer(frame, stack_pointer) /* Tier-switching macros. */ -#ifdef _Py_JIT -#define GOTO_TIER_TWO(EXECUTOR) \ +#define TIER1_TO_TIER2(EXECUTOR) \ do { \ OPT_STAT_INC(traces_executed); \ - _PyExecutorObject *_executor = (EXECUTOR); \ - jit_func jitted = _executor->jit_code; \ - /* Keep the shim frame alive via the executor: */ \ - Py_INCREF(_executor); \ - next_instr = jitted(frame, stack_pointer, tstate); \ - Py_DECREF(_executor); \ + next_instr = _Py_jit_entry((EXECUTOR), frame, stack_pointer, tstate); \ frame = tstate->current_frame; \ stack_pointer = _PyFrame_GetStackPointer(frame); \ if (next_instr == NULL) { \ @@ -372,31 +363,21 @@ do { \ } \ DISPATCH(); \ } while (0) -#else -#define GOTO_TIER_TWO(EXECUTOR) \ -do { \ - OPT_STAT_INC(traces_executed); \ - _PyExecutorObject *_executor = (EXECUTOR); \ - next_uop = _executor->trace; \ - assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); \ - goto enter_tier_two; \ + +#define TIER2_TO_TIER2(EXECUTOR) \ +do { \ + OPT_STAT_INC(traces_executed); \ + current_executor = (EXECUTOR); \ + goto tier2_start; \ } while (0) -#endif #define GOTO_TIER_ONE(TARGET) \ do \ { \ tstate->current_executor = NULL; \ - next_instr = (TARGET); \ OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); \ _PyFrame_SetStackPointer(frame, stack_pointer); \ - stack_pointer = _PyFrame_GetStackPointer(frame); \ - if (next_instr == NULL) \ - { \ - next_instr = frame->instr_ptr; \ - goto error; \ - } \ - DISPATCH(); \ + return TARGET; \ } while (0) #define CURRENT_OPARG() (next_uop[-1].oparg) diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 182289922c7637..3dcb2decc43737 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -7122,7 +7122,7 @@ } #endif tstate->jit_exit = exit; - 
GOTO_TIER_TWO(exit->executor); + TIER2_TO_TIER2(exit->executor); break; } @@ -7400,7 +7400,7 @@ case _START_EXECUTOR: { PyObject *executor = (PyObject *)CURRENT_OPERAND0(); #ifndef _Py_JIT - current_executor = (_PyExecutorObject*)executor; + assert(current_executor == (_PyExecutorObject*)executor); #endif assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor); tstate->current_executor = (PyObject *)executor; @@ -7503,7 +7503,7 @@ } assert(tstate->jit_exit == exit); exit->executor = executor; - GOTO_TIER_TWO(exit->executor); + TIER2_TO_TIER2(exit->executor); break; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index b6d183a3e63a9c..7547eaad125370 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -5493,7 +5493,7 @@ } assert(executor != tstate->interp->cold_executor); tstate->jit_exit = NULL; - GOTO_TIER_TWO(executor); + TIER1_TO_TIER2(executor); #else Py_FatalError("ENTER_EXECUTOR is not supported in this build"); #endif /* _Py_TIER2 */ @@ -7667,7 +7667,7 @@ assert(tstate->current_executor == NULL); assert(executor != tstate->interp->cold_executor); tstate->jit_exit = NULL; - GOTO_TIER_TWO(executor); + TIER1_TO_TIER2(executor); } } else { diff --git a/Python/jit.c b/Python/jit.c index bd6a5e17a4164f..01ec9c1fa6e8a9 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -494,10 +494,6 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz size_t code_size = 0; size_t data_size = 0; jit_state state = {0}; - group = &shim; - code_size += group->code_size; - data_size += group->data_size; - combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); for (size_t i = 0; i < length; i++) { const _PyUOpInstruction *instruction = &trace[i]; group = &stencil_groups[instruction->opcode]; @@ -539,13 +535,6 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz unsigned char *code = memory; state.trampolines.mem = memory + code_size; unsigned 
char *data = memory + code_size + state.trampolines.size + code_padding; - // Compile the shim, which handles converting between the native - // calling convention and the calling convention used by jitted code - // (which may be different for efficiency reasons). - group = &shim; - group->emit(code, data, executor, NULL, &state); - code += group->code_size; - data += group->data_size; assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT); for (size_t i = 0; i < length; i++) { const _PyUOpInstruction *instruction = &trace[i]; @@ -566,11 +555,75 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz return -1; } executor->jit_code = memory; - executor->jit_side_entry = memory + shim.code_size; executor->jit_size = total_size; return 0; } +/* One-off compilation of the jit entry trampoline + * We compile this once only as it is effectively a normal + * function, but we need to use the JIT because it needs + * to understand the jit-specific calling convention. 
+ */ +static _PyJitEntryFuncPtr +compile_trampoline(void) +{ + _PyExecutorObject dummy; + const StencilGroup *group; + size_t code_size = 0; + size_t data_size = 0; + jit_state state = {0}; + group = &trampoline; + code_size += group->code_size; + data_size += group->data_size; + combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); + // Round up to the nearest page: + size_t page_size = get_page_size(); + assert((page_size & (page_size - 1)) == 0); + size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1)); + size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size) & (page_size - 1)); + size_t total_size = code_size + state.trampolines.size + code_padding + data_size + padding; + unsigned char *memory = jit_alloc(total_size); + if (memory == NULL) { + return NULL; + } + unsigned char *code = memory; + state.trampolines.mem = memory + code_size; + unsigned char *data = memory + code_size + state.trampolines.size + code_padding; + // Compile the trampoline, which handles converting between the native + // calling convention and the calling convention used by jitted code + // (which may be different for efficiency reasons). 
+ group = &trampoline; + group->emit(code, data, &dummy, NULL, &state); + code += group->code_size; + data += group->data_size; + assert(code == memory + code_size); + assert(data == memory + code_size + state.trampolines.size + code_padding + data_size); + if (mark_executable(memory, total_size)) { + jit_free(memory, total_size); + return NULL; + } + return (_PyJitEntryFuncPtr)memory; +} + +static PyMutex lazy_jit_mutex = { 0 }; + +_Py_CODEUNIT * +_Py_LazyJitTrampoline( + _PyExecutorObject *executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate +) { + PyMutex_Lock(&lazy_jit_mutex); + if (_Py_jit_entry == _Py_LazyJitTrampoline) { + _PyJitEntryFuncPtr trampoline = compile_trampoline(); + if (trampoline == NULL) { + PyMutex_Unlock(&lazy_jit_mutex); + Py_FatalError("Cannot allocate core JIT code"); + } + _Py_jit_entry = trampoline; + } + PyMutex_Unlock(&lazy_jit_mutex); + return _Py_jit_entry(executor, frame, stack_pointer, tstate); +} + void _PyJIT_Free(_PyExecutorObject *executor) { @@ -578,7 +631,6 @@ _PyJIT_Free(_PyExecutorObject *executor) size_t size = executor->jit_size; if (memory) { executor->jit_code = NULL; - executor->jit_side_entry = NULL; executor->jit_size = 0; if (jit_free(memory, size)) { PyErr_FormatUnraisable("Exception ignored while " diff --git a/Python/optimizer.c b/Python/optimizer.c index 1d899ee8971368..bae5cfa50ead58 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1238,7 +1238,6 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil #endif #ifdef _Py_JIT executor->jit_code = NULL; - executor->jit_side_entry = NULL; executor->jit_size = 0; // This is initialized to true so we can prevent the executor // from being immediately detected as cold and invalidated. 
@@ -1490,7 +1489,6 @@ _PyExecutor_GetColdExecutor(void) ((_PyUOpInstruction *)cold->trace)->opcode = _COLD_EXIT; #ifdef _Py_JIT cold->jit_code = NULL; - cold->jit_side_entry = NULL; cold->jit_size = 0; // This is initialized to true so we can prevent the executor // from being immediately detected as cold and invalidated. diff --git a/Python/pystate.c b/Python/pystate.c index a2914b3718eea2..9091057f6f62cf 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -494,6 +494,11 @@ free_interpreter(PyInterpreterState *interp) static inline int check_interpreter_whence(long); #endif +extern _Py_CODEUNIT * +_Py_LazyJitTrampoline( + struct _PyExecutorObject *exec, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate +); + /* Get the interpreter state to a minimal consistent state. Further init happens in pylifecycle.c before it can be used. All fields not initialized here are expected to be zeroed out, diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 3883671e92aa39..7e261c9f8e297f 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -191,8 +191,8 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: with tempfile.TemporaryDirectory() as tempdir: work = pathlib.Path(tempdir).resolve() async with asyncio.TaskGroup() as group: - coro = self._compile("shim", TOOLS_JIT / "shim.c", work) - tasks.append(group.create_task(coro, name="shim")) + coro = self._compile("trampoline", TOOLS_JIT / "trampoline.c", work) + tasks.append(group.create_task(coro, name="trampoline")) template = TOOLS_JIT_TEMPLATE_C.read_text() for case, opname in cases_and_opnames: # Write out a copy of the template with *only* this case diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 090b52660f009c..4f373011ebf079 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -22,11 +22,11 @@ def _dump_footer( yield " symbol_mask trampoline_mask;" yield "} StencilGroup;" yield "" - yield f"static const StencilGroup shim = 
{groups['shim'].as_c('shim')};" + yield f"static const StencilGroup trampoline = {groups['trampoline'].as_c('trampoline')};" yield "" yield "static const StencilGroup stencil_groups[MAX_UOP_ID + 1] = {" for opname, group in sorted(groups.items()): - if opname == "shim": + if opname == "trampoline": continue yield f" [{opname}] = {group.as_c(opname)}," yield "};" diff --git a/Tools/jit/shim.c b/Tools/jit/shim.c deleted file mode 100644 index 0c7feb746c9679..00000000000000 --- a/Tools/jit/shim.c +++ /dev/null @@ -1,15 +0,0 @@ -#include "Python.h" - -#include "pycore_ceval.h" -#include "pycore_frame.h" -#include "pycore_jit.h" - -#include "jit.h" - -_Py_CODEUNIT * -_JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate) -{ - // Note that this is *not* a tail call: - DECLARE_TARGET(_JIT_CONTINUE); - return _JIT_CONTINUE(frame, stack_pointer, tstate); -} diff --git a/Tools/jit/template.c b/Tools/jit/template.c index 7bbc44a08a0a98..8f71010a1aff58 100644 --- a/Tools/jit/template.c +++ b/Tools/jit/template.c @@ -46,12 +46,12 @@ #undef CURRENT_TARGET #define CURRENT_TARGET() (_target) -#undef GOTO_TIER_TWO -#define GOTO_TIER_TWO(EXECUTOR) \ +#undef TIER2_TO_TIER2 +#define TIER2_TO_TIER2(EXECUTOR) \ do { \ OPT_STAT_INC(traces_executed); \ _PyExecutorObject *_executor = (EXECUTOR); \ - jit_func_preserve_none jitted = _executor->jit_side_entry; \ + jit_func_preserve_none jitted = _executor->jit_code; \ __attribute__((musttail)) return jitted(frame, stack_pointer, tstate); \ } while (0) diff --git a/Tools/jit/trampoline.c b/Tools/jit/trampoline.c new file mode 100644 index 00000000000000..79d6af97961fc9 --- /dev/null +++ b/Tools/jit/trampoline.c @@ -0,0 +1,16 @@ +#include "Python.h" + +#include "pycore_ceval.h" +#include "pycore_frame.h" +#include "pycore_jit.h" + +#include "jit.h" + +_Py_CODEUNIT * +_JIT_ENTRY( + _PyExecutorObject *exec, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate +) { + typedef 
DECLARE_TARGET((*jit_func)); + jit_func jitted = (jit_func)exec->jit_code; + return jitted(frame, stack_pointer, tstate); +}