TechPenguineer
diff --git a/‎.github/workflows/jit.yml‎
Lines changed: 14 additions & 12 deletions b/‎.github/workflows/jit.yml‎
Lines changed: 14 additions & 12 deletions
diff --git a/‎Include/cpython/pystats.h‎
Lines changed: 2 additions & 0 deletions b/‎Include/cpython/pystats.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Include/internal/pycore_backoff.h‎
Lines changed: 15 additions & 2 deletions b/‎Include/internal/pycore_backoff.h‎
Lines changed: 15 additions & 2 deletions
diff --git a/‎Include/internal/pycore_ceval.h‎
Lines changed: 2 additions & 0 deletions b/‎Include/internal/pycore_ceval.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Include/internal/pycore_interp_structs.h‎
Lines changed: 1 addition & 3 deletions b/‎Include/internal/pycore_interp_structs.h‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎Include/internal/pycore_opcode_metadata.h‎
Lines changed: 39 additions & 32 deletions b/‎Include/internal/pycore_opcode_metadata.h‎
Lines changed: 39 additions & 32 deletions
diff --git a/‎Include/internal/pycore_optimizer.h‎
Lines changed: 25 additions & 16 deletions b/‎Include/internal/pycore_optimizer.h‎
Lines changed: 25 additions & 16 deletions
diff --git a/‎Include/internal/pycore_tstate.h‎
Lines changed: 37 additions & 2 deletions b/‎Include/internal/pycore_tstate.h‎
Lines changed: 37 additions & 2 deletions
diff --git a/‎Include/internal/pycore_uop.h‎
Lines changed: 10 additions & 2 deletions b/‎Include/internal/pycore_uop.h‎
Lines changed: 10 additions & 2 deletions
@@ -57,9 +57,10 @@ jobs:
       fail-fast: false
       matrix:
         target:
-          - i686-pc-windows-msvc/msvc
-          - x86_64-pc-windows-msvc/msvc
-          - aarch64-pc-windows-msvc/msvc
+# To re-enable later when we support these.
+#          - i686-pc-windows-msvc/msvc
+#          - x86_64-pc-windows-msvc/msvc
+#          - aarch64-pc-windows-msvc/msvc
           - x86_64-apple-darwin/clang
           - aarch64-apple-darwin/clang
           - x86_64-unknown-linux-gnu/gcc
@@ -70,15 +71,16 @@ jobs:
         llvm:
           - 21
         include:
-          - target: i686-pc-windows-msvc/msvc
-            architecture: Win32
-            runner: windows-2022
-          - target: x86_64-pc-windows-msvc/msvc
-            architecture: x64
-            runner: windows-2022
-          - target: aarch64-pc-windows-msvc/msvc
-            architecture: ARM64
-            runner: windows-11-arm
+# To re-enable later when we support these.
+#          - target: i686-pc-windows-msvc/msvc
+#            architecture: Win32
+#            runner: windows-2022
+#          - target: x86_64-pc-windows-msvc/msvc
+#            architecture: x64
+#            runner: windows-2022
+#          - target: aarch64-pc-windows-msvc/msvc
+#            architecture: ARM64
+#            runner: windows-11-arm
           - target: x86_64-apple-darwin/clang
             architecture: x86_64
             runner: macos-15-intel
 
@@ -150,6 +150,8 @@ typedef struct _optimization_stats {
     uint64_t optimized_trace_length_hist[_Py_UOP_HIST_SIZE];
     uint64_t optimizer_attempts;
     uint64_t optimizer_successes;
+    uint64_t optimizer_contradiction;
+    uint64_t optimizer_frame_overflow;
     uint64_t optimizer_failure_reason_no_memory;
     uint64_t remove_globals_builtins_changed;
     uint64_t remove_globals_incorrect_keys;
 
@@ -95,11 +95,24 @@ backoff_counter_triggers(_Py_BackoffCounter counter)
     return counter.value_and_backoff < UNREACHABLE_BACKOFF;
 }
 
+static inline _Py_BackoffCounter  // Returns a counter whose value_and_backoff field is zero.
+trigger_backoff_counter(void)
+{
+    _Py_BackoffCounter result;
+    result.value_and_backoff = 0;  // 0 passes the `< UNREACHABLE_BACKOFF` test shown just above (assumes UNREACHABLE_BACKOFF > 0 -- TODO confirm)
+    return result;
+}
+
 // Initial JUMP_BACKWARD counter.
 // Must be larger than ADAPTIVE_COOLDOWN_VALUE, otherwise when JIT code is
 // invalidated we may construct a new trace before the bytecode has properly
 // re-specialized:
-#define JUMP_BACKWARD_INITIAL_VALUE 4095
+// Note: this should be one less than a prime number (e.g. 4000 = 4001 - 1).
+// This increases the likelihood of finding a "good" loop iteration to trace.
+// For example, 4095 does not work for the nqueens benchmark in pyperformance,
+// as we always end up tracing the iteration in which the loop is exhausted,
+// which aborts our current tracer.
+#define JUMP_BACKWARD_INITIAL_VALUE 4000
 #define JUMP_BACKWARD_INITIAL_BACKOFF 12
 static inline _Py_BackoffCounter
 initial_jump_backoff_counter(void)
@@ -112,7 +125,7 @@ initial_jump_backoff_counter(void)
  * Must be larger than ADAPTIVE_COOLDOWN_VALUE,
  * otherwise when a side exit warms up we may construct
  * a new trace before the Tier 1 code has properly re-specialized. */
-#define SIDE_EXIT_INITIAL_VALUE 4095
+#define SIDE_EXIT_INITIAL_VALUE 4000
 #define SIDE_EXIT_INITIAL_BACKOFF 12
 
 static inline _Py_BackoffCounter
 
@@ -392,6 +392,8 @@ _PyForIter_VirtualIteratorNext(PyThreadState* tstate, struct _PyInterpreterFrame
 #define SPECIAL___AEXIT__   3
 #define SPECIAL_MAX   3
 
+PyAPI_DATA(const _Py_CODEUNIT *) _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS_PTR;
+
 #ifdef __cplusplus
 }
 #endif
 
@@ -14,8 +14,6 @@ extern "C" {
 #include "pycore_structs.h"       // PyHamtObject
 #include "pycore_tstate.h"        // _PyThreadStateImpl
 #include "pycore_typedefs.h"      // _PyRuntimeState
-#include "pycore_uop.h"           // struct _PyUOpInstruction
-
 
 #define CODE_MAX_WATCHERS 8
 #define CONTEXT_MAX_WATCHERS 8
@@ -934,10 +932,10 @@ struct _is {
     PyObject *common_consts[NUM_COMMON_CONSTANTS];
     bool jit;
     bool compiling;
-    struct _PyUOpInstruction *jit_uop_buffer;
     struct _PyExecutorObject *executor_list_head;
     struct _PyExecutorObject *executor_deletion_list_head;
     struct _PyExecutorObject *cold_executor;
+    struct _PyExecutorObject *cold_dynamic_executor;
     int executor_deletion_list_remaining_capacity;
     size_t executor_creation_counter;
     _rare_events rare_events;
 
@@ -21,14 +21,6 @@ typedef struct _PyExecutorLinkListNode {
 } _PyExecutorLinkListNode;
 
 
-/* Bloom filter with m = 256
- * https://en.wikipedia.org/wiki/Bloom_filter */
-#define _Py_BLOOM_FILTER_WORDS 8
-
-typedef struct {
-    uint32_t bits[_Py_BLOOM_FILTER_WORDS];
-} _PyBloomFilter;
-
 typedef struct {
     uint8_t opcode;
     uint8_t oparg;
@@ -44,7 +36,9 @@ typedef struct {
 
 typedef struct _PyExitData {
     uint32_t target;
-    uint16_t index;
+    uint16_t index:14;  // 14-bit field: representable range is 0..16383 (previously a full uint16_t)
+    uint16_t is_dynamic:1;  // 1-bit flag; NOTE(review): presumably tied to the _DYNAMIC_EXIT uop -- confirm
+    uint16_t is_control_flow:1;  // 1-bit flag; 14 + 1 + 1 = 16 bits declared across the three bit-fields
     _Py_BackoffCounter temperature;
     struct _PyExecutorObject *executor;
 } _PyExitData;
@@ -94,9 +88,8 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);
 // This value is arbitrary and was not optimized.
 #define JIT_CLEANUP_THRESHOLD 1000
 
-#define TRACE_STACK_SIZE 5
-
-int _Py_uop_analyze_and_optimize(_PyInterpreterFrame *frame,
+int _Py_uop_analyze_and_optimize(
+    PyFunctionObject *func,
     _PyUOpInstruction *trace, int trace_len, int curr_stackentries,
     _PyBloomFilter *dependencies);
 
@@ -130,7 +123,7 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst)
 #define TY_ARENA_SIZE (UOP_MAX_TRACE_LENGTH * 5)
 
 // Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH())
-#define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2)
+#define MAX_ABSTRACT_FRAME_DEPTH (16)
 
 // The maximum number of side exits that we can take before requiring forward
 // progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this
@@ -258,6 +251,7 @@ struct _Py_UOpsAbstractFrame {
     int stack_len;
     int locals_len;
     PyFunctionObject *func;
+    PyCodeObject *code;
 
     JitOptRef *stack_pointer;
     JitOptRef *stack;
@@ -333,11 +327,11 @@ extern _Py_UOpsAbstractFrame *_Py_uop_frame_new(
     int curr_stackentries,
     JitOptRef *args,
     int arg_len);
-extern int _Py_uop_frame_pop(JitOptContext *ctx);
+extern int _Py_uop_frame_pop(JitOptContext *ctx, PyCodeObject *co, int curr_stackentries);
 
 PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored);
 
-PyAPI_FUNC(int) _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyExecutorObject **exec_ptr, int chain_depth);
+PyAPI_FUNC(int) _PyOptimizer_Optimize(_PyInterpreterFrame *frame, PyThreadState *tstate);
 
 static inline _PyExecutorObject *_PyExecutor_FromExit(_PyExitData *exit)
 {
@@ -346,6 +340,7 @@ static inline _PyExecutorObject *_PyExecutor_FromExit(_PyExitData *exit)
 }
 
 extern _PyExecutorObject *_PyExecutor_GetColdExecutor(void);
+extern _PyExecutorObject *_PyExecutor_GetColdDynamicExecutor(void);
 
 PyAPI_FUNC(void) _PyExecutor_ClearExit(_PyExitData *exit);
 
@@ -354,7 +349,9 @@ static inline int is_terminator(const _PyUOpInstruction *uop)
     int opcode = uop->opcode;
     return (
         opcode == _EXIT_TRACE ||
-        opcode == _JUMP_TO_TOP
+        opcode == _DEOPT ||
+        opcode == _JUMP_TO_TOP ||
+        opcode == _DYNAMIC_EXIT
     );
 }
 
@@ -365,6 +362,18 @@ PyAPI_FUNC(int) _PyDumpExecutors(FILE *out);
 extern void _Py_ClearExecutorDeletionList(PyInterpreterState *interp);
 #endif
 
+int _PyJit_translate_single_bytecode_to_trace(PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *next_instr, bool stop_tracing);
+
+int
+_PyJit_TryInitializeTracing(PyThreadState *tstate, _PyInterpreterFrame *frame,
+    _Py_CODEUNIT *curr_instr, _Py_CODEUNIT *start_instr,
+    _Py_CODEUNIT *close_loop_instr, int curr_stackdepth, int chain_depth, _PyExitData *exit,
+    int oparg);
+
+void _PyJit_FinalizeTracing(PyThreadState *tstate);
+
+void _PyJit_Tracer_InvalidateDependency(PyThreadState *old_tstate, void *obj);
+
 #ifdef __cplusplus
 }
 #endif
 
@@ -12,7 +12,8 @@ extern "C" {
 #include "pycore_freelist_state.h"  // struct _Py_freelists
 #include "pycore_mimalloc.h"        // struct _mimalloc_thread_state
 #include "pycore_qsbr.h"            // struct qsbr
-
+#include "pycore_uop.h"             // struct _PyUOpInstruction
+#include "pycore_structs.h"
 
 #ifdef Py_GIL_DISABLED
 struct _gc_thread_state {
@@ -21,6 +22,38 @@ struct _gc_thread_state {
 };
 #endif
 
+#if _Py_TIER2
+typedef struct _PyJitTracerInitialState {  // Values recorded for a trace; NOTE(review): presumably filled in by _PyJit_TryInitializeTracing() -- confirm
+    int stack_depth;  // NOTE(review): likely the curr_stackdepth argument of _PyJit_TryInitializeTracing() -- confirm
+    int chain_depth;  // NOTE(review): likely the chain_depth argument of _PyJit_TryInitializeTracing() -- confirm
+    struct _PyExitData *exit;  // NOTE(review): likely the exit argument of _PyJit_TryInitializeTracing(); whether it may be NULL is not shown here
+    PyCodeObject *code; // Strong (NOTE(review): presumably an owned reference that must be released -- confirm where)
+    PyFunctionObject *func; // Strong (NOTE(review): presumably an owned reference that must be released -- confirm where)
+    _Py_CODEUNIT *start_instr;  // NOTE(review): name matches the start_instr parameter of _PyJit_TryInitializeTracing() -- confirm
+    _Py_CODEUNIT *close_loop_instr;  // NOTE(review): name matches the close_loop_instr parameter of _PyJit_TryInitializeTracing() -- confirm
+    _Py_CODEUNIT *jump_backward_instr;  // meaning not established by the declarations visible here -- TODO document
+} _PyJitTracerInitialState;
+
+typedef struct _PyJitTracerPreviousState {  // NOTE(review): by its name, state carried over from the previously traced instruction -- confirm
+    bool dependencies_still_valid;  // NOTE(review): presumably cleared via _PyJit_Tracer_InvalidateDependency() -- confirm
+    bool instr_is_super;  // NOTE(review): presumably "previous instruction was a super-instruction" -- confirm
+    int code_max_size;  // units (uops vs. bytes) not shown here -- TODO confirm
+    int code_curr_size;  // same units as code_max_size, presumably -- TODO confirm
+    int instr_oparg;
+    int instr_stacklevel;
+    _Py_CODEUNIT *instr;
+    PyCodeObject *instr_code; // Strong (NOTE(review): presumably an owned reference that must be released -- confirm where)
+    struct _PyInterpreterFrame *instr_frame;  // plain pointer; ownership/lifetime not shown here
+    _PyBloomFilter dependencies;  // stored by value (an array of _Py_BLOOM_FILTER_WORDS uint32_t words)
+} _PyJitTracerPreviousState;
+
+typedef struct _PyJitTracerState {  // Aggregates the tracer's uop buffer pointer with its initial and previous state
+    _PyUOpInstruction *code_buffer;  // pointer only: allocation, size and ownership are not shown in this header
+    _PyJitTracerInitialState initial_state;  // embedded by value
+    _PyJitTracerPreviousState prev_state;  // embedded by value
+} _PyJitTracerState;
+#endif
+
 // Every PyThreadState is actually allocated as a _PyThreadStateImpl. The
 // PyThreadState fields are exposed as part of the C API, although most fields
 // are intended to be private. The _PyThreadStateImpl fields not exposed.
@@ -85,7 +118,9 @@ typedef struct _PyThreadStateImpl {
 #if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED)
     Py_ssize_t reftotal;  // this thread's total refcount operations
 #endif
-
+#if _Py_TIER2
+    _PyJitTracerState jit_tracer_state;
+#endif
 } _PyThreadStateImpl;
 
 #ifdef __cplusplus
 
@@ -35,10 +35,18 @@ typedef struct _PyUOpInstruction{
 #endif
 } _PyUOpInstruction;
 
-// This is the length of the trace we project initially.
-#define UOP_MAX_TRACE_LENGTH 1200
+// This is the length of the trace we translate initially.
+#define UOP_MAX_TRACE_LENGTH 3000
 #define UOP_BUFFER_SIZE (UOP_MAX_TRACE_LENGTH * sizeof(_PyUOpInstruction))
 
+/* Bloom filter with m = 256
+ * https://en.wikipedia.org/wiki/Bloom_filter */
+#define _Py_BLOOM_FILTER_WORDS 8
+
+typedef struct {
+    uint32_t bits[_Py_BLOOM_FILTER_WORDS];  // _Py_BLOOM_FILTER_WORDS (8) x 32 bits = 256 bits, i.e. the "m = 256" noted above
+} _PyBloomFilter;
+
 #ifdef __cplusplus
 }
 #endif
Original file line number	Diff line number	Diff line change
`@@ -392,6 +392,8 @@ _PyForIter_VirtualIteratorNext(PyThreadState* tstate, struct _PyInterpreterFrame`
`392`	`392`	`#define SPECIAL___AEXIT__ 3`
`393`	`393`	`#define SPECIAL_MAX 3`
`394`	`394`
	`395`	`+PyAPI_DATA(const _Py_CODEUNIT *) _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS_PTR;`
	`396`	`+`
`395`	`397`	`#ifdef __cplusplus`
`396`	`398`	`}`
`397`	`399`	`#endif`