From 1c194e263d9c51adde38838f9a551c5948921ab5 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Fri, 18 Jul 2025 17:00:02 -0700 Subject: [PATCH 1/7] gh-131253: free-threaded build support for pystats Allow the --enable-pystats build option to be used with free-threading. For the free-threaded builds, the stats structure is allocated per-thread and then periodically merged into a global stats structure (on thread exit or when the reporting function is called). Summary of changes: * introduce _Py_tss_stats thread-local variable. This is set when stats are on, replacing the _Py_stats global that's used in the non-free-threaded build. * replace _Py_stats references with _PyStats_GET() * move pystats logic from Python/specialize.c into Python/pystats.c * add some free-threaded specific stat counters --- Include/cpython/pystats.h | 81 ++- Include/internal/pycore_stats.h | 76 ++- Include/internal/pycore_tstate.h | 7 + Makefile.pre.in | 1 + Modules/_xxtestfuzz/fuzzer.c | 4 +- PCbuild/_freeze_module.vcxproj | 1 + PCbuild/_freeze_module.vcxproj.filters | 3 + PCbuild/pythoncore.vcxproj | 1 + Python/ceval_macros.h | 3 +- Python/gc.c | 7 +- Python/gc_free_threading.c | 16 +- Python/lock.c | 3 + Python/pystate.c | 23 + Python/pystats.c | 772 +++++++++++++++++++++++++ Python/qsbr.c | 3 +- Python/specialize.c | 426 +------------- 16 files changed, 958 insertions(+), 469 deletions(-) create mode 100644 Python/pystats.c diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index cf830b6066f4ab..16b7bd739c4cc0 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -4,7 +4,7 @@ // // - _Py_INCREF_STAT_INC() and _Py_DECREF_STAT_INC() used by Py_INCREF() // and Py_DECREF(). -// - _Py_stats variable +// - _PyStats_GET() // // Functions of the sys module: // @@ -14,7 +14,7 @@ // - sys._stats_dump() // // Python must be built with ./configure --enable-pystats to define the -// Py_STATS macro. +// _PyStats_GET() function. // // Define _PY_INTERPRETER macro to increment interpreter_increfs and // interpreter_decrefs. Otherwise, increment increfs and decrefs. @@ -109,6 +109,15 @@ typedef struct _gc_stats { uint64_t objects_not_transitively_reachable; } GCStats; +#ifdef Py_GIL_DISABLED +// stats specific to free-threaded build +typedef struct _ft_stats { + uint64_t mutex_sleeps; + uint64_t qsbr_polls; + uint64_t world_stops; +} FTStats; +#endif + typedef struct _uop_stats { uint64_t execution_count; uint64_t miss; @@ -173,22 +182,74 @@ typedef struct _stats { CallStats call_stats; ObjectStats object_stats; OptimizationStats optimization_stats; +#ifdef Py_GIL_DISABLED + FTStats ft_stats; +#endif RareEventStats rare_event_stats; GCStats *gc_stats; } PyStats; +#ifdef Py_GIL_DISABLED + +#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE) +extern _Py_thread_local PyStats *_Py_tss_stats; +#endif + +// Export for most shared extensions, used via _PyStats_GET() static +// inline function. +PyAPI_FUNC(PyStats *) _PyStats_GetLocal(void); + +#else // !Py_GIL_DISABLED + // Export for shared extensions like 'math' PyAPI_DATA(PyStats*) _Py_stats; +#endif + +// Return pointer to the PyStats structure, NULL if recording is off. 
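+//
+// Stat-update sites must check the pointer before touching any counters;
+// the _Py_STATS_EXPR() helpers below expand to exactly this pattern.  A
+// sketch of a hypothetical call site (not part of this header):
+//
+//     PyStats *s = _PyStats_GET();
+//     if (s != NULL) {
+//         s->object_stats.increfs++;   // e.g. what _Py_INCREF_STAT_INC() records
+//     }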
+static inline PyStats* +_PyStats_GET(void) +{ +#ifdef Py_GIL_DISABLED + +#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE) + return _Py_tss_stats; +#else + return _PyStats_GetLocal(); +#endif + +#else // !Py_GIL_DISABLED + + return _Py_stats; + +#endif +} + +#define _Py_STATS_EXPR(expr) \ + do { \ + PyStats *s = _PyStats_GET(); \ + if (s != NULL) { \ + s->expr; \ + } \ + } while (0) + +#define _Py_STATS_COND_EXPR(cond, expr) \ + do { \ + PyStats *s = _PyStats_GET(); \ + if (s != NULL && cond) { \ + s->expr; \ + } \ + } while (0) + #ifdef _PY_INTERPRETER -# define _Py_INCREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_increfs++; } while (0) -# define _Py_DECREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_decrefs++; } while (0) -# define _Py_INCREF_IMMORTAL_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_immortal_increfs++; } while (0) -# define _Py_DECREF_IMMORTAL_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_immortal_decrefs++; } while (0) +# define _Py_INCREF_STAT_INC() _Py_STATS_EXPR(object_stats.interpreter_increfs++) +# define _Py_DECREF_STAT_INC() _Py_STATS_EXPR(object_stats.interpreter_decrefs++) +# define _Py_INCREF_IMMORTAL_STAT_INC() _Py_STATS_EXPR(object_stats.interpreter_immortal_increfs++) +# define _Py_DECREF_IMMORTAL_STAT_INC() _Py_STATS_EXPR(object_stats.interpreter_immortal_decrefs++) #else -# define _Py_INCREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.increfs++; } while (0) -# define _Py_DECREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.decrefs++; } while (0) -# define _Py_INCREF_IMMORTAL_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.immortal_increfs++; } while (0) -# define _Py_DECREF_IMMORTAL_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.immortal_decrefs++; } while (0) +# define _Py_INCREF_STAT_INC() _Py_STATS_EXPR(object_stats.increfs++) +# define _Py_DECREF_STAT_INC() _Py_STATS_EXPR(object_stats.decrefs++) +# define _Py_INCREF_IMMORTAL_STAT_INC() _Py_STATS_EXPR(object_stats.immortal_increfs++) +# define _Py_DECREF_IMMORTAL_STAT_INC() _Py_STATS_EXPR(object_stats.immortal_decrefs++) #endif diff --git a/Include/internal/pycore_stats.h b/Include/internal/pycore_stats.h index ab649574f33dbf..fd7755f128f7c5 100644 --- a/Include/internal/pycore_stats.h +++ b/Include/internal/pycore_stats.h @@ -15,39 +15,56 @@ extern "C" { #include "pycore_bitutils.h" // _Py_bit_length -#define STAT_INC(opname, name) do { if (_Py_stats) _Py_stats->opcode_stats[opname].specialization.name++; } while (0) -#define STAT_DEC(opname, name) do { if (_Py_stats) _Py_stats->opcode_stats[opname].specialization.name--; } while (0) -#define OPCODE_EXE_INC(opname) do { if (_Py_stats) _Py_stats->opcode_stats[opname].execution_count++; } while (0) -#define CALL_STAT_INC(name) do { if (_Py_stats) _Py_stats->call_stats.name++; } while (0) -#define OBJECT_STAT_INC(name) do { if (_Py_stats) _Py_stats->object_stats.name++; } while (0) -#define OBJECT_STAT_INC_COND(name, cond) \ - do { if (_Py_stats && cond) _Py_stats->object_stats.name++; } while (0) -#define EVAL_CALL_STAT_INC(name) do { if (_Py_stats) _Py_stats->call_stats.eval_calls[name]++; } while (0) -#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) \ - do { if (_Py_stats && PyFunction_Check(callable)) _Py_stats->call_stats.eval_calls[name]++; } while (0) -#define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) -#define OPT_STAT_INC(name) do { if (_Py_stats) 
_Py_stats->optimization_stats.name++; } while (0) -#define OPT_STAT_ADD(name, n) do { if (_Py_stats) _Py_stats->optimization_stats.name += (n); } while (0) -#define UOP_STAT_INC(opname, name) do { if (_Py_stats) { assert(opname < 512); _Py_stats->optimization_stats.opcode[opname].name++; } } while (0) -#define UOP_PAIR_INC(uopcode, lastuop) \ - do { \ - if (lastuop && _Py_stats) { \ - _Py_stats->optimization_stats.opcode[lastuop].pair_count[uopcode]++; \ - } \ - lastuop = uopcode; \ +#define STAT_INC(opname, name) _Py_STATS_EXPR(opcode_stats[opname].specialization.name++) +#define STAT_DEC(opname, name) _Py_STATS_EXPR(opcode_stats[opname].specialization.name--) +#define OPCODE_EXE_INC(opname) _Py_STATS_EXPR(opcode_stats[opname].execution_count++) +#define CALL_STAT_INC(name) _Py_STATS_EXPR(call_stats.name++) +#define OBJECT_STAT_INC(name) _Py_STATS_EXPR(object_stats.name++) +#define OBJECT_STAT_INC_COND(name, cond) _Py_STATS_COND_EXPR(cond, object_stats.name++) +#define EVAL_CALL_STAT_INC(name) _Py_STATS_EXPR(call_stats.eval_calls[name]++) +#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) _Py_STATS_COND_EXPR(PyFunction_Check(callable), call_stats.eval_calls[name]++) +#define GC_STAT_ADD(gen, name, n) _Py_STATS_EXPR(gc_stats[(gen)].name += (n)) +#define OPT_STAT_INC(name) _Py_STATS_EXPR(optimization_stats.name++) +#define OPT_STAT_ADD(name, n) _Py_STATS_EXPR(optimization_stats.name += (n)) +#define UOP_STAT_INC(opname, name) \ + do { \ + PyStats *s = _PyStats_GET(); \ + if (s) { \ + assert(opname < 512); \ + s->optimization_stats.opcode[opname].name++; \ + } \ + } while (0) +#define UOP_PAIR_INC(uopcode, lastuop) \ + do { \ + PyStats *s = _PyStats_GET(); \ + if (lastuop && s) { \ + s->optimization_stats.opcode[lastuop].pair_count[uopcode]++; \ + } \ + lastuop = uopcode; \ } while (0) -#define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) -#define OPT_ERROR_IN_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.error_in_opcode[opname]++; } while (0) +#define OPT_UNSUPPORTED_OPCODE(opname) _Py_STATS_EXPR(optimization_stats.unsupported_opcode[opname]++) +#define OPT_ERROR_IN_OPCODE(opname) _Py_STATS_EXPR(optimization_stats.error_in_opcode[opname]++) #define OPT_HIST(length, name) \ do { \ - if (_Py_stats) { \ + PyStats *s = _PyStats_GET(); \ + if (s) { \ int bucket = _Py_bit_length(length >= 1 ? length - 1 : 0); \ bucket = (bucket >= _Py_UOP_HIST_SIZE) ? 
_Py_UOP_HIST_SIZE - 1 : bucket; \ - _Py_stats->optimization_stats.name[bucket]++; \ + s->optimization_stats.name[bucket]++; \ } \ } while (0) -#define RARE_EVENT_STAT_INC(name) do { if (_Py_stats) _Py_stats->rare_event_stats.name++; } while (0) -#define OPCODE_DEFERRED_INC(opname) do { if (_Py_stats && opcode == opname) _Py_stats->opcode_stats[opname].specialization.deferred++; } while (0) +#define RARE_EVENT_STAT_INC(name) _Py_STATS_EXPR(rare_event_stats.name++) +#define OPCODE_DEFERRED_INC(opname) _Py_STATS_COND_EXPR(opcode==opname, opcode_stats[opname].specialization.deferred++) + +#ifdef Py_GIL_DISABLED +#define FT_STAT_MUTEX_SLEEP_INC() _Py_STATS_EXPR(ft_stats.mutex_sleeps++) +#define FT_STAT_QSBR_POLL_INC() _Py_STATS_EXPR(ft_stats.qsbr_polls++) +#define FT_STAT_WORLD_STOP_INC() _Py_STATS_EXPR(ft_stats.world_stops++) +#else +#define FT_STAT_MUTEX_SLEEP_INC() +#define FT_STAT_QSBR_POLL_INC() +#define FT_STAT_WORLD_STOP_INC() +#endif // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -71,6 +88,9 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define OPT_HIST(length, name) ((void)0) #define RARE_EVENT_STAT_INC(name) ((void)0) #define OPCODE_DEFERRED_INC(opname) ((void)0) +#define FT_STAT_MUTEX_SLEEP_INC() +#define FT_STAT_QSBR_POLL_INC() +#define FT_STAT_WORLD_STOP_INC() #endif // !Py_STATS @@ -90,6 +110,10 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); RARE_EVENT_INTERP_INC(interp, name); \ } while (0); \ +bool _PyStats_ThreadInit(_PyThreadStateImpl *); +void _PyStats_ThreadFini(_PyThreadStateImpl *); +void _PyStats_Attach(_PyThreadStateImpl *); +void _PyStats_Detach(_PyThreadStateImpl *); #ifdef __cplusplus } diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index bad968428c73a1..0c28647a094a24 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -70,8 +70,15 @@ typedef struct _PyThreadStateImpl { // When >1, code objects do not immortalize their non-string constants. int suppress_co_const_immortalization; + +#ifdef Py_STATS + // per-thread stats, will be merged into the _Py_stats_struct global + PyStats *pystats_struct; // allocated by _PyStats_ThreadInit() + PyStats **pystats_tss; // pointer to tss variable #endif +#endif // Py_GIL_DISABLED + #if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED) Py_ssize_t reftotal; // this thread's total refcount operations #endif diff --git a/Makefile.pre.in b/Makefile.pre.in index b7b16ef4cb9d19..97099673e3e0cd 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -484,6 +484,7 @@ PYTHON_OBJS= \ Python/pylifecycle.o \ Python/pymath.o \ Python/pystate.o \ + Python/pystats.o \ Python/pythonrun.o \ Python/pytime.o \ Python/qsbr.o \ diff --git a/Modules/_xxtestfuzz/fuzzer.c b/Modules/_xxtestfuzz/fuzzer.c index a04f1412eefda1..0cbe10c79ab4a6 100644 --- a/Modules/_xxtestfuzz/fuzzer.c +++ b/Modules/_xxtestfuzz/fuzzer.c @@ -10,8 +10,8 @@ See the source code for LLVMFuzzerTestOneInput for details. 
*/ -#ifndef Py_BUILD_CORE -# define Py_BUILD_CORE 1 +#ifndef Py_BUILD_CORE_MODULE +# define Py_BUILD_CORE_MODULE 1 #endif #include diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index 5ceddf759b8f3b..29c0e8ccea923b 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -255,6 +255,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index 332d466b1f7409..2ac552befac877 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -370,6 +370,9 @@ Source Files + + Source Files + Source Files diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index c59b380d814ed9..bc011d421db7d2 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -655,6 +655,7 @@ + diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 187ec8fdd26584..168c36734a69c8 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -62,8 +62,9 @@ #ifdef Py_STATS #define INSTRUCTION_STATS(op) \ do { \ + PyStats *s = _PyStats_GET(); \ OPCODE_EXE_INC(op); \ - if (_Py_stats) _Py_stats->opcode_stats[lastopcode].pair_count[op]++; \ + if (s) s->opcode_stats[lastopcode].pair_count[op]++; \ lastopcode = op; \ } while (0) #else diff --git a/Python/gc.c b/Python/gc.c index 2e697faac032b5..a2cd354fb6304b 100644 --- a/Python/gc.c +++ b/Python/gc.c @@ -2062,10 +2062,11 @@ _PyGC_Collect(PyThreadState *tstate, int generation, _PyGC_Reason reason) _PyErr_SetRaisedException(tstate, exc); GC_STAT_ADD(generation, objects_collected, stats.collected); #ifdef Py_STATS - if (_Py_stats) { + PyStats *s = _PyStats_GET(); + if (s) { GC_STAT_ADD(generation, object_visits, - _Py_stats->object_stats.object_visits); - _Py_stats->object_stats.object_visits = 0; + s->object_stats.object_visits); + s->object_stats.object_visits = 0; } #endif validate_spaces(gcstate); diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 0b0ddf227e4952..4ef0870b21f986 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -2302,8 +2302,9 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) assert(generation >= 0 && generation < NUM_GENERATIONS); #ifdef Py_STATS - if (_Py_stats) { - _Py_stats->object_stats.object_visits = 0; + PyStats *s = _PyStats_GET(); + if (s) { + s->object_stats.object_visits = 0; } #endif GC_STAT_ADD(generation, collections, 1); @@ -2366,10 +2367,13 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) GC_STAT_ADD(generation, objects_collected, m); #ifdef Py_STATS - if (_Py_stats) { - GC_STAT_ADD(generation, object_visits, - _Py_stats->object_stats.object_visits); - _Py_stats->object_stats.object_visits = 0; + { + PyStats *s = _PyStats_GET(); + if (s) { + GC_STAT_ADD(generation, object_visits, + s->object_stats.object_visits); + s->object_stats.object_visits = 0; + } } #endif diff --git a/Python/lock.c b/Python/lock.c index a49d587a1686d9..60659d43c0f862 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -6,6 +6,7 @@ #include "pycore_parking_lot.h" #include "pycore_semaphore.h" #include "pycore_time.h" // _PyTime_Add() +#include "pycore_stats.h" // FT_STAT_MUTEX_SLEEP_INC() #ifdef MS_WINDOWS # ifndef WIN32_LEAN_AND_MEAN @@ -62,6 +63,8 @@ _PyMutex_LockTimed(PyMutex *m, PyTime_t timeout, _PyLockFlags flags) return PY_LOCK_FAILURE; } + FT_STAT_MUTEX_SLEEP_INC(); + PyTime_t now; // silently ignore error: cannot report error to the caller (void)PyTime_MonotonicRaw(&now); diff 
--git a/Python/pystate.c b/Python/pystate.c index 04ca6edb4aaa0e..9cbdca384aab10 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -24,6 +24,8 @@ #include "pycore_stackref.h" // Py_STACKREF_DEBUG #include "pycore_time.h" // _PyTime_Init() #include "pycore_uniqueid.h" // _PyObject_FinalizePerThreadRefcounts() +#include "pycore_stats.h" // FT_STAT_WORLD_STOP_INC() + /* -------------------------------------------------------------------------- @@ -1398,6 +1400,9 @@ static void free_threadstate(_PyThreadStateImpl *tstate) { PyInterpreterState *interp = tstate->base.interp; +#ifdef Py_STATS + _PyStats_ThreadFini(tstate); +#endif // The initial thread state of the interpreter is allocated // as part of the interpreter state so should not be freed. if (tstate == &interp->_initial_thread) { @@ -1525,6 +1530,13 @@ new_threadstate(PyInterpreterState *interp, int whence) return NULL; } #endif +#ifdef Py_STATS + // The PyStats structure is quite large and is allocated separated from tstate. + if (!_PyStats_ThreadInit(tstate)) { + free_threadstate(tstate); + return NULL; + } +#endif /* We serialize concurrent creation to protect global state. */ HEAD_LOCK(interp->runtime); @@ -1840,6 +1852,9 @@ _PyThreadState_DeleteCurrent(PyThreadState *tstate) _Py_EnsureTstateNotNULL(tstate); #ifdef Py_GIL_DISABLED _Py_qsbr_detach(((_PyThreadStateImpl *)tstate)->qsbr); +#endif +#ifdef Py_STATS + _PyStats_Detach((_PyThreadStateImpl *)tstate); #endif current_fast_clear(tstate->interp->runtime); tstate_delete_common(tstate, 1); // release GIL as part of call @@ -2002,6 +2017,9 @@ tstate_activate(PyThreadState *tstate) if (!tstate->_status.bound_gilstate) { bind_gilstate_tstate(tstate); } +#ifdef Py_STATS + _PyStats_Attach((_PyThreadStateImpl *)tstate); +#endif tstate->_status.active = 1; } @@ -2014,6 +2032,10 @@ tstate_deactivate(PyThreadState *tstate) assert(tstate_is_bound(tstate)); assert(tstate->_status.active); +#if Py_STATS + _PyStats_Detach((_PyThreadStateImpl *)tstate); +#endif + tstate->_status.active = 0; // We do not unbind the gilstate tstate here. @@ -2264,6 +2286,7 @@ stop_the_world(struct _stoptheworld_state *stw) stw->thread_countdown = 0; stw->stop_event = (PyEvent){0}; // zero-initialize (unset) stw->requester = _PyThreadState_GET(); // may be NULL + FT_STAT_WORLD_STOP_INC(); _Py_FOR_EACH_STW_INTERP(stw, i) { _Py_FOR_EACH_TSTATE_UNLOCKED(i, t) { diff --git a/Python/pystats.c b/Python/pystats.c new file mode 100644 index 00000000000000..e0217ae0fef851 --- /dev/null +++ b/Python/pystats.c @@ -0,0 +1,772 @@ +#include "Python.h" + +#include "pycore_opcode_metadata.h" // _PyOpcode_Caches +#include "pycore_uop_metadata.h" // _PyOpcode_uop_name +#include "pycore_uop_ids.h" // MAX_UOP_ID +#include "pycore_pylifecycle.h" // _PyOS_URandomNonblock() +#include "pycore_pystate.h" // _PyThreadState_GET() +#include "pycore_runtime.h" // NUM_GENERATIONS + +#include // rand() + +extern const char *_PyUOpName(int index); + +#ifdef Py_STATS + +static bool pystats_was_enabled; + +static GCStats pystats_gc[NUM_GENERATIONS] = { 0 }; + +static PyStats pystats_struct = { .gc_stats = pystats_gc }; + + +#ifdef Py_GIL_DISABLED + +// true if recording of pystats is on, this is used when new threads +// are created to decide if recording should be on for them +static bool pystats_enabled; + +// held when global pystats structure is being updated +static PyMutex pystats_mutex; + +// Pointer to Thread-local stats structure, null if recording is off. 
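+//
+// The value is published per-thread: _PyStats_Attach() (below) points it at
+// the thread's pystats_struct when recording is enabled, and
+// _PyStats_Detach() resets it.  Roughly (sketch only):
+//
+//     _Py_tss_stats = pystats_enabled ? tstate->pystats_struct : NULL;  // attach
+//     _Py_tss_stats = NULL;                                             // detach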
+_Py_thread_local PyStats *_Py_tss_stats; + +PyStats * +_PyStats_GetLocal(void) +{ + return _Py_tss_stats; +} + +#else // !Py_GIL_DISABLED + +PyStats *_Py_stats; + +#endif // Py_GIL_DISABLED + + +#if PYSTATS_MAX_UOP_ID < MAX_UOP_ID +#error "Not enough space allocated for pystats. Increase PYSTATS_MAX_UOP_ID to at least MAX_UOP_ID" +#endif + +#define ADD_STAT_TO_DICT(res, field) \ + do { \ + PyObject *val = PyLong_FromUnsignedLongLong(stats->field); \ + if (val == NULL) { \ + Py_DECREF(res); \ + return NULL; \ + } \ + if (PyDict_SetItemString(res, #field, val) == -1) { \ + Py_DECREF(res); \ + Py_DECREF(val); \ + return NULL; \ + } \ + Py_DECREF(val); \ + } while(0); + +static PyObject* +stats_to_dict(SpecializationStats *stats) +{ + PyObject *res = PyDict_New(); + if (res == NULL) { + return NULL; + } + ADD_STAT_TO_DICT(res, success); + ADD_STAT_TO_DICT(res, failure); + ADD_STAT_TO_DICT(res, hit); + ADD_STAT_TO_DICT(res, deferred); + ADD_STAT_TO_DICT(res, miss); + ADD_STAT_TO_DICT(res, deopt); + PyObject *failure_kinds = PyTuple_New(SPECIALIZATION_FAILURE_KINDS); + if (failure_kinds == NULL) { + Py_DECREF(res); + return NULL; + } + for (int i = 0; i < SPECIALIZATION_FAILURE_KINDS; i++) { + PyObject *stat = PyLong_FromUnsignedLongLong(stats->failure_kinds[i]); + if (stat == NULL) { + Py_DECREF(res); + Py_DECREF(failure_kinds); + return NULL; + } + PyTuple_SET_ITEM(failure_kinds, i, stat); + } + if (PyDict_SetItemString(res, "failure_kinds", failure_kinds)) { + Py_DECREF(res); + Py_DECREF(failure_kinds); + return NULL; + } + Py_DECREF(failure_kinds); + return res; +} +#undef ADD_STAT_TO_DICT + +static int +add_stat_dict( + PyObject *res, + int opcode, + const char *name) { + + SpecializationStats *stats = &pystats_struct.opcode_stats[opcode].specialization; + PyObject *d = stats_to_dict(stats); + if (d == NULL) { + return -1; + } + int err = PyDict_SetItemString(res, name, d); + Py_DECREF(d); + return err; +} + +PyObject* +_Py_GetSpecializationStats(void) { + PyObject *stats = PyDict_New(); + if (stats == NULL) { + return NULL; + } + int err = 0; + err += add_stat_dict(stats, CONTAINS_OP, "contains_op"); + err += add_stat_dict(stats, LOAD_SUPER_ATTR, "load_super_attr"); + err += add_stat_dict(stats, LOAD_ATTR, "load_attr"); + err += add_stat_dict(stats, LOAD_GLOBAL, "load_global"); + err += add_stat_dict(stats, STORE_SUBSCR, "store_subscr"); + err += add_stat_dict(stats, STORE_ATTR, "store_attr"); + err += add_stat_dict(stats, JUMP_BACKWARD, "jump_backward"); + err += add_stat_dict(stats, CALL, "call"); + err += add_stat_dict(stats, CALL_KW, "call_kw"); + err += add_stat_dict(stats, BINARY_OP, "binary_op"); + err += add_stat_dict(stats, COMPARE_OP, "compare_op"); + err += add_stat_dict(stats, UNPACK_SEQUENCE, "unpack_sequence"); + err += add_stat_dict(stats, FOR_ITER, "for_iter"); + err += add_stat_dict(stats, TO_BOOL, "to_bool"); + err += add_stat_dict(stats, SEND, "send"); + if (err < 0) { + Py_DECREF(stats); + return NULL; + } + return stats; +} + + +#define PRINT_STAT(i, field) \ + if (stats[i].field) { \ + fprintf(out, " opcode[%s]." #field " : %" PRIu64 "\n", _PyOpcode_OpName[i], stats[i].field); \ + } + +static void +print_spec_stats(FILE *out, OpcodeStats *stats) +{ + /* Mark some opcodes as specializable for stats, + * even though we don't specialize them yet. 
*/ + fprintf(out, "opcode[BINARY_SLICE].specializable : 1\n"); + fprintf(out, "opcode[STORE_SLICE].specializable : 1\n"); + fprintf(out, "opcode[GET_ITER].specializable : 1\n"); + for (int i = 0; i < 256; i++) { + if (_PyOpcode_Caches[i]) { + /* Ignore jumps as they cannot be specialized */ + switch (i) { + case POP_JUMP_IF_FALSE: + case POP_JUMP_IF_TRUE: + case POP_JUMP_IF_NONE: + case POP_JUMP_IF_NOT_NONE: + case JUMP_BACKWARD: + break; + default: + fprintf(out, "opcode[%s].specializable : 1\n", _PyOpcode_OpName[i]); + } + } + PRINT_STAT(i, specialization.success); + PRINT_STAT(i, specialization.failure); + PRINT_STAT(i, specialization.hit); + PRINT_STAT(i, specialization.deferred); + PRINT_STAT(i, specialization.miss); + PRINT_STAT(i, specialization.deopt); + PRINT_STAT(i, execution_count); + for (int j = 0; j < SPECIALIZATION_FAILURE_KINDS; j++) { + uint64_t val = stats[i].specialization.failure_kinds[j]; + if (val) { + fprintf(out, " opcode[%s].specialization.failure_kinds[%d] : %" + PRIu64 "\n", _PyOpcode_OpName[i], j, val); + } + } + for (int j = 0; j < 256; j++) { + if (stats[i].pair_count[j]) { + fprintf(out, "opcode[%s].pair_count[%s] : %" PRIu64 "\n", + _PyOpcode_OpName[i], _PyOpcode_OpName[j], stats[i].pair_count[j]); + } + } + } +} +#undef PRINT_STAT + + +static void +print_call_stats(FILE *out, CallStats *stats) +{ + fprintf(out, "Calls to PyEval_EvalDefault: %" PRIu64 "\n", stats->pyeval_calls); + fprintf(out, "Calls to Python functions inlined: %" PRIu64 "\n", stats->inlined_py_calls); + fprintf(out, "Frames pushed: %" PRIu64 "\n", stats->frames_pushed); + fprintf(out, "Frame objects created: %" PRIu64 "\n", stats->frame_objects_created); + for (int i = 0; i < EVAL_CALL_KINDS; i++) { + fprintf(out, "Calls via PyEval_EvalFrame[%d] : %" PRIu64 "\n", i, stats->eval_calls[i]); + } +} + +static void +print_object_stats(FILE *out, ObjectStats *stats) +{ + fprintf(out, "Object allocations from freelist: %" PRIu64 "\n", stats->from_freelist); + fprintf(out, "Object frees to freelist: %" PRIu64 "\n", stats->to_freelist); + fprintf(out, "Object allocations: %" PRIu64 "\n", stats->allocations); + fprintf(out, "Object allocations to 512 bytes: %" PRIu64 "\n", stats->allocations512); + fprintf(out, "Object allocations to 4 kbytes: %" PRIu64 "\n", stats->allocations4k); + fprintf(out, "Object allocations over 4 kbytes: %" PRIu64 "\n", stats->allocations_big); + fprintf(out, "Object frees: %" PRIu64 "\n", stats->frees); + fprintf(out, "Object inline values: %" PRIu64 "\n", stats->inline_values); + fprintf(out, "Object interpreter mortal increfs: %" PRIu64 "\n", stats->interpreter_increfs); + fprintf(out, "Object interpreter mortal decrefs: %" PRIu64 "\n", stats->interpreter_decrefs); + fprintf(out, "Object mortal increfs: %" PRIu64 "\n", stats->increfs); + fprintf(out, "Object mortal decrefs: %" PRIu64 "\n", stats->decrefs); + fprintf(out, "Object interpreter immortal increfs: %" PRIu64 "\n", stats->interpreter_immortal_increfs); + fprintf(out, "Object interpreter immortal decrefs: %" PRIu64 "\n", stats->interpreter_immortal_decrefs); + fprintf(out, "Object immortal increfs: %" PRIu64 "\n", stats->immortal_increfs); + fprintf(out, "Object immortal decrefs: %" PRIu64 "\n", stats->immortal_decrefs); + fprintf(out, "Object materialize dict (on request): %" PRIu64 "\n", stats->dict_materialized_on_request); + fprintf(out, "Object materialize dict (new key): %" PRIu64 "\n", stats->dict_materialized_new_key); + fprintf(out, "Object materialize dict (too big): %" PRIu64 "\n", 
stats->dict_materialized_too_big); + fprintf(out, "Object materialize dict (str subclass): %" PRIu64 "\n", stats->dict_materialized_str_subclass); + fprintf(out, "Object method cache hits: %" PRIu64 "\n", stats->type_cache_hits); + fprintf(out, "Object method cache misses: %" PRIu64 "\n", stats->type_cache_misses); + fprintf(out, "Object method cache collisions: %" PRIu64 "\n", stats->type_cache_collisions); + fprintf(out, "Object method cache dunder hits: %" PRIu64 "\n", stats->type_cache_dunder_hits); + fprintf(out, "Object method cache dunder misses: %" PRIu64 "\n", stats->type_cache_dunder_misses); +} + +static void +print_gc_stats(FILE *out, GCStats *stats) +{ + for (int i = 0; i < NUM_GENERATIONS; i++) { + fprintf(out, "GC[%d] collections: %" PRIu64 "\n", i, stats[i].collections); + fprintf(out, "GC[%d] object visits: %" PRIu64 "\n", i, stats[i].object_visits); + fprintf(out, "GC[%d] objects collected: %" PRIu64 "\n", i, stats[i].objects_collected); + fprintf(out, "GC[%d] objects reachable from roots: %" PRIu64 "\n", i, stats[i].objects_transitively_reachable); + fprintf(out, "GC[%d] objects not reachable from roots: %" PRIu64 "\n", i, stats[i].objects_not_transitively_reachable); + } +} + +#ifdef _Py_TIER2 +static void +print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) +{ + for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { + fprintf(out, "%s[%" PRIu64"]: %" PRIu64 "\n", name, (uint64_t)1 << i, hist[i]); + } +} + +static void +print_optimization_stats(FILE *out, OptimizationStats *stats) +{ + fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->attempts); + fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); + fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->traces_executed); + fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); + fprintf(out, "Optimization trace stack overflow: %" PRIu64 "\n", stats->trace_stack_overflow); + fprintf(out, "Optimization trace stack underflow: %" PRIu64 "\n", stats->trace_stack_underflow); + fprintf(out, "Optimization trace too long: %" PRIu64 "\n", stats->trace_too_long); + fprintf(out, "Optimization trace too short: %" PRIu64 "\n", stats->trace_too_short); + fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); + fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); + fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence); + fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee); + fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated); + + print_histogram(out, "Trace length", stats->trace_length_hist); + print_histogram(out, "Trace run length", stats->trace_run_length_hist); + print_histogram(out, "Optimized trace length", stats->optimized_trace_length_hist); + + fprintf(out, "Optimization optimizer attempts: %" PRIu64 "\n", stats->optimizer_attempts); + fprintf(out, "Optimization optimizer successes: %" PRIu64 "\n", stats->optimizer_successes); + fprintf(out, "Optimization optimizer failure no memory: %" PRIu64 "\n", + stats->optimizer_failure_reason_no_memory); + fprintf(out, "Optimizer remove globals builtins changed: %" PRIu64 "\n", stats->remove_globals_builtins_changed); + fprintf(out, "Optimizer remove globals incorrect keys: %" PRIu64 "\n", stats->remove_globals_incorrect_keys); + for (int i = 0; i <= MAX_UOP_ID; i++) { + if (stats->opcode[i].execution_count) { + fprintf(out, "uops[%s].execution_count : 
%" PRIu64 "\n", _PyUOpName(i), stats->opcode[i].execution_count); + } + if (stats->opcode[i].miss) { + fprintf(out, "uops[%s].specialization.miss : %" PRIu64 "\n", _PyUOpName(i), stats->opcode[i].miss); + } + } + for (int i = 0; i < 256; i++) { + if (stats->unsupported_opcode[i]) { + fprintf( + out, + "unsupported_opcode[%s].count : %" PRIu64 "\n", + _PyOpcode_OpName[i], + stats->unsupported_opcode[i] + ); + } + } + + for (int i = 1; i <= MAX_UOP_ID; i++){ + for (int j = 1; j <= MAX_UOP_ID; j++) { + if (stats->opcode[i].pair_count[j]) { + fprintf(out, "uop[%s].pair_count[%s] : %" PRIu64 "\n", + _PyOpcode_uop_name[i], _PyOpcode_uop_name[j], stats->opcode[i].pair_count[j]); + } + } + } + for (int i = 0; i < MAX_UOP_ID; i++) { + if (stats->error_in_opcode[i]) { + fprintf( + out, + "error_in_opcode[%s].count : %" PRIu64 "\n", + _PyUOpName(i), + stats->error_in_opcode[i] + ); + } + } + fprintf(out, "JIT total memory size: %" PRIu64 "\n", stats->jit_total_memory_size); + fprintf(out, "JIT code size: %" PRIu64 "\n", stats->jit_code_size); + fprintf(out, "JIT trampoline size: %" PRIu64 "\n", stats->jit_trampoline_size); + fprintf(out, "JIT data size: %" PRIu64 "\n", stats->jit_data_size); + fprintf(out, "JIT padding size: %" PRIu64 "\n", stats->jit_padding_size); + fprintf(out, "JIT freed memory size: %" PRIu64 "\n", stats->jit_freed_memory_size); + + print_histogram(out, "Trace total memory size", stats->trace_total_memory_hist); +} +#endif + +#ifdef Py_GIL_DISABLED +static void +print_ft_stats(FILE *out, FTStats *stats) +{ + fprintf(out, "Mutex sleeps (mutex_sleeps): %" PRIu64 "\n", stats->mutex_sleeps); + fprintf(out, "QSBR polls (qsbr_polls): %" PRIu64 "\n", stats->qsbr_polls); + fprintf(out, "World stops (world_stops): %" PRIu64 "\n", stats->world_stops); +} +#endif + +static void +print_rare_event_stats(FILE *out, RareEventStats *stats) +{ + fprintf(out, "Rare event (set_class): %" PRIu64 "\n", stats->set_class); + fprintf(out, "Rare event (set_bases): %" PRIu64 "\n", stats->set_bases); + fprintf(out, "Rare event (set_eval_frame_func): %" PRIu64 "\n", stats->set_eval_frame_func); + fprintf(out, "Rare event (builtin_dict): %" PRIu64 "\n", stats->builtin_dict); + fprintf(out, "Rare event (func_modification): %" PRIu64 "\n", stats->func_modification); + fprintf(out, "Rare event (watched_dict_modification): %" PRIu64 "\n", stats->watched_dict_modification); + fprintf(out, "Rare event (watched_globals_modification): %" PRIu64 "\n", stats->watched_globals_modification); +} + +static void +print_stats(FILE *out, PyStats *stats) +{ + print_spec_stats(out, stats->opcode_stats); + print_call_stats(out, &stats->call_stats); + print_object_stats(out, &stats->object_stats); + print_gc_stats(out, stats->gc_stats); +#ifdef _Py_TIER2 + print_optimization_stats(out, &stats->optimization_stats); +#endif +#ifdef Py_GIL_DISABLED + print_ft_stats(out, &stats->ft_stats); +#endif + print_rare_event_stats(out, &stats->rare_event_stats); +} + +#ifdef Py_GIL_DISABLED +static void +merge_specialization_stats(SpecializationStats *dest, const SpecializationStats *src) +{ + dest->success += src->success; + dest->failure += src->failure; + dest->hit += src->hit; + dest->deferred += src->deferred; + dest->miss += src->miss; + dest->deopt += src->deopt; + for (int i = 0; i < SPECIALIZATION_FAILURE_KINDS; i++) { + dest->failure_kinds[i] += src->failure_kinds[i]; + } +} + +static void +merge_opcode_stats_array(OpcodeStats *dest, const OpcodeStats *src) +{ + for (int i = 0; i < 256; i++) { + 
merge_specialization_stats(&dest[i].specialization, &src[i].specialization); + dest[i].execution_count += src[i].execution_count; + for (int j = 0; j < 256; j++) { + dest[i].pair_count[j] += src[i].pair_count[j]; + } + } +} + +static void +merge_call_stats(CallStats *dest, const CallStats *src) +{ + dest->inlined_py_calls += src->inlined_py_calls; + dest->pyeval_calls += src->pyeval_calls; + dest->frames_pushed += src->frames_pushed; + dest->frame_objects_created += src->frame_objects_created; + for (int i = 0; i < EVAL_CALL_KINDS; i++) { + dest->eval_calls[i] += src->eval_calls[i]; + } +} + +static void +merge_object_stats(ObjectStats *dest, const ObjectStats *src) +{ + dest->increfs += src->increfs; + dest->decrefs += src->decrefs; + dest->interpreter_increfs += src->interpreter_increfs; + dest->interpreter_decrefs += src->interpreter_decrefs; + dest->immortal_increfs += src->immortal_increfs; + dest->immortal_decrefs += src->immortal_decrefs; + dest->interpreter_immortal_increfs += src->interpreter_immortal_increfs; + dest->interpreter_immortal_decrefs += src->interpreter_immortal_decrefs; + dest->allocations += src->allocations; + dest->allocations512 += src->allocations512; + dest->allocations4k += src->allocations4k; + dest->allocations_big += src->allocations_big; + dest->frees += src->frees; + dest->to_freelist += src->to_freelist; + dest->from_freelist += src->from_freelist; + dest->inline_values += src->inline_values; + dest->dict_materialized_on_request += src->dict_materialized_on_request; + dest->dict_materialized_new_key += src->dict_materialized_new_key; + dest->dict_materialized_too_big += src->dict_materialized_too_big; + dest->dict_materialized_str_subclass += src->dict_materialized_str_subclass; + dest->type_cache_hits += src->type_cache_hits; + dest->type_cache_misses += src->type_cache_misses; + dest->type_cache_dunder_hits += src->type_cache_dunder_hits; + dest->type_cache_dunder_misses += src->type_cache_dunder_misses; + dest->type_cache_collisions += src->type_cache_collisions; + dest->object_visits += src->object_visits; +} + +static void +merge_uop_stats_array(UOpStats *dest, const UOpStats *src) +{ + for (int i = 0; i <= PYSTATS_MAX_UOP_ID; i++) { + dest[i].execution_count += src[i].execution_count; + dest[i].miss += src[i].miss; + for (int j = 0; j <= PYSTATS_MAX_UOP_ID; j++) { + dest[i].pair_count[j] += src[i].pair_count[j]; + } + } +} + +static void +merge_optimization_stats(OptimizationStats *dest, const OptimizationStats *src) +{ + dest->attempts += src->attempts; + dest->traces_created += src->traces_created; + dest->traces_executed += src->traces_executed; + dest->uops_executed += src->uops_executed; + dest->trace_stack_overflow += src->trace_stack_overflow; + dest->trace_stack_underflow += src->trace_stack_underflow; + dest->trace_too_long += src->trace_too_long; + dest->trace_too_short += src->trace_too_short; + dest->inner_loop += src->inner_loop; + dest->recursive_call += src->recursive_call; + dest->low_confidence += src->low_confidence; + dest->unknown_callee += src->unknown_callee; + dest->executors_invalidated += src->executors_invalidated; + dest->optimizer_attempts += src->optimizer_attempts; + dest->optimizer_successes += src->optimizer_successes; + dest->optimizer_failure_reason_no_memory += src->optimizer_failure_reason_no_memory; + dest->remove_globals_builtins_changed += src->remove_globals_builtins_changed; + dest->remove_globals_incorrect_keys += src->remove_globals_incorrect_keys; + dest->jit_total_memory_size += 
src->jit_total_memory_size;
+    dest->jit_code_size += src->jit_code_size;
+    dest->jit_trampoline_size += src->jit_trampoline_size;
+    dest->jit_data_size += src->jit_data_size;
+    dest->jit_padding_size += src->jit_padding_size;
+    dest->jit_freed_memory_size += src->jit_freed_memory_size;
+
+    merge_uop_stats_array(dest->opcode, src->opcode);
+
+    for (int i = 0; i < 256; i++) {
+        dest->unsupported_opcode[i] += src->unsupported_opcode[i];
+    }
+    for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) {
+        dest->trace_length_hist[i] += src->trace_length_hist[i];
+        dest->trace_run_length_hist[i] += src->trace_run_length_hist[i];
+        dest->optimized_trace_length_hist[i] += src->optimized_trace_length_hist[i];
+        dest->trace_total_memory_hist[i] += src->trace_total_memory_hist[i];
+    }
+    for (int i = 0; i <= PYSTATS_MAX_UOP_ID; i++) {
+        dest->error_in_opcode[i] += src->error_in_opcode[i];
+    }
+}
+
+static void
+merge_ft_stats(FTStats *dest, const FTStats *src)
+{
+    // accumulate; a plain assignment would drop counts merged from other threads
+    dest->mutex_sleeps += src->mutex_sleeps;
+    dest->qsbr_polls += src->qsbr_polls;
+    dest->world_stops += src->world_stops;
+}
+
+static void
+merge_rare_event_stats(RareEventStats *dest, const RareEventStats *src)
+{
+    dest->set_class += src->set_class;
+    dest->set_bases += src->set_bases;
+    dest->set_eval_frame_func += src->set_eval_frame_func;
+    dest->builtin_dict += src->builtin_dict;
+    dest->func_modification += src->func_modification;
+    dest->watched_dict_modification += src->watched_dict_modification;
+    dest->watched_globals_modification += src->watched_globals_modification;
+}
+
+#if 0
+static void
+merge_gc_stats_array(GCStats *dest, const GCStats *src)
+{
+    for (int i = 0; i < NUM_GENERATIONS; i++) {
+        dest[i].collections += src[i].collections;
+        dest[i].object_visits += src[i].object_visits;
+        dest[i].objects_collected += src[i].objects_collected;
+        dest[i].objects_transitively_reachable += src[i].objects_transitively_reachable;
+        dest[i].objects_not_transitively_reachable += src[i].objects_not_transitively_reachable;
+    }
+}
+#endif
+
+// merge stats for a single thread into the global structure
+void
+stats_merge_thread(_PyThreadStateImpl *tstate, bool zero)
+{
+    PyStats *src = tstate->pystats_struct;
+    PyStats *dest = &pystats_struct;
+
+    if (src == NULL) {
+        return;
+    }
+
+    // Merge each category of stats using the helper functions.
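+    // gc_stats is not merged here: each per-thread PyStats points its
+    // gc_stats member at the shared pystats_gc array (see
+    // _PyStats_ThreadInit()), so GC counters already accumulate globally.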
+ merge_opcode_stats_array(dest->opcode_stats, src->opcode_stats); + merge_call_stats(&dest->call_stats, &src->call_stats); + merge_object_stats(&dest->object_stats, &src->object_stats); + merge_optimization_stats(&dest->optimization_stats, &src->optimization_stats); +#ifdef Py_GIL_DISABLED + merge_ft_stats(&dest->ft_stats, &src->ft_stats); +#endif + merge_rare_event_stats(&dest->rare_event_stats, &src->rare_event_stats); + //merge_gc_stats_array(dest->gc_stats, src->gc_stats); + + if (zero) { + // Zero the source stat counters + memset(src, 0, sizeof(pystats_struct)); + src->gc_stats = pystats_gc; + } +} + +// toggle stats collection on or off for all threads +static void +stats_toggle_on_off(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate == NULL) { + return; + } + PyMutex_Lock(&pystats_mutex); + PyInterpreterState *interp = tstate->interp; + _Py_FOR_EACH_TSTATE_BEGIN(interp, ts) { + if (!ts->_status.active) { + continue; + } + _PyThreadStateImpl *ts_impl = (_PyThreadStateImpl *)ts; + PyStats *s; + if (pystats_enabled) { + s = ts_impl->pystats_struct; + } + else { + s = NULL; + } + // write to the tss variable for the 'ts' thread + *ts_impl->pystats_tss = s; + } + _Py_FOR_EACH_TSTATE_END(interp); + PyMutex_Unlock(&pystats_mutex); +} +#endif // Py_GIL_DISABLED + +// merge stats for all threads into the global structure +static void +stats_merge_all(void) +{ +#ifdef Py_GIL_DISABLED + if (!pystats_was_enabled) { + return; + } + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate == NULL) { + return; + } + PyMutex_Lock(&pystats_mutex); + PyInterpreterState *interp = tstate->interp; + _Py_FOR_EACH_TSTATE_BEGIN(interp, ts) { + stats_merge_thread((_PyThreadStateImpl*)ts, true); + } + _Py_FOR_EACH_TSTATE_END(interp); + PyMutex_Unlock(&pystats_mutex); +#endif +} + +void +_Py_StatsOn(void) +{ + pystats_was_enabled = true; +#ifdef Py_GIL_DISABLED + pystats_enabled = true; + stats_toggle_on_off(); +#else + _Py_stats = &pystats_struct; +#endif +} + +void +_Py_StatsOff(void) +{ +#ifdef Py_GIL_DISABLED + pystats_enabled = false; + stats_toggle_on_off(); +#else + _Py_stats = NULL; +#endif +} + +void +_Py_StatsClear(void) +{ + stats_merge_all(); + memset(&pystats_gc, 0, sizeof(pystats_gc)); + memset(&pystats_struct, 0, sizeof(pystats_struct)); + pystats_struct.gc_stats = pystats_gc; +} + +static int +mem_is_zero(unsigned char *ptr, size_t size) +{ + for (size_t i=0; i < size; i++) { + if (*ptr != 0) { + return 0; + } + ptr++; + } + return 1; +} + +int +_Py_PrintSpecializationStats(int to_file) +{ + assert(to_file); + stats_merge_all(); + PyStats *stats = &pystats_struct; +#define MEM_IS_ZERO(DATA) mem_is_zero((unsigned char*)DATA, sizeof(*(DATA))) + int is_zero = ( + MEM_IS_ZERO(stats->gc_stats) // is a pointer + && MEM_IS_ZERO(&stats->opcode_stats) + && MEM_IS_ZERO(&stats->call_stats) + && MEM_IS_ZERO(&stats->object_stats) + ); +#undef MEM_IS_ZERO + if (is_zero) { + // gh-108753: -X pystats command line was used, but then _stats_off() + // and _stats_clear() have been called: in this case, avoid printing + // useless "all zeros" statistics. + return 0; + } + + FILE *out = stderr; + if (to_file) { + /* Write to a file instead of stderr. */ +# ifdef MS_WINDOWS + const char *dirname = "c:\\temp\\py_stats\\"; +# else + const char *dirname = "/tmp/py_stats/"; +# endif + /* Use random 160 bit number as file name, + * to avoid both accidental collisions and + * symlink attacks. 
*/ + unsigned char rand[20]; + char hex_name[41]; + _PyOS_URandomNonblock(rand, 20); + for (int i = 0; i < 20; i++) { + hex_name[2*i] = Py_hexdigits[rand[i]&15]; + hex_name[2*i+1] = Py_hexdigits[(rand[i]>>4)&15]; + } + hex_name[40] = '\0'; + char buf[64]; + assert(strlen(dirname) + 40 + strlen(".txt") < 64); + sprintf(buf, "%s%s.txt", dirname, hex_name); + FILE *fout = fopen(buf, "w"); + if (fout) { + out = fout; + } + } + else { + fprintf(out, "Specialization stats:\n"); + } + print_stats(out, stats); + if (out != stderr) { + fclose(out); + } + return 1; +} + +bool +_PyStats_ThreadInit(_PyThreadStateImpl *tstate) +{ +#ifdef Py_GIL_DISABLED + tstate->pystats_struct = PyMem_RawCalloc(1, sizeof(pystats_struct)); + if (tstate->pystats_struct == NULL) { + return false; + } + tstate->pystats_struct->gc_stats = pystats_gc; +#endif + return true; +} + +void +_PyStats_ThreadFini(_PyThreadStateImpl *tstate) +{ +#ifdef Py_GIL_DISABLED + if (pystats_was_enabled) { + stats_merge_thread(tstate, false); + } + PyMem_RawFree(tstate->pystats_struct); +#endif +} + +void +_PyStats_Attach(_PyThreadStateImpl *tstate) +{ +#ifdef Py_GIL_DISABLED + PyStats *s; + if (pystats_enabled) { + s = tstate->pystats_struct; + } + else { + s = NULL; + } + // use correct TSS variable for thread + tstate->pystats_tss = &_Py_tss_stats; + // write to the tss variable for the 'ts' thread + _Py_tss_stats = s; +#endif +} + +void +_PyStats_Detach(_PyThreadStateImpl *tstate) +{ +#ifdef Py_GIL_DISABLED + tstate->pystats_tss = NULL; + _Py_tss_stats = NULL; +#endif +} + +#endif // Py_STATS diff --git a/Python/qsbr.c b/Python/qsbr.c index c992c285cb13e4..b2153bf9d67230 100644 --- a/Python/qsbr.c +++ b/Python/qsbr.c @@ -36,6 +36,7 @@ #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_qsbr.h" #include "pycore_tstate.h" // _PyThreadStateImpl +#include "pycore_stats.h" // FT_STAT_QSBR_POLL_INC() // Starting size of the array of qsbr thread states @@ -158,7 +159,7 @@ _Py_qsbr_poll(struct _qsbr_thread_state *qsbr, uint64_t goal) if (_Py_qbsr_goal_reached(qsbr, goal)) { return true; } - + FT_STAT_QSBR_POLL_INC(); uint64_t rd_seq = qsbr_poll_scan(qsbr->shared); return QSBR_LEQ(goal, rd_seq); } diff --git a/Python/specialize.c b/Python/specialize.c index fe8d04cf3442f1..e62f9550c827b3 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -22,437 +22,23 @@ #include // rand() -extern const char *_PyUOpName(int index); - /* For guidance on adding or extending families of instructions see * InternalDocs/interpreter.md `Specialization` section. */ -#ifdef Py_STATS -GCStats _py_gc_stats[NUM_GENERATIONS] = { 0 }; -static PyStats _Py_stats_struct = { .gc_stats = _py_gc_stats }; -PyStats *_Py_stats = NULL; - -#if PYSTATS_MAX_UOP_ID < MAX_UOP_ID -#error "Not enough space allocated for pystats. 
Increase PYSTATS_MAX_UOP_ID to at least MAX_UOP_ID" -#endif - -#define ADD_STAT_TO_DICT(res, field) \ - do { \ - PyObject *val = PyLong_FromUnsignedLongLong(stats->field); \ - if (val == NULL) { \ - Py_DECREF(res); \ - return NULL; \ - } \ - if (PyDict_SetItemString(res, #field, val) == -1) { \ - Py_DECREF(res); \ - Py_DECREF(val); \ - return NULL; \ - } \ - Py_DECREF(val); \ - } while(0); - -static PyObject* -stats_to_dict(SpecializationStats *stats) -{ - PyObject *res = PyDict_New(); - if (res == NULL) { - return NULL; - } - ADD_STAT_TO_DICT(res, success); - ADD_STAT_TO_DICT(res, failure); - ADD_STAT_TO_DICT(res, hit); - ADD_STAT_TO_DICT(res, deferred); - ADD_STAT_TO_DICT(res, miss); - ADD_STAT_TO_DICT(res, deopt); - PyObject *failure_kinds = PyTuple_New(SPECIALIZATION_FAILURE_KINDS); - if (failure_kinds == NULL) { - Py_DECREF(res); - return NULL; - } - for (int i = 0; i < SPECIALIZATION_FAILURE_KINDS; i++) { - PyObject *stat = PyLong_FromUnsignedLongLong(stats->failure_kinds[i]); - if (stat == NULL) { - Py_DECREF(res); - Py_DECREF(failure_kinds); - return NULL; - } - PyTuple_SET_ITEM(failure_kinds, i, stat); - } - if (PyDict_SetItemString(res, "failure_kinds", failure_kinds)) { - Py_DECREF(res); - Py_DECREF(failure_kinds); - return NULL; - } - Py_DECREF(failure_kinds); - return res; -} -#undef ADD_STAT_TO_DICT - -static int -add_stat_dict( - PyObject *res, - int opcode, - const char *name) { - - SpecializationStats *stats = &_Py_stats_struct.opcode_stats[opcode].specialization; - PyObject *d = stats_to_dict(stats); - if (d == NULL) { - return -1; - } - int err = PyDict_SetItemString(res, name, d); - Py_DECREF(d); - return err; -} - -PyObject* -_Py_GetSpecializationStats(void) { - PyObject *stats = PyDict_New(); - if (stats == NULL) { - return NULL; - } - int err = 0; - err += add_stat_dict(stats, CONTAINS_OP, "contains_op"); - err += add_stat_dict(stats, LOAD_SUPER_ATTR, "load_super_attr"); - err += add_stat_dict(stats, LOAD_ATTR, "load_attr"); - err += add_stat_dict(stats, LOAD_GLOBAL, "load_global"); - err += add_stat_dict(stats, STORE_SUBSCR, "store_subscr"); - err += add_stat_dict(stats, STORE_ATTR, "store_attr"); - err += add_stat_dict(stats, JUMP_BACKWARD, "jump_backward"); - err += add_stat_dict(stats, CALL, "call"); - err += add_stat_dict(stats, CALL_KW, "call_kw"); - err += add_stat_dict(stats, BINARY_OP, "binary_op"); - err += add_stat_dict(stats, COMPARE_OP, "compare_op"); - err += add_stat_dict(stats, UNPACK_SEQUENCE, "unpack_sequence"); - err += add_stat_dict(stats, FOR_ITER, "for_iter"); - err += add_stat_dict(stats, TO_BOOL, "to_bool"); - err += add_stat_dict(stats, SEND, "send"); - if (err < 0) { - Py_DECREF(stats); - return NULL; - } - return stats; -} - - -#define PRINT_STAT(i, field) \ - if (stats[i].field) { \ - fprintf(out, " opcode[%s]." #field " : %" PRIu64 "\n", _PyOpcode_OpName[i], stats[i].field); \ - } - -static void -print_spec_stats(FILE *out, OpcodeStats *stats) -{ - /* Mark some opcodes as specializable for stats, - * even though we don't specialize them yet. 
*/ - fprintf(out, "opcode[BINARY_SLICE].specializable : 1\n"); - fprintf(out, "opcode[STORE_SLICE].specializable : 1\n"); - fprintf(out, "opcode[GET_ITER].specializable : 1\n"); - for (int i = 0; i < 256; i++) { - if (_PyOpcode_Caches[i]) { - /* Ignore jumps as they cannot be specialized */ - switch (i) { - case POP_JUMP_IF_FALSE: - case POP_JUMP_IF_TRUE: - case POP_JUMP_IF_NONE: - case POP_JUMP_IF_NOT_NONE: - case JUMP_BACKWARD: - break; - default: - fprintf(out, "opcode[%s].specializable : 1\n", _PyOpcode_OpName[i]); - } - } - PRINT_STAT(i, specialization.success); - PRINT_STAT(i, specialization.failure); - PRINT_STAT(i, specialization.hit); - PRINT_STAT(i, specialization.deferred); - PRINT_STAT(i, specialization.miss); - PRINT_STAT(i, specialization.deopt); - PRINT_STAT(i, execution_count); - for (int j = 0; j < SPECIALIZATION_FAILURE_KINDS; j++) { - uint64_t val = stats[i].specialization.failure_kinds[j]; - if (val) { - fprintf(out, " opcode[%s].specialization.failure_kinds[%d] : %" - PRIu64 "\n", _PyOpcode_OpName[i], j, val); - } - } - for (int j = 0; j < 256; j++) { - if (stats[i].pair_count[j]) { - fprintf(out, "opcode[%s].pair_count[%s] : %" PRIu64 "\n", - _PyOpcode_OpName[i], _PyOpcode_OpName[j], stats[i].pair_count[j]); - } - } - } -} -#undef PRINT_STAT - - -static void -print_call_stats(FILE *out, CallStats *stats) -{ - fprintf(out, "Calls to PyEval_EvalDefault: %" PRIu64 "\n", stats->pyeval_calls); - fprintf(out, "Calls to Python functions inlined: %" PRIu64 "\n", stats->inlined_py_calls); - fprintf(out, "Frames pushed: %" PRIu64 "\n", stats->frames_pushed); - fprintf(out, "Frame objects created: %" PRIu64 "\n", stats->frame_objects_created); - for (int i = 0; i < EVAL_CALL_KINDS; i++) { - fprintf(out, "Calls via PyEval_EvalFrame[%d] : %" PRIu64 "\n", i, stats->eval_calls[i]); - } -} - -static void -print_object_stats(FILE *out, ObjectStats *stats) -{ - fprintf(out, "Object allocations from freelist: %" PRIu64 "\n", stats->from_freelist); - fprintf(out, "Object frees to freelist: %" PRIu64 "\n", stats->to_freelist); - fprintf(out, "Object allocations: %" PRIu64 "\n", stats->allocations); - fprintf(out, "Object allocations to 512 bytes: %" PRIu64 "\n", stats->allocations512); - fprintf(out, "Object allocations to 4 kbytes: %" PRIu64 "\n", stats->allocations4k); - fprintf(out, "Object allocations over 4 kbytes: %" PRIu64 "\n", stats->allocations_big); - fprintf(out, "Object frees: %" PRIu64 "\n", stats->frees); - fprintf(out, "Object inline values: %" PRIu64 "\n", stats->inline_values); - fprintf(out, "Object interpreter mortal increfs: %" PRIu64 "\n", stats->interpreter_increfs); - fprintf(out, "Object interpreter mortal decrefs: %" PRIu64 "\n", stats->interpreter_decrefs); - fprintf(out, "Object mortal increfs: %" PRIu64 "\n", stats->increfs); - fprintf(out, "Object mortal decrefs: %" PRIu64 "\n", stats->decrefs); - fprintf(out, "Object interpreter immortal increfs: %" PRIu64 "\n", stats->interpreter_immortal_increfs); - fprintf(out, "Object interpreter immortal decrefs: %" PRIu64 "\n", stats->interpreter_immortal_decrefs); - fprintf(out, "Object immortal increfs: %" PRIu64 "\n", stats->immortal_increfs); - fprintf(out, "Object immortal decrefs: %" PRIu64 "\n", stats->immortal_decrefs); - fprintf(out, "Object materialize dict (on request): %" PRIu64 "\n", stats->dict_materialized_on_request); - fprintf(out, "Object materialize dict (new key): %" PRIu64 "\n", stats->dict_materialized_new_key); - fprintf(out, "Object materialize dict (too big): %" PRIu64 "\n", 
stats->dict_materialized_too_big); - fprintf(out, "Object materialize dict (str subclass): %" PRIu64 "\n", stats->dict_materialized_str_subclass); - fprintf(out, "Object method cache hits: %" PRIu64 "\n", stats->type_cache_hits); - fprintf(out, "Object method cache misses: %" PRIu64 "\n", stats->type_cache_misses); - fprintf(out, "Object method cache collisions: %" PRIu64 "\n", stats->type_cache_collisions); - fprintf(out, "Object method cache dunder hits: %" PRIu64 "\n", stats->type_cache_dunder_hits); - fprintf(out, "Object method cache dunder misses: %" PRIu64 "\n", stats->type_cache_dunder_misses); -} - -static void -print_gc_stats(FILE *out, GCStats *stats) -{ - for (int i = 0; i < NUM_GENERATIONS; i++) { - fprintf(out, "GC[%d] collections: %" PRIu64 "\n", i, stats[i].collections); - fprintf(out, "GC[%d] object visits: %" PRIu64 "\n", i, stats[i].object_visits); - fprintf(out, "GC[%d] objects collected: %" PRIu64 "\n", i, stats[i].objects_collected); - fprintf(out, "GC[%d] objects reachable from roots: %" PRIu64 "\n", i, stats[i].objects_transitively_reachable); - fprintf(out, "GC[%d] objects not reachable from roots: %" PRIu64 "\n", i, stats[i].objects_not_transitively_reachable); - } -} - -#ifdef _Py_TIER2 -static void -print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) -{ - for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { - fprintf(out, "%s[%" PRIu64"]: %" PRIu64 "\n", name, (uint64_t)1 << i, hist[i]); - } -} - -static void -print_optimization_stats(FILE *out, OptimizationStats *stats) -{ - fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->attempts); - fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); - fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->traces_executed); - fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); - fprintf(out, "Optimization trace stack overflow: %" PRIu64 "\n", stats->trace_stack_overflow); - fprintf(out, "Optimization trace stack underflow: %" PRIu64 "\n", stats->trace_stack_underflow); - fprintf(out, "Optimization trace too long: %" PRIu64 "\n", stats->trace_too_long); - fprintf(out, "Optimization trace too short: %" PRIu64 "\n", stats->trace_too_short); - fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); - fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); - fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence); - fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee); - fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated); - - print_histogram(out, "Trace length", stats->trace_length_hist); - print_histogram(out, "Trace run length", stats->trace_run_length_hist); - print_histogram(out, "Optimized trace length", stats->optimized_trace_length_hist); - - fprintf(out, "Optimization optimizer attempts: %" PRIu64 "\n", stats->optimizer_attempts); - fprintf(out, "Optimization optimizer successes: %" PRIu64 "\n", stats->optimizer_successes); - fprintf(out, "Optimization optimizer failure no memory: %" PRIu64 "\n", - stats->optimizer_failure_reason_no_memory); - fprintf(out, "Optimizer remove globals builtins changed: %" PRIu64 "\n", stats->remove_globals_builtins_changed); - fprintf(out, "Optimizer remove globals incorrect keys: %" PRIu64 "\n", stats->remove_globals_incorrect_keys); - for (int i = 0; i <= MAX_UOP_ID; i++) { - if (stats->opcode[i].execution_count) { - fprintf(out, "uops[%s].execution_count : 
%" PRIu64 "\n", _PyUOpName(i), stats->opcode[i].execution_count); - } - if (stats->opcode[i].miss) { - fprintf(out, "uops[%s].specialization.miss : %" PRIu64 "\n", _PyUOpName(i), stats->opcode[i].miss); - } - } - for (int i = 0; i < 256; i++) { - if (stats->unsupported_opcode[i]) { - fprintf( - out, - "unsupported_opcode[%s].count : %" PRIu64 "\n", - _PyOpcode_OpName[i], - stats->unsupported_opcode[i] - ); - } - } - - for (int i = 1; i <= MAX_UOP_ID; i++){ - for (int j = 1; j <= MAX_UOP_ID; j++) { - if (stats->opcode[i].pair_count[j]) { - fprintf(out, "uop[%s].pair_count[%s] : %" PRIu64 "\n", - _PyOpcode_uop_name[i], _PyOpcode_uop_name[j], stats->opcode[i].pair_count[j]); - } - } - } - for (int i = 0; i < MAX_UOP_ID; i++) { - if (stats->error_in_opcode[i]) { - fprintf( - out, - "error_in_opcode[%s].count : %" PRIu64 "\n", - _PyUOpName(i), - stats->error_in_opcode[i] - ); - } - } - fprintf(out, "JIT total memory size: %" PRIu64 "\n", stats->jit_total_memory_size); - fprintf(out, "JIT code size: %" PRIu64 "\n", stats->jit_code_size); - fprintf(out, "JIT trampoline size: %" PRIu64 "\n", stats->jit_trampoline_size); - fprintf(out, "JIT data size: %" PRIu64 "\n", stats->jit_data_size); - fprintf(out, "JIT padding size: %" PRIu64 "\n", stats->jit_padding_size); - fprintf(out, "JIT freed memory size: %" PRIu64 "\n", stats->jit_freed_memory_size); - - print_histogram(out, "Trace total memory size", stats->trace_total_memory_hist); -} -#endif - -static void -print_rare_event_stats(FILE *out, RareEventStats *stats) -{ - fprintf(out, "Rare event (set_class): %" PRIu64 "\n", stats->set_class); - fprintf(out, "Rare event (set_bases): %" PRIu64 "\n", stats->set_bases); - fprintf(out, "Rare event (set_eval_frame_func): %" PRIu64 "\n", stats->set_eval_frame_func); - fprintf(out, "Rare event (builtin_dict): %" PRIu64 "\n", stats->builtin_dict); - fprintf(out, "Rare event (func_modification): %" PRIu64 "\n", stats->func_modification); - fprintf(out, "Rare event (watched_dict_modification): %" PRIu64 "\n", stats->watched_dict_modification); - fprintf(out, "Rare event (watched_globals_modification): %" PRIu64 "\n", stats->watched_globals_modification); -} - -static void -print_stats(FILE *out, PyStats *stats) -{ - print_spec_stats(out, stats->opcode_stats); - print_call_stats(out, &stats->call_stats); - print_object_stats(out, &stats->object_stats); - print_gc_stats(out, stats->gc_stats); -#ifdef _Py_TIER2 - print_optimization_stats(out, &stats->optimization_stats); -#endif - print_rare_event_stats(out, &stats->rare_event_stats); -} - -void -_Py_StatsOn(void) -{ - _Py_stats = &_Py_stats_struct; -} - -void -_Py_StatsOff(void) -{ - _Py_stats = NULL; -} - -void -_Py_StatsClear(void) -{ - memset(&_py_gc_stats, 0, sizeof(_py_gc_stats)); - memset(&_Py_stats_struct, 0, sizeof(_Py_stats_struct)); - _Py_stats_struct.gc_stats = _py_gc_stats; -} - -static int -mem_is_zero(unsigned char *ptr, size_t size) -{ - for (size_t i=0; i < size; i++) { - if (*ptr != 0) { - return 0; - } - ptr++; - } - return 1; -} - -int -_Py_PrintSpecializationStats(int to_file) -{ - PyStats *stats = &_Py_stats_struct; -#define MEM_IS_ZERO(DATA) mem_is_zero((unsigned char*)DATA, sizeof(*(DATA))) - int is_zero = ( - MEM_IS_ZERO(stats->gc_stats) // is a pointer - && MEM_IS_ZERO(&stats->opcode_stats) - && MEM_IS_ZERO(&stats->call_stats) - && MEM_IS_ZERO(&stats->object_stats) - ); -#undef MEM_IS_ZERO - if (is_zero) { - // gh-108753: -X pystats command line was used, but then _stats_off() - // and _stats_clear() have been called: in this case, avoid 
printing - // useless "all zeros" statistics. - return 0; - } - - FILE *out = stderr; - if (to_file) { - /* Write to a file instead of stderr. */ -# ifdef MS_WINDOWS - const char *dirname = "c:\\temp\\py_stats\\"; -# else - const char *dirname = "/tmp/py_stats/"; -# endif - /* Use random 160 bit number as file name, - * to avoid both accidental collisions and - * symlink attacks. */ - unsigned char rand[20]; - char hex_name[41]; - _PyOS_URandomNonblock(rand, 20); - for (int i = 0; i < 20; i++) { - hex_name[2*i] = Py_hexdigits[rand[i]&15]; - hex_name[2*i+1] = Py_hexdigits[(rand[i]>>4)&15]; - } - hex_name[40] = '\0'; - char buf[64]; - assert(strlen(dirname) + 40 + strlen(".txt") < 64); - sprintf(buf, "%s%s.txt", dirname, hex_name); - FILE *fout = fopen(buf, "w"); - if (fout) { - out = fout; - } - } - else { - fprintf(out, "Specialization stats:\n"); - } - print_stats(out, stats); - if (out != stderr) { - fclose(out); - } - return 1; -} - +#if Py_STATS #define SPECIALIZATION_FAIL(opcode, kind) \ do { \ - if (_Py_stats) { \ + PyStats *s = _PyStats_GET(); \ + if (s) { \ int _kind = (kind); \ assert(_kind < SPECIALIZATION_FAILURE_KINDS); \ - _Py_stats->opcode_stats[opcode].specialization.failure_kinds[_kind]++; \ + s->opcode_stats[opcode].specialization.failure_kinds[_kind]++; \ } \ } while (0) - -#endif // Py_STATS - - -#ifndef SPECIALIZATION_FAIL +#else # define SPECIALIZATION_FAIL(opcode, kind) ((void)0) -#endif +#endif // Py_STATS // Initialize warmup counters and optimize instructions. This cannot fail. void From 7dc919150cb2f92fa5ba189d3ed0cd48b777f77a Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 29 Jul 2025 17:47:39 -0700 Subject: [PATCH 2/7] Make pystats global state be per-interpreter. --- Include/cpython/pystats.h | 19 +- Include/internal/pycore_interp_structs.h | 14 +- Include/internal/pycore_pystats.h | 2 +- Include/internal/pycore_stats.h | 2 +- Include/internal/pycore_tstate.h | 8 +- Objects/object.c | 7 + Python/initconfig.c | 6 - Python/pystate.c | 18 +- Python/pystats.c | 211 ++++++++++++----------- Python/sysmodule.c | 4 +- Tools/c-analyzer/cpython/ignored.tsv | 1 + 11 files changed, 153 insertions(+), 139 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 16b7bd739c4cc0..cfa28da9d0170d 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -186,12 +186,10 @@ typedef struct _stats { FTStats ft_stats; #endif RareEventStats rare_event_stats; - GCStats *gc_stats; + GCStats gc_stats[3]; // must match NUM_GENERATIONS } PyStats; -#ifdef Py_GIL_DISABLED - #if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE) extern _Py_thread_local PyStats *_Py_tss_stats; #endif @@ -200,30 +198,15 @@ extern _Py_thread_local PyStats *_Py_tss_stats; // inline function. PyAPI_FUNC(PyStats *) _PyStats_GetLocal(void); -#else // !Py_GIL_DISABLED - -// Export for shared extensions like 'math' -PyAPI_DATA(PyStats*) _Py_stats; - -#endif - // Return pointer to the PyStats structure, NULL if recording is off. 
static inline PyStats* _PyStats_GET(void) { -#ifdef Py_GIL_DISABLED - #if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE) return _Py_tss_stats; #else return _PyStats_GetLocal(); #endif - -#else // !Py_GIL_DISABLED - - return _Py_stats; - -#endif } #define _Py_STATS_EXPR(expr) \ diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 542a75617b4d3c..e4a90c2286851b 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -198,7 +198,7 @@ enum _GCPhase { }; /* If we change this, we need to change the default value in the - signature of gc.collect. */ + signature of gc.collect and change the size of PyStats.gc_stats */ #define NUM_GENERATIONS 3 struct _gc_runtime_state { @@ -968,6 +968,18 @@ struct _is { # ifdef Py_STACKREF_CLOSE_DEBUG _Py_hashtable_t *closed_stackrefs_table; # endif +#endif + +#ifdef Py_STATS + // true if recording of pystats is on, this is used when new threads + // are created to decide if recording should be on for them + bool pystats_enabled; + // allocated when (and if) stats are first enabled + PyStats *pystats_struct; +#ifdef Py_GIL_DISABLED + // held when pystats related interpreter state is being updated + PyMutex pystats_mutex; +#endif #endif /* the initial PyInterpreterState.threads.head */ diff --git a/Include/internal/pycore_pystats.h b/Include/internal/pycore_pystats.h index f8af398a560586..50ab21aa0f1902 100644 --- a/Include/internal/pycore_pystats.h +++ b/Include/internal/pycore_pystats.h @@ -9,7 +9,7 @@ extern "C" { #endif #ifdef Py_STATS -extern void _Py_StatsOn(void); +extern int _Py_StatsOn(void); extern void _Py_StatsOff(void); extern void _Py_StatsClear(void); extern int _Py_PrintSpecializationStats(int to_file); diff --git a/Include/internal/pycore_stats.h b/Include/internal/pycore_stats.h index fd7755f128f7c5..7d39ac1bc129ec 100644 --- a/Include/internal/pycore_stats.h +++ b/Include/internal/pycore_stats.h @@ -110,7 +110,7 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); RARE_EVENT_INTERP_INC(interp, name); \ } while (0); \ -bool _PyStats_ThreadInit(_PyThreadStateImpl *); +bool _PyStats_ThreadInit(PyInterpreterState *, _PyThreadStateImpl *); void _PyStats_ThreadFini(_PyThreadStateImpl *); void _PyStats_Attach(_PyThreadStateImpl *); void _PyStats_Detach(_PyThreadStateImpl *); diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 0c28647a094a24..11fc55dc238eb7 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -71,14 +71,16 @@ typedef struct _PyThreadStateImpl { // When >1, code objects do not immortalize their non-string constants. 
int suppress_co_const_immortalization; +#endif // Py_GIL_DISABLED + #ifdef Py_STATS - // per-thread stats, will be merged into the _Py_stats_struct global +#ifdef Py_GIL_DISABLED + // per-thread stats, will be merged into interp->pystats_struct PyStats *pystats_struct; // allocated by _PyStats_ThreadInit() +#endif PyStats **pystats_tss; // pointer to tss variable #endif -#endif // Py_GIL_DISABLED - #if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED) Py_ssize_t reftotal; // this thread's total refcount operations #endif diff --git a/Objects/object.c b/Objects/object.c index 479f4176a46039..62a27ae1f4e585 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -2418,6 +2418,13 @@ _PyObject_InitState(PyInterpreterState *interp) if (refchain_init(interp) < 0) { return _PyStatus_NO_MEMORY(); } +#endif +#ifdef Py_STATS + if (interp->config._pystats) { + // start with pystats enabled, can be disabled via sys._stats_off() + // this needs to be set before the first tstate is created + interp->pystats_enabled = true; + } #endif return _PyStatus_OK(); } diff --git a/Python/initconfig.c b/Python/initconfig.c index cc0db19d416058..8c4a72217b5343 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -2807,12 +2807,6 @@ _PyConfig_Write(const PyConfig *config, _PyRuntimeState *runtime) return _PyStatus_NO_MEMORY(); } -#ifdef Py_STATS - if (config->_pystats) { - _Py_StatsOn(); - } -#endif - return _PyStatus_OK(); } diff --git a/Python/pystate.c b/Python/pystate.c index 9cbdca384aab10..1674ef9b37ecc8 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -22,10 +22,9 @@ #include "pycore_runtime.h" // _PyRuntime #include "pycore_runtime_init.h" // _PyRuntimeState_INIT #include "pycore_stackref.h" // Py_STACKREF_DEBUG +#include "pycore_stats.h" // FT_STAT_WORLD_STOP_INC() #include "pycore_time.h" // _PyTime_Init() #include "pycore_uniqueid.h" // _PyObject_FinalizePerThreadRefcounts() -#include "pycore_stats.h" // FT_STAT_WORLD_STOP_INC() - /* -------------------------------------------------------------------------- @@ -480,6 +479,12 @@ alloc_interpreter(void) static void free_interpreter(PyInterpreterState *interp) { +#ifdef Py_STATS + if (interp->pystats_struct) { + PyMem_RawFree(interp->pystats_struct); + interp->pystats_struct = NULL; + } +#endif // The main interpreter is statically allocated so // should not be freed. if (interp != &_PyRuntime._main_interpreter) { @@ -1532,7 +1537,7 @@ new_threadstate(PyInterpreterState *interp, int whence) #endif #ifdef Py_STATS // The PyStats structure is quite large and is allocated separated from tstate. 
- if (!_PyStats_ThreadInit(tstate)) { + if (!_PyStats_ThreadInit(interp, tstate)) { free_threadstate(tstate); return NULL; } @@ -2017,9 +2022,6 @@ tstate_activate(PyThreadState *tstate) if (!tstate->_status.bound_gilstate) { bind_gilstate_tstate(tstate); } -#ifdef Py_STATS - _PyStats_Attach((_PyThreadStateImpl *)tstate); -#endif tstate->_status.active = 1; } @@ -2139,6 +2141,10 @@ _PyThreadState_Attach(PyThreadState *tstate) _PyCriticalSection_Resume(tstate); } +#ifdef Py_STATS + _PyStats_Attach((_PyThreadStateImpl *)tstate); +#endif + #if defined(Py_DEBUG) errno = err; #endif diff --git a/Python/pystats.c b/Python/pystats.c index e0217ae0fef851..24564faf3bb3f4 100644 --- a/Python/pystats.c +++ b/Python/pystats.c @@ -9,26 +9,8 @@ #include // rand() -extern const char *_PyUOpName(int index); - #ifdef Py_STATS -static bool pystats_was_enabled; - -static GCStats pystats_gc[NUM_GENERATIONS] = { 0 }; - -static PyStats pystats_struct = { .gc_stats = pystats_gc }; - - -#ifdef Py_GIL_DISABLED - -// true if recording of pystats is on, this is used when new threads -// are created to decide if recording should be on for them -static bool pystats_enabled; - -// held when global pystats structure is being updated -static PyMutex pystats_mutex; - // Pointer to Thread-local stats structure, null if recording is off. _Py_thread_local PyStats *_Py_tss_stats; @@ -38,11 +20,13 @@ _PyStats_GetLocal(void) return _Py_tss_stats; } -#else // !Py_GIL_DISABLED - -PyStats *_Py_stats; - -#endif // Py_GIL_DISABLED +#ifdef Py_GIL_DISABLED +#define STATS_LOCK(interp) PyMutex_Lock(&interp->pystats_mutex) +#define STATS_UNLOCK(interp) PyMutex_Unlock(&interp->pystats_mutex) +#else +#define STATS_LOCK(interp) +#define STATS_UNLOCK(interp) +#endif #if PYSTATS_MAX_UOP_ID < MAX_UOP_ID @@ -103,11 +87,12 @@ stats_to_dict(SpecializationStats *stats) static int add_stat_dict( + PyStats *src, PyObject *res, int opcode, const char *name) { - SpecializationStats *stats = &pystats_struct.opcode_stats[opcode].specialization; + SpecializationStats *stats = &src->opcode_stats[opcode].specialization; PyObject *d = stats_to_dict(stats); if (d == NULL) { return -1; @@ -119,26 +104,31 @@ add_stat_dict( PyObject* _Py_GetSpecializationStats(void) { + PyThreadState *tstate = _PyThreadState_GET(); + PyStats *src = tstate->interp->pystats_struct; + if (src == NULL) { + Py_RETURN_NONE; + } PyObject *stats = PyDict_New(); if (stats == NULL) { return NULL; } int err = 0; - err += add_stat_dict(stats, CONTAINS_OP, "contains_op"); - err += add_stat_dict(stats, LOAD_SUPER_ATTR, "load_super_attr"); - err += add_stat_dict(stats, LOAD_ATTR, "load_attr"); - err += add_stat_dict(stats, LOAD_GLOBAL, "load_global"); - err += add_stat_dict(stats, STORE_SUBSCR, "store_subscr"); - err += add_stat_dict(stats, STORE_ATTR, "store_attr"); - err += add_stat_dict(stats, JUMP_BACKWARD, "jump_backward"); - err += add_stat_dict(stats, CALL, "call"); - err += add_stat_dict(stats, CALL_KW, "call_kw"); - err += add_stat_dict(stats, BINARY_OP, "binary_op"); - err += add_stat_dict(stats, COMPARE_OP, "compare_op"); - err += add_stat_dict(stats, UNPACK_SEQUENCE, "unpack_sequence"); - err += add_stat_dict(stats, FOR_ITER, "for_iter"); - err += add_stat_dict(stats, TO_BOOL, "to_bool"); - err += add_stat_dict(stats, SEND, "send"); + err += add_stat_dict(src, stats, CONTAINS_OP, "contains_op"); + err += add_stat_dict(src, stats, LOAD_SUPER_ATTR, "load_super_attr"); + err += add_stat_dict(src, stats, LOAD_ATTR, "load_attr"); + err += add_stat_dict(src, stats, LOAD_GLOBAL, 
"load_global"); + err += add_stat_dict(src, stats, STORE_SUBSCR, "store_subscr"); + err += add_stat_dict(src, stats, STORE_ATTR, "store_attr"); + err += add_stat_dict(src, stats, JUMP_BACKWARD, "jump_backward"); + err += add_stat_dict(src, stats, CALL, "call"); + err += add_stat_dict(src, stats, CALL_KW, "call_kw"); + err += add_stat_dict(src, stats, BINARY_OP, "binary_op"); + err += add_stat_dict(src, stats, COMPARE_OP, "compare_op"); + err += add_stat_dict(src, stats, UNPACK_SEQUENCE, "unpack_sequence"); + err += add_stat_dict(src, stats, FOR_ITER, "for_iter"); + err += add_stat_dict(src, stats, TO_BOOL, "to_bool"); + err += add_stat_dict(src, stats, SEND, "send"); if (err < 0) { Py_DECREF(stats); return NULL; @@ -262,6 +252,8 @@ print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) } } +extern const char *_PyUOpName(int index); + static void print_optimization_stats(FILE *out, OptimizationStats *stats) { @@ -376,6 +368,7 @@ print_stats(FILE *out, PyStats *stats) } #ifdef Py_GIL_DISABLED + static void merge_specialization_stats(SpecializationStats *dest, const SpecializationStats *src) { @@ -521,7 +514,6 @@ merge_rare_event_stats(RareEventStats *dest, const RareEventStats *src) dest->watched_globals_modification += src->watched_globals_modification; } -#if 0 static void merge_gc_stats_array(GCStats *dest, const GCStats *src) { @@ -533,16 +525,15 @@ merge_gc_stats_array(GCStats *dest, const GCStats *src) dest[i].objects_not_transitively_reachable += src[i].objects_not_transitively_reachable; } } -#endif // merge stats for a single thread into the global structure void stats_merge_thread(_PyThreadStateImpl *tstate, bool zero) { PyStats *src = tstate->pystats_struct; - PyStats *dest = &pystats_struct; + PyStats *dest = ((PyThreadState *)tstate)->interp->pystats_struct; - if (src == NULL) { + if (src == NULL || dest == NULL) { return; } @@ -551,28 +542,21 @@ stats_merge_thread(_PyThreadStateImpl *tstate, bool zero) merge_call_stats(&dest->call_stats, &src->call_stats); merge_object_stats(&dest->object_stats, &src->object_stats); merge_optimization_stats(&dest->optimization_stats, &src->optimization_stats); -#ifdef Py_GIL_DISABLED merge_ft_stats(&dest->ft_stats, &src->ft_stats); -#endif merge_rare_event_stats(&dest->rare_event_stats, &src->rare_event_stats); - //merge_gc_stats_array(dest->gc_stats, src->gc_stats); + merge_gc_stats_array(dest->gc_stats, src->gc_stats); if (zero) { // Zero the source stat counters - memset(src, 0, sizeof(pystats_struct)); - src->gc_stats = pystats_gc; + memset(src, 0, sizeof(PyStats)); } } +#endif // Py_GIL_DISABLED // toggle stats collection on or off for all threads static void -stats_toggle_on_off(void) +stats_toggle_on_off(PyThreadState *tstate) { - PyThreadState *tstate = _PyThreadState_GET(); - if (tstate == NULL) { - return; - } - PyMutex_Lock(&pystats_mutex); PyInterpreterState *interp = tstate->interp; _Py_FOR_EACH_TSTATE_BEGIN(interp, ts) { if (!ts->_status.active) { @@ -580,8 +564,12 @@ stats_toggle_on_off(void) } _PyThreadStateImpl *ts_impl = (_PyThreadStateImpl *)ts; PyStats *s; - if (pystats_enabled) { + if (interp->pystats_enabled) { +#ifdef Py_GIL_DISABLED s = ts_impl->pystats_struct; +#else + s = ((PyThreadState *)tstate)->interp->pystats_struct; +#endif } else { s = NULL; @@ -590,62 +578,64 @@ stats_toggle_on_off(void) *ts_impl->pystats_tss = s; } _Py_FOR_EACH_TSTATE_END(interp); - PyMutex_Unlock(&pystats_mutex); } -#endif // Py_GIL_DISABLED // merge stats for all threads into the global structure static void 
stats_merge_all(void) { #ifdef Py_GIL_DISABLED - if (!pystats_was_enabled) { - return; - } PyThreadState *tstate = _PyThreadState_GET(); if (tstate == NULL) { return; } - PyMutex_Lock(&pystats_mutex); PyInterpreterState *interp = tstate->interp; _Py_FOR_EACH_TSTATE_BEGIN(interp, ts) { stats_merge_thread((_PyThreadStateImpl*)ts, true); } _Py_FOR_EACH_TSTATE_END(interp); - PyMutex_Unlock(&pystats_mutex); #endif } -void +int _Py_StatsOn(void) { - pystats_was_enabled = true; -#ifdef Py_GIL_DISABLED - pystats_enabled = true; - stats_toggle_on_off(); -#else - _Py_stats = &pystats_struct; -#endif + PyThreadState *tstate = _PyThreadState_GET(); + PyInterpreterState *interp = tstate->interp; + STATS_LOCK(interp); + tstate->interp->pystats_enabled = true; + if (interp->pystats_struct == NULL) { + interp->pystats_struct = PyMem_RawCalloc(1, sizeof(PyStats)); + if (interp->pystats_struct == NULL) { + STATS_UNLOCK(interp); + return -1; + } + } + stats_toggle_on_off(tstate); + STATS_UNLOCK(interp); + return 0; } void _Py_StatsOff(void) { -#ifdef Py_GIL_DISABLED - pystats_enabled = false; - stats_toggle_on_off(); -#else - _Py_stats = NULL; -#endif + PyThreadState *tstate = _PyThreadState_GET(); + STATS_LOCK(tstate->interp); + tstate->interp->pystats_enabled = false; + stats_toggle_on_off(tstate); + STATS_UNLOCK(tstate->interp); } void _Py_StatsClear(void) { - stats_merge_all(); - memset(&pystats_gc, 0, sizeof(pystats_gc)); - memset(&pystats_struct, 0, sizeof(pystats_struct)); - pystats_struct.gc_stats = pystats_gc; + PyThreadState *tstate = _PyThreadState_GET(); + STATS_LOCK(tstate->interp); + if (tstate->interp->pystats_struct != NULL) { + stats_merge_all(); + memset(tstate->interp->pystats_struct, 0, sizeof(PyStats)); + } + STATS_UNLOCK(tstate->interp); } static int @@ -664,8 +654,13 @@ int _Py_PrintSpecializationStats(int to_file) { assert(to_file); - stats_merge_all(); - PyStats *stats = &pystats_struct; + PyThreadState *tstate = _PyThreadState_GET(); + STATS_LOCK(tstate->interp); + PyStats *stats = tstate->interp->pystats_struct; + if (stats == NULL) { + STATS_UNLOCK(tstate->interp); + return 0; + } #define MEM_IS_ZERO(DATA) mem_is_zero((unsigned char*)DATA, sizeof(*(DATA))) int is_zero = ( MEM_IS_ZERO(stats->gc_stats) // is a pointer @@ -674,6 +669,7 @@ _Py_PrintSpecializationStats(int to_file) && MEM_IS_ZERO(&stats->object_stats) ); #undef MEM_IS_ZERO + STATS_UNLOCK(tstate->interp); if (is_zero) { // gh-108753: -X pystats command line was used, but then _stats_off() // and _stats_clear() have been called: in this case, avoid printing @@ -711,7 +707,9 @@ _Py_PrintSpecializationStats(int to_file) else { fprintf(out, "Specialization stats:\n"); } + STATS_LOCK(tstate->interp); print_stats(out, stats); + STATS_UNLOCK(tstate->interp); if (out != stderr) { fclose(out); } @@ -719,14 +717,22 @@ _Py_PrintSpecializationStats(int to_file) } bool -_PyStats_ThreadInit(_PyThreadStateImpl *tstate) -{ +_PyStats_ThreadInit(PyInterpreterState *interp, _PyThreadStateImpl *tstate) +{ + STATS_LOCK(interp); + if (interp->pystats_enabled) { + interp->pystats_struct = PyMem_RawCalloc(1, sizeof(PyStats)); + if (interp->pystats_struct == NULL) { + STATS_UNLOCK(interp); + return false; + } + } + STATS_UNLOCK(interp); #ifdef Py_GIL_DISABLED - tstate->pystats_struct = PyMem_RawCalloc(1, sizeof(pystats_struct)); + tstate->pystats_struct = PyMem_RawCalloc(1, sizeof(PyStats)); if (tstate->pystats_struct == NULL) { return false; } - tstate->pystats_struct->gc_stats = pystats_gc; #endif return true; } @@ -735,9 +741,7 @@ void 
_PyStats_ThreadFini(_PyThreadStateImpl *tstate) { #ifdef Py_GIL_DISABLED - if (pystats_was_enabled) { - stats_merge_thread(tstate, false); - } + stats_merge_thread(tstate, false); PyMem_RawFree(tstate->pystats_struct); #endif } @@ -745,28 +749,31 @@ _PyStats_ThreadFini(_PyThreadStateImpl *tstate) void _PyStats_Attach(_PyThreadStateImpl *tstate) { + PyStats *s; + PyInterpreterState *interp = ((PyThreadState *)tstate)->interp; + STATS_LOCK(interp); + if (interp->pystats_enabled) { #ifdef Py_GIL_DISABLED - PyStats *s; - if (pystats_enabled) { - s = tstate->pystats_struct; - } - else { - s = NULL; - } - // use correct TSS variable for thread - tstate->pystats_tss = &_Py_tss_stats; - // write to the tss variable for the 'ts' thread - _Py_tss_stats = s; + s = tstate->pystats_struct; +#else + s = ((PyThreadState *)tstate)->interp->pystats_struct; #endif + } + else { + s = NULL; + } + STATS_UNLOCK(interp); + // use correct TSS variable for thread + tstate->pystats_tss = &_Py_tss_stats; + // write to the tss variable for the 'ts' thread + _Py_tss_stats = s; } void _PyStats_Detach(_PyThreadStateImpl *tstate) { -#ifdef Py_GIL_DISABLED tstate->pystats_tss = NULL; _Py_tss_stats = NULL; -#endif } #endif // Py_STATS diff --git a/Python/sysmodule.c b/Python/sysmodule.c index ae6cf306735939..5aaca50f0ee15d 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2275,7 +2275,9 @@ static PyObject * sys__stats_on_impl(PyObject *module) /*[clinic end generated code: output=aca53eafcbb4d9fe input=43b5bfe145299e55]*/ { - _Py_StatsOn(); + if (_Py_StatsOn() < 0) { + return NULL; + } Py_RETURN_NONE; } diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index dc626e4bea0f59..22f90c743ba072 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -194,6 +194,7 @@ Python/pyfpe.c - PyFPE_counter - Python/import.c - pkgcontext - Python/pystate.c - _Py_tss_tstate - Python/pystate.c - _Py_tss_gilstate - +Python/pystats.c - _Py_tss_stats - ##----------------------- ## should be const From 60454a2e70407c415ee5d316d9221c22b550e4ee Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 29 Jul 2025 17:51:20 -0700 Subject: [PATCH 3/7] Add NEWS. --- .../2025-07-29-17-51-14.gh-issue-131253.GpRjWy.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-07-29-17-51-14.gh-issue-131253.GpRjWy.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-29-17-51-14.gh-issue-131253.GpRjWy.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-29-17-51-14.gh-issue-131253.GpRjWy.rst new file mode 100644 index 00000000000000..2826fad233058a --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-29-17-51-14.gh-issue-131253.GpRjWy.rst @@ -0,0 +1 @@ +Support the ``--enable-pystats`` build option for the free-threaded build. From c94e5c4325e9a98582d2f9834c8d34464b66db67 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 31 Jul 2025 13:27:00 -0700 Subject: [PATCH 4/7] Fix some data race issues. Need to do a merge before reporting (I lost that bit of code on a re-factor). Fix various issues with data races. When merging from all threads, we need to stop-the-world to avoid races. When toggling on or off, also need to stop-the-world. Remove the need for locking for _PyStats_Attach(). 
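As a rough sketch of the resulting merge pattern (simplified from the
change below; the helper name is illustrative, and the 'zero' flag,
NULL checks and the Py_GIL_DISABLED guard are omitted):

    static void
    merge_all_threads_sketch(PyInterpreterState *interp)
    {
        /* Pause every thread so no per-thread counter changes while
         * we read it. */
        _PyEval_StopTheWorld(interp);
        _Py_FOR_EACH_TSTATE_UNLOCKED(interp, ts) {
            /* Add this thread's counters into interp->pystats_struct
             * and reset the per-thread copy to zero. */
            stats_merge_thread((_PyThreadStateImpl *)ts, true);
        }
        _PyEval_StartTheWorld(interp);
    }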
--- Include/internal/pycore_interp_structs.h | 2 +- Objects/object.c | 2 +- Python/pystats.c | 92 +++++++++++++----------- 3 files changed, 53 insertions(+), 43 deletions(-) diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index e4a90c2286851b..ebc5fc9ac9fd61 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -973,7 +973,7 @@ struct _is { #ifdef Py_STATS // true if recording of pystats is on, this is used when new threads // are created to decide if recording should be on for them - bool pystats_enabled; + int pystats_enabled; // allocated when (and if) stats are first enabled PyStats *pystats_struct; #ifdef Py_GIL_DISABLED diff --git a/Objects/object.c b/Objects/object.c index 62a27ae1f4e585..31bf81b826e1d8 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -2423,7 +2423,7 @@ _PyObject_InitState(PyInterpreterState *interp) if (interp->config._pystats) { // start with pystats enabled, can be disabled via sys._stats_off() // this needs to be set before the first tstate is created - interp->pystats_enabled = true; + interp->pystats_enabled = 1; } #endif return _PyStatus_OK(); diff --git a/Python/pystats.c b/Python/pystats.c index 24564faf3bb3f4..9715d6ddb94384 100644 --- a/Python/pystats.c +++ b/Python/pystats.c @@ -1,9 +1,10 @@ #include "Python.h" #include "pycore_opcode_metadata.h" // _PyOpcode_Caches +#include "pycore_pyatomic_ft_wrappers.h" +#include "pycore_pylifecycle.h" // _PyOS_URandomNonblock() #include "pycore_uop_metadata.h" // _PyOpcode_uop_name #include "pycore_uop_ids.h" // MAX_UOP_ID -#include "pycore_pylifecycle.h" // _PyOS_URandomNonblock() #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_runtime.h" // NUM_GENERATIONS @@ -105,7 +106,7 @@ add_stat_dict( PyObject* _Py_GetSpecializationStats(void) { PyThreadState *tstate = _PyThreadState_GET(); - PyStats *src = tstate->interp->pystats_struct; + PyStats *src = FT_ATOMIC_LOAD_PTR_RELAXED(tstate->interp->pystats_struct); if (src == NULL) { Py_RETURN_NONE; } @@ -554,11 +555,30 @@ stats_merge_thread(_PyThreadStateImpl *tstate, bool zero) #endif // Py_GIL_DISABLED // toggle stats collection on or off for all threads -static void -stats_toggle_on_off(PyThreadState *tstate) +static int +stats_toggle_on_off(PyThreadState *tstate, int on) { + bool changed = false; PyInterpreterState *interp = tstate->interp; - _Py_FOR_EACH_TSTATE_BEGIN(interp, ts) { + STATS_LOCK(interp); + if (on && interp->pystats_struct == NULL) { + PyStats *s = PyMem_RawCalloc(1, sizeof(PyStats)); + if (s == NULL) { + STATS_UNLOCK(interp); + return -1; + } + FT_ATOMIC_STORE_PTR_RELAXED(interp->pystats_struct, s); + } + if (tstate->interp->pystats_enabled != on) { + FT_ATOMIC_STORE_INT_RELAXED(tstate->interp->pystats_enabled, on); + changed = true; + } + STATS_UNLOCK(interp); + if (!changed) { + return 0; + } + _PyEval_StopTheWorld(interp); + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, ts) { if (!ts->_status.active) { continue; } @@ -577,65 +597,53 @@ stats_toggle_on_off(PyThreadState *tstate) // write to the tss variable for the 'ts' thread *ts_impl->pystats_tss = s; } - _Py_FOR_EACH_TSTATE_END(interp); + _PyEval_StartTheWorld(interp); + return 0; } -// merge stats for all threads into the global structure +// merge stats for all threads into the per-interpreter structure +// if 'zero' is true then the per-interpreter stats are zeroed after merging static void -stats_merge_all(void) +stats_merge_all(bool zero) { -#ifdef Py_GIL_DISABLED PyThreadState 
*tstate = _PyThreadState_GET(); if (tstate == NULL) { return; } + if (FT_ATOMIC_LOAD_PTR_RELAXED(tstate->interp->pystats_struct) == NULL) { + return; + } PyInterpreterState *interp = tstate->interp; - _Py_FOR_EACH_TSTATE_BEGIN(interp, ts) { - stats_merge_thread((_PyThreadStateImpl*)ts, true); + _PyEval_StopTheWorld(interp); +#ifdef Py_GIL_DISABLED + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, ts) { + stats_merge_thread((_PyThreadStateImpl *)ts, true); } - _Py_FOR_EACH_TSTATE_END(interp); #endif + if (zero) { + memset(interp->pystats_struct, 0, sizeof(PyStats)); + } + _PyEval_StartTheWorld(interp); } int _Py_StatsOn(void) { PyThreadState *tstate = _PyThreadState_GET(); - PyInterpreterState *interp = tstate->interp; - STATS_LOCK(interp); - tstate->interp->pystats_enabled = true; - if (interp->pystats_struct == NULL) { - interp->pystats_struct = PyMem_RawCalloc(1, sizeof(PyStats)); - if (interp->pystats_struct == NULL) { - STATS_UNLOCK(interp); - return -1; - } - } - stats_toggle_on_off(tstate); - STATS_UNLOCK(interp); - return 0; + return stats_toggle_on_off(tstate, 1); } void _Py_StatsOff(void) { PyThreadState *tstate = _PyThreadState_GET(); - STATS_LOCK(tstate->interp); - tstate->interp->pystats_enabled = false; - stats_toggle_on_off(tstate); - STATS_UNLOCK(tstate->interp); + stats_toggle_on_off(tstate, 0); } void _Py_StatsClear(void) { - PyThreadState *tstate = _PyThreadState_GET(); - STATS_LOCK(tstate->interp); - if (tstate->interp->pystats_struct != NULL) { - stats_merge_all(); - memset(tstate->interp->pystats_struct, 0, sizeof(PyStats)); - } - STATS_UNLOCK(tstate->interp); + stats_merge_all(true); } static int @@ -654,6 +662,7 @@ int _Py_PrintSpecializationStats(int to_file) { assert(to_file); + stats_merge_all(false); PyThreadState *tstate = _PyThreadState_GET(); STATS_LOCK(tstate->interp); PyStats *stats = tstate->interp->pystats_struct; @@ -721,11 +730,12 @@ _PyStats_ThreadInit(PyInterpreterState *interp, _PyThreadStateImpl *tstate) { STATS_LOCK(interp); if (interp->pystats_enabled) { - interp->pystats_struct = PyMem_RawCalloc(1, sizeof(PyStats)); - if (interp->pystats_struct == NULL) { + PyStats *s = PyMem_RawCalloc(1, sizeof(PyStats)); + if (s == NULL) { STATS_UNLOCK(interp); return false; } + FT_ATOMIC_STORE_PTR_RELAXED(interp->pystats_struct, s); } STATS_UNLOCK(interp); #ifdef Py_GIL_DISABLED @@ -741,7 +751,9 @@ void _PyStats_ThreadFini(_PyThreadStateImpl *tstate) { #ifdef Py_GIL_DISABLED + STATS_LOCK(((PyThreadState *)tstate)->interp); stats_merge_thread(tstate, false); + STATS_UNLOCK(((PyThreadState *)tstate)->interp); PyMem_RawFree(tstate->pystats_struct); #endif } @@ -751,8 +763,7 @@ _PyStats_Attach(_PyThreadStateImpl *tstate) { PyStats *s; PyInterpreterState *interp = ((PyThreadState *)tstate)->interp; - STATS_LOCK(interp); - if (interp->pystats_enabled) { + if (FT_ATOMIC_LOAD_INT_RELAXED(interp->pystats_enabled)) { #ifdef Py_GIL_DISABLED s = tstate->pystats_struct; #else @@ -762,7 +773,6 @@ _PyStats_Attach(_PyThreadStateImpl *tstate) else { s = NULL; } - STATS_UNLOCK(interp); // use correct TSS variable for thread tstate->pystats_tss = &_Py_tss_stats; // write to the tss variable for the 'ts' thread From e77af2c703999ee20ebffa8380931a806a10e9d0 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 6 Aug 2025 15:49:46 -0700 Subject: [PATCH 5/7] Remove _Py_tss_stats thread local. Add pystats pointer to PyThreadState and use from there instead. This is slightly slower but shouldn't matter in practice. This simplifies the attach/detach logic as well. 
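For illustration only (the helper name below is made up and is not part
of the diff), a counter update after this change boils down to reading
the pointer stored on the thread state and skipping the work when
recording is off:

    static void
    count_incref_sketch(PyThreadState *tstate)
    {
        PyStats *s = tstate->pystats;   /* NULL when recording is off */
        if (s != NULL) {
            s->object_stats.increfs++;
        }
    }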
--- Include/cpython/pystate.h | 19 +++++++++++++++ Include/cpython/pystats.h | 24 +++++++------------ Include/internal/pycore_tstate.h | 11 ++++----- Python/pystats.c | 35 +++++++++++++--------------- Tools/c-analyzer/cpython/ignored.tsv | 1 - 5 files changed, 48 insertions(+), 42 deletions(-) diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index be582122118e44..d7eef3edaf69c2 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -208,6 +208,10 @@ struct _ts { */ PyObject *threading_local_sentinel; _PyRemoteDebuggerSupport remote_debugger_support; + +#ifdef Py_STATS + PyStats *pystats; // pointer PyStats structure, NULL if recording is off +#endif }; /* other API */ @@ -230,6 +234,21 @@ PyAPI_FUNC(void) PyThreadState_EnterTracing(PyThreadState *tstate); // function is set, otherwise disable them. PyAPI_FUNC(void) PyThreadState_LeaveTracing(PyThreadState *tstate); +#ifdef Py_STATS +#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE) +extern _Py_thread_local PyThreadState* _Py_tss_tstate; + +static inline PyStats* +_PyThreadState_GetStatsFast(void) +{ + if (_Py_tss_tstate == NULL) { + return NULL; // no attached thread state + } + return _Py_tss_tstate->pystats; +} +#endif +#endif // Py_STATS + /* PyGILState */ /* Helper/diagnostic function - return 1 if the current thread diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index cfa28da9d0170d..d2c976955813e6 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -14,7 +14,7 @@ // - sys._stats_dump() // // Python must be built with ./configure --enable-pystats to define the -// _PyStats_GET() function. +// _PyStats_GET() macro. // // Define _PY_INTERPRETER macro to increment interpreter_increfs and // interpreter_decrefs. Otherwise, increment increfs and decrefs. @@ -112,8 +112,11 @@ typedef struct _gc_stats { #ifdef Py_GIL_DISABLED // stats specific to free-threaded build typedef struct _ft_stats { + // number of times interpreter had to spin or park when trying to acquire a mutex uint64_t mutex_sleeps; + // number of times that the QSBR mechanism polled (compute read sequence value) uint64_t qsbr_polls; + // number of times stop-the-world mechanism was used uint64_t world_stops; } FTStats; #endif @@ -189,25 +192,16 @@ typedef struct _stats { GCStats gc_stats[3]; // must match NUM_GENERATIONS } PyStats; - -#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE) -extern _Py_thread_local PyStats *_Py_tss_stats; -#endif - -// Export for most shared extensions, used via _PyStats_GET() static -// inline function. +// Export for most shared extensions PyAPI_FUNC(PyStats *) _PyStats_GetLocal(void); -// Return pointer to the PyStats structure, NULL if recording is off. -static inline PyStats* -_PyStats_GET(void) -{ #if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE) - return _Py_tss_stats; +// use inline function version defined in cpython/pystate.h +static inline PyStats* _PyThreadState_GetStatsFast(void); +#define _PyStats_GET _PyThreadState_GetStatsFast #else - return _PyStats_GetLocal(); +#define _PyStats_GET _PyStats_GetLocal #endif -} #define _Py_STATS_EXPR(expr) \ do { \ diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 11fc55dc238eb7..29ebdfd7e01613 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -71,16 +71,13 @@ typedef struct _PyThreadStateImpl { // When >1, code objects do not immortalize their non-string constants. 
int suppress_co_const_immortalization; -#endif // Py_GIL_DISABLED - #ifdef Py_STATS -#ifdef Py_GIL_DISABLED - // per-thread stats, will be merged into interp->pystats_struct - PyStats *pystats_struct; // allocated by _PyStats_ThreadInit() -#endif - PyStats **pystats_tss; // pointer to tss variable + // per-thread stats, will be merged into interp->pystats_struct + PyStats *pystats_struct; // allocated by _PyStats_ThreadInit() #endif +#endif // Py_GIL_DISABLED + #if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED) Py_ssize_t reftotal; // this thread's total refcount operations #endif diff --git a/Python/pystats.c b/Python/pystats.c index 9715d6ddb94384..0f95646b536285 100644 --- a/Python/pystats.c +++ b/Python/pystats.c @@ -3,6 +3,7 @@ #include "pycore_opcode_metadata.h" // _PyOpcode_Caches #include "pycore_pyatomic_ft_wrappers.h" #include "pycore_pylifecycle.h" // _PyOS_URandomNonblock() +#include "pycore_tstate.h" #include "pycore_uop_metadata.h" // _PyOpcode_uop_name #include "pycore_uop_ids.h" // MAX_UOP_ID #include "pycore_pystate.h" // _PyThreadState_GET() @@ -12,13 +13,14 @@ #ifdef Py_STATS -// Pointer to Thread-local stats structure, null if recording is off. -_Py_thread_local PyStats *_Py_tss_stats; - PyStats * _PyStats_GetLocal(void) { - return _Py_tss_stats; + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate) { + return tstate->pystats; + } + return NULL; } #ifdef Py_GIL_DISABLED @@ -582,11 +584,10 @@ stats_toggle_on_off(PyThreadState *tstate, int on) if (!ts->_status.active) { continue; } - _PyThreadStateImpl *ts_impl = (_PyThreadStateImpl *)ts; PyStats *s; if (interp->pystats_enabled) { #ifdef Py_GIL_DISABLED - s = ts_impl->pystats_struct; + s = ((_PyThreadStateImpl *)ts)->pystats_struct; #else s = ((PyThreadState *)tstate)->interp->pystats_struct; #endif @@ -594,8 +595,7 @@ stats_toggle_on_off(PyThreadState *tstate, int on) else { s = NULL; } - // write to the tss variable for the 'ts' thread - *ts_impl->pystats_tss = s; + ts->pystats = s; } _PyEval_StartTheWorld(interp); return 0; @@ -759,31 +759,28 @@ _PyStats_ThreadFini(_PyThreadStateImpl *tstate) } void -_PyStats_Attach(_PyThreadStateImpl *tstate) +_PyStats_Attach(_PyThreadStateImpl *tstate_impl) { PyStats *s; - PyInterpreterState *interp = ((PyThreadState *)tstate)->interp; + PyThreadState *tstate = (PyThreadState *)tstate_impl; + PyInterpreterState *interp = tstate->interp; if (FT_ATOMIC_LOAD_INT_RELAXED(interp->pystats_enabled)) { #ifdef Py_GIL_DISABLED - s = tstate->pystats_struct; + s = ((_PyThreadStateImpl *)tstate)->pystats_struct; #else - s = ((PyThreadState *)tstate)->interp->pystats_struct; + s = tstate->interp->pystats_struct; #endif } else { s = NULL; } - // use correct TSS variable for thread - tstate->pystats_tss = &_Py_tss_stats; - // write to the tss variable for the 'ts' thread - _Py_tss_stats = s; + tstate->pystats = s; } void -_PyStats_Detach(_PyThreadStateImpl *tstate) +_PyStats_Detach(_PyThreadStateImpl *tstate_impl) { - tstate->pystats_tss = NULL; - _Py_tss_stats = NULL; + ((PyThreadState *)tstate_impl)->pystats = NULL; } #endif // Py_STATS diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index 22f90c743ba072..dc626e4bea0f59 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -194,7 +194,6 @@ Python/pyfpe.c - PyFPE_counter - Python/import.c - pkgcontext - Python/pystate.c - _Py_tss_tstate - Python/pystate.c - _Py_tss_gilstate - -Python/pystats.c - _Py_tss_stats - ##----------------------- ## should be const 
From a48c7f6d2b824e7c97616a3804222ac773dc5113 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 7 Aug 2025 14:50:56 -0700 Subject: [PATCH 6/7] Improve _Py_STATS_COND_EXPR macro. --- Include/cpython/pystats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index d2c976955813e6..ad3af6da1849ce 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -214,7 +214,7 @@ static inline PyStats* _PyThreadState_GetStatsFast(void); #define _Py_STATS_COND_EXPR(cond, expr) \ do { \ PyStats *s = _PyStats_GET(); \ - if (s != NULL && cond) { \ + if (s != NULL && (cond)) { \ s->expr; \ } \ } while (0) From 612ea96430525e0b0c54de1c53abdae311d2f746 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 7 Aug 2025 14:56:58 -0700 Subject: [PATCH 7/7] Improve comment for ts->pystats member. --- Include/cpython/pystate.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index c030b8a1938069..41bf65e8df9522 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -213,7 +213,12 @@ struct _ts { _PyRemoteDebuggerSupport remote_debugger_support; #ifdef Py_STATS - PyStats *pystats; // pointer PyStats structure, NULL if recording is off + // Pointer PyStats structure, NULL if recording is off. For the + // free-threaded build, the structure is per-thread (stored as a pointer + // in _PyThreadStateImpl). For the default build, the structure is stored + // in the PyInterpreterState structure (threads do not have their own + // structure and all share the same per-interpreter structure). + PyStats *pystats; #endif };
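A note on the _Py_STATS_COND_EXPR tweak in PATCH 6/7: the parentheses
around the expanded `cond` matter whenever the caller passes an
expression built from a lower-precedence operator. With hypothetical
flags (names made up for illustration):

    /* Without the parentheses the macro body expands to
     *     if (s != NULL && flag_a || flag_b) { ... }
     * which, because && binds tighter than ||, dereferences a NULL
     * `s` whenever recording is off and flag_b is true.  With (cond)
     * it expands to
     *     if (s != NULL && (flag_a || flag_b)) { ... }
     * which is the intended guard. */
    _Py_STATS_COND_EXPR(flag_a || flag_b, object_stats.increfs++);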