diff --git a/.github/workflows/jit.yml b/.github/workflows/jit.yml index c32bf4fd63cc8f..69d900091a3bd1 100644 --- a/.github/workflows/jit.yml +++ b/.github/workflows/jit.yml @@ -68,7 +68,7 @@ jobs: - true - false llvm: - - 19 + - 20 include: - target: i686-pc-windows-msvc/msvc architecture: Win32 @@ -138,7 +138,7 @@ jobs: fail-fast: false matrix: llvm: - - 19 + - 20 steps: - uses: actions/checkout@v4 with: @@ -166,7 +166,7 @@ jobs: fail-fast: false matrix: llvm: - - 19 + - 20 steps: - uses: actions/checkout@v4 with: @@ -183,3 +183,27 @@ jobs: - name: Run tests without optimizations run: | PYTHON_UOPS_OPTIMIZE=0 ./python -m test --multiprocess 0 --timeout 4500 --verbose2 --verbose3 + + tail-call-jit: + name: JIT with tail calling interpreter + needs: interpreter + runs-on: ubuntu-24.04 + timeout-minutes: 90 + strategy: + fail-fast: false + matrix: + llvm: + - 20 + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Build with JIT and tailcall + run: | + sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" ./llvm.sh ${{ matrix.llvm }} + export PATH="$(llvm-config-${{ matrix.llvm }} --bindir):$PATH" + CC=clang-${{ matrix.llvm }} ./configure --enable-experimental-jit --with-tail-call-interp --with-pydebug + make all --jobs 4 diff --git a/.github/workflows/reusable-wasi.yml b/.github/workflows/reusable-wasi.yml index 7c40e566b2a0d1..18feb564822116 100644 --- a/.github/workflows/reusable-wasi.yml +++ b/.github/workflows/reusable-wasi.yml @@ -60,24 +60,24 @@ jobs: with: path: ${{ env.CROSS_BUILD_PYTHON }}/config.cache # Include env.pythonLocation in key to avoid changes in environment when setup-python updates Python. - # Include the hash of `Tools/wasm/wasi.py` as it may change the environment variables. + # Include the hash of `Tools/wasm/wasi/__main__.py` as it may change the environment variables. # (Make sure to keep the key in sync with the other config.cache step below.) - key: ${{ github.job }}-${{ env.IMAGE_OS_VERSION }}-${{ env.WASI_SDK_VERSION }}-${{ env.WASMTIME_VERSION }}-${{ inputs.config_hash }}-${{ hashFiles('Tools/wasm/wasi.py') }}-${{ env.pythonLocation }} + key: ${{ github.job }}-${{ env.IMAGE_OS_VERSION }}-${{ env.WASI_SDK_VERSION }}-${{ env.WASMTIME_VERSION }}-${{ inputs.config_hash }}-${{ hashFiles('Tools/wasm/wasi/__main__.py') }}-${{ env.pythonLocation }} - name: "Configure build Python" - run: python3 Tools/wasm/wasi.py configure-build-python -- --config-cache --with-pydebug + run: python3 Tools/wasm/wasi configure-build-python -- --config-cache --with-pydebug - name: "Make build Python" - run: python3 Tools/wasm/wasi.py make-build-python + run: python3 Tools/wasm/wasi make-build-python - name: "Restore host config.cache" uses: actions/cache@v4 with: path: ${{ env.CROSS_BUILD_WASI }}/config.cache # Should be kept in sync with the other config.cache step above. 
- key: ${{ github.job }}-${{ env.IMAGE_OS_VERSION }}-${{ env.WASI_SDK_VERSION }}-${{ env.WASMTIME_VERSION }}-${{ inputs.config_hash }}-${{ hashFiles('Tools/wasm/wasi.py') }}-${{ env.pythonLocation }} + key: ${{ github.job }}-${{ env.IMAGE_OS_VERSION }}-${{ env.WASI_SDK_VERSION }}-${{ env.WASMTIME_VERSION }}-${{ inputs.config_hash }}-${{ hashFiles('Tools/wasm/wasi/__main__.py') }}-${{ env.pythonLocation }} - name: "Configure host" # `--with-pydebug` inferred from configure-build-python - run: python3 Tools/wasm/wasi.py configure-host -- --config-cache + run: python3 Tools/wasm/wasi configure-host -- --config-cache - name: "Make host" - run: python3 Tools/wasm/wasi.py make-host + run: python3 Tools/wasm/wasi make-host - name: "Display build info" run: make --directory "${CROSS_BUILD_WASI}" pythoninfo - name: "Test" diff --git a/Doc/c-api/cell.rst b/Doc/c-api/cell.rst index 61eb994c370946..2501ed9580df89 100644 --- a/Doc/c-api/cell.rst +++ b/Doc/c-api/cell.rst @@ -7,7 +7,7 @@ Cell Objects "Cell" objects are used to implement variables referenced by multiple scopes. For each such variable, a cell object is created to store the value; the local -variables of each stack frame that references the value contains a reference to +variables of each stack frame that references the value contain a reference to the cells from outer scopes which also use that variable. When the value is accessed, the value contained in the cell is used instead of the cell object itself. This de-referencing of the cell object requires support from the diff --git a/Doc/c-api/complex.rst b/Doc/c-api/complex.rst index d135637a7417ab..629312bd771beb 100644 --- a/Doc/c-api/complex.rst +++ b/Doc/c-api/complex.rst @@ -82,7 +82,7 @@ Complex Number Objects .. c:type:: Py_complex - This C structure defines export format for a Python complex + This C structure defines an export format for a Python complex number object. .. c:member:: double real diff --git a/Doc/c-api/datetime.rst b/Doc/c-api/datetime.rst index d2d4d5309c7098..f311aad5f15499 100644 --- a/Doc/c-api/datetime.rst +++ b/Doc/c-api/datetime.rst @@ -46,7 +46,7 @@ macros. .. c:var:: PyTypeObject PyDateTime_DeltaType - This instance of :c:type:`PyTypeObject` represents Python type for + This instance of :c:type:`PyTypeObject` represents the Python type for the difference between two datetime values; it is the same object as :class:`datetime.timedelta` in the Python layer. diff --git a/Doc/c-api/descriptor.rst b/Doc/c-api/descriptor.rst index b32c113e5f0457..ff0df575279d96 100644 --- a/Doc/c-api/descriptor.rst +++ b/Doc/c-api/descriptor.rst @@ -32,7 +32,7 @@ found in the dictionary of type objects. .. c:function:: int PyDescr_IsData(PyObject *descr) - Return non-zero if the descriptor objects *descr* describes a data attribute, or + Return non-zero if the descriptor object *descr* describes a data attribute, or ``0`` if it describes a method. *descr* must be a descriptor object; there is no error checking. diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst index b20495e672dd75..c345029e4acd49 100644 --- a/Doc/c-api/init_config.rst +++ b/Doc/c-api/init_config.rst @@ -102,7 +102,7 @@ Error Handling * Set *\*err_msg* and return ``1`` if an error is set. * Set *\*err_msg* to ``NULL`` and return ``0`` otherwise. - An error message is an UTF-8 encoded string. + An error message is a UTF-8 encoded string. If *config* has an exit code, format the exit code as an error message. 
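Note on the init_config.rst hunk above: the corrected sentence describes the error-query pattern used with PyInitConfig. A minimal C sketch of that calling pattern, assuming the PyInitConfig_Create()/PyInitConfig_GetError() functions documented elsewhere in Doc/c-api/init_config.rst (illustrative only, not part of this patch):

    #include <Python.h>
    #include <stdio.h>

    static void
    report_init_error(PyInitConfig *config)
    {
        const char *err_msg;
        if (PyInitConfig_GetError(config, &err_msg)) {
            /* err_msg is a UTF-8 encoded string; if the config carries an
               exit code, it is formatted as an error message as well. */
            fprintf(stderr, "Python initialization failed: %s\n", err_msg);
        }
    }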
diff --git a/Doc/c-api/mapping.rst b/Doc/c-api/mapping.rst index 1f55c0aa955c75..2476ebb9b69dce 100644 --- a/Doc/c-api/mapping.rst +++ b/Doc/c-api/mapping.rst @@ -102,7 +102,7 @@ See also :c:func:`PyObject_GetItem`, :c:func:`PyObject_SetItem` and .. note:: - Exceptions which occur when this calls :meth:`~object.__getitem__` + Exceptions which occur when this calls the :meth:`~object.__getitem__` method are silently ignored. For proper error handling, use :c:func:`PyMapping_HasKeyWithError`, :c:func:`PyMapping_GetOptionalItem` or :c:func:`PyObject_GetItem()` instead. @@ -116,7 +116,7 @@ See also :c:func:`PyObject_GetItem`, :c:func:`PyObject_SetItem` and .. note:: - Exceptions that occur when this calls :meth:`~object.__getitem__` + Exceptions that occur when this calls the :meth:`~object.__getitem__` method or while creating the temporary :class:`str` object are silently ignored. For proper error handling, use :c:func:`PyMapping_HasKeyStringWithError`, diff --git a/Doc/c-api/marshal.rst b/Doc/c-api/marshal.rst index 61218a1bf6f171..668a163b2df5a1 100644 --- a/Doc/c-api/marshal.rst +++ b/Doc/c-api/marshal.rst @@ -82,7 +82,7 @@ The following functions allow marshalled values to be read back in. assumes that no further objects will be read from the file, allowing it to aggressively load file data into memory so that the de-serialization can operate from data in memory rather than reading a byte at a time from the - file. Only use these variant if you are certain that you won't be reading + file. Only use this variant if you are certain that you won't be reading anything else from the file. On error, sets the appropriate exception (:exc:`EOFError`, :exc:`ValueError` diff --git a/Doc/c-api/memory.rst b/Doc/c-api/memory.rst index df1bb0ce370919..2395898010214d 100644 --- a/Doc/c-api/memory.rst +++ b/Doc/c-api/memory.rst @@ -102,7 +102,7 @@ All allocating functions belong to one of three different "domains" (see also strategies and are optimized for different purposes. The specific details on how every domain allocates memory or what internal functions each domain calls is considered an implementation detail, but for debugging purposes a simplified -table can be found at :ref:`here `. +table can be found at :ref:`default-memory-allocators`. The APIs used to allocate and free a block of memory must be from the same domain. For example, :c:func:`PyMem_Free` must be used to free memory allocated using :c:func:`PyMem_Malloc`. diff --git a/Doc/c-api/module.rst b/Doc/c-api/module.rst index 6626f628fcc47f..ed2a7663375920 100644 --- a/Doc/c-api/module.rst +++ b/Doc/c-api/module.rst @@ -103,7 +103,7 @@ Module Objects created, or ``NULL`` if the module wasn't created from a definition. On error, return ``NULL`` with an exception set. - Use :c:func:`PyErr_Occurred` to tell this case apart from a mising + Use :c:func:`PyErr_Occurred` to tell this case apart from a missing :c:type:`!PyModuleDef`. diff --git a/Doc/c-api/monitoring.rst b/Doc/c-api/monitoring.rst index 7926148302af0b..b0227c2f4faf15 100644 --- a/Doc/c-api/monitoring.rst +++ b/Doc/c-api/monitoring.rst @@ -136,7 +136,7 @@ Managing the Monitoring State ----------------------------- Monitoring states can be managed with the help of monitoring scopes. A scope -would typically correspond to a python function. +would typically correspond to a Python function. .. 
c:function:: int PyMonitoring_EnterScope(PyMonitoringState *state_array, uint64_t *version, const uint8_t *event_types, Py_ssize_t length) diff --git a/Doc/c-api/object.rst b/Doc/c-api/object.rst index 8629b768a2948e..96353266ac7300 100644 --- a/Doc/c-api/object.rst +++ b/Doc/c-api/object.rst @@ -73,7 +73,7 @@ Object Protocol Flag to be used with multiple functions that print the object (like :c:func:`PyObject_Print` and :c:func:`PyFile_WriteObject`). - If passed, these function would use the :func:`str` of the object + If passed, these functions use the :func:`str` of the object instead of the :func:`repr`. diff --git a/Doc/c-api/stable.rst b/Doc/c-api/stable.rst index b08a7bf1b2fa4f..f5e6b7ad157e99 100644 --- a/Doc/c-api/stable.rst +++ b/Doc/c-api/stable.rst @@ -279,7 +279,7 @@ The full API is described below for advanced use cases. .. c:member:: uint8_t abiinfo_minor_version - The major version of :c:struct:`PyABIInfo`. + The minor version of :c:struct:`PyABIInfo`. Must be set to ``0``; larger values are reserved for backwards-compatible future versions of :c:struct:`!PyABIInfo`. diff --git a/Doc/c-api/tuple.rst b/Doc/c-api/tuple.rst index 14a7c05efeac87..d0add48d7e8457 100644 --- a/Doc/c-api/tuple.rst +++ b/Doc/c-api/tuple.rst @@ -61,7 +61,7 @@ Tuple Objects .. c:function:: Py_ssize_t PyTuple_Size(PyObject *p) Take a pointer to a tuple object, and return the size of that tuple. - On error, return ``-1`` and with an exception set. + On error, return ``-1`` with an exception set. .. c:function:: Py_ssize_t PyTuple_GET_SIZE(PyObject *p) diff --git a/Doc/c-api/veryhigh.rst b/Doc/c-api/veryhigh.rst index ee0595a9e089c9..0b2b55b6387bd1 100644 --- a/Doc/c-api/veryhigh.rst +++ b/Doc/c-api/veryhigh.rst @@ -140,7 +140,7 @@ the same library that the Python runtime is using. interpreter prompt is about to become idle and wait for user input from the terminal. The return value is ignored. Overriding this hook can be used to integrate the interpreter's prompt with other - event loops, as done in the :file:`Modules/_tkinter.c` in the + event loops, as done in :file:`Modules/_tkinter.c` in the Python source code. .. versionchanged:: 3.12 diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 2a5994b11d83d9..305e5d07a3529e 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1088,7 +1088,7 @@ alias for the ``'utf_8'`` codec. refer to the source :source:`aliases.py ` file. On Windows, ``cpXXX`` codecs are available for all code pages. -But only codecs listed in the following table are guarantead to exist on +But only codecs listed in the following table are guaranteed to exist on other platforms. .. impl-detail:: diff --git a/Doc/library/idle.rst b/Doc/library/idle.rst index fabea611e0ebcd..e547c96b580bfd 100644 --- a/Doc/library/idle.rst +++ b/Doc/library/idle.rst @@ -204,7 +204,7 @@ New Indent Width Open a dialog to change indent width. The accepted default by the Python community is 4 spaces. -Strip Trailing Chitespace +Strip Trailing Whitespace Remove trailing space and other whitespace characters after the last non-whitespace character of a line by applying str.rstrip to each line, including lines within multiline strings. 
Except for Shell windows, diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index ac8798ff6129a0..dd2ea1202b3795 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -217,6 +217,15 @@ struct _ts { */ PyObject *threading_local_sentinel; _PyRemoteDebuggerSupport remote_debugger_support; + +#ifdef Py_STATS + // Pointer to PyStats structure, NULL if recording is off. For the + // free-threaded build, the structure is per-thread (stored as a pointer + // in _PyThreadStateImpl). For the default build, the structure is stored + // in the PyInterpreterState structure (threads do not have their own + // structure and all share the same per-interpreter structure). + PyStats *pystats; +#endif }; /* other API */ @@ -239,6 +248,21 @@ PyAPI_FUNC(void) PyThreadState_EnterTracing(PyThreadState *tstate); // function is set, otherwise disable them. PyAPI_FUNC(void) PyThreadState_LeaveTracing(PyThreadState *tstate); +#ifdef Py_STATS +#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE) +extern _Py_thread_local PyThreadState *_Py_tss_tstate; + +static inline PyStats* +_PyThreadState_GetStatsFast(void) +{ + if (_Py_tss_tstate == NULL) { + return NULL; // no attached thread state + } + return _Py_tss_tstate->pystats; +} +#endif +#endif // Py_STATS + /* PyGILState */ /* Helper/diagnostic function - return 1 if the current thread diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index cf830b6066f4ab..d0a925a3055485 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -4,7 +4,7 @@ // // - _Py_INCREF_STAT_INC() and _Py_DECREF_STAT_INC() used by Py_INCREF() // and Py_DECREF(). -// - _Py_stats variable +// - _PyStats_GET() // // Functions of the sys module: // @@ -14,7 +14,7 @@ // - sys._stats_dump() // // Python must be built with ./configure --enable-pystats to define the -// Py_STATS macro. +// _PyStats_GET() macro. // // Define _PY_INTERPRETER macro to increment interpreter_increfs and // interpreter_decrefs. Otherwise, increment increfs and decrefs. 
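The _PyThreadState_GetStatsFast() inline added above is the fast path behind the reworked stats macros: it reads the new per-thread `pystats` pointer through the thread-local `_Py_tss_tstate`. Roughly, this is what a call site such as _Py_INCREF_STAT_INC() expands to once _PyStats_GET resolves to the fast accessor (a sketch for a Py_STATS build with HAVE_THREAD_LOCAL; it mirrors the _Py_STATS_EXPR macro introduced below rather than being new behaviour):

    static inline void
    example_incref_stat(void)
    {
        PyStats *s = _PyThreadState_GetStatsFast();
        if (s != NULL) {              /* NULL whenever recording is off for this thread */
            s->object_stats.increfs++;
        }
    }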
@@ -109,6 +109,18 @@ typedef struct _gc_stats { uint64_t objects_not_transitively_reachable; } GCStats; +#ifdef Py_GIL_DISABLED +// stats specific to free-threaded build +typedef struct _ft_stats { + // number of times interpreter had to spin or park when trying to acquire a mutex + uint64_t mutex_sleeps; + // number of times that the QSBR mechanism polled (compute read sequence value) + uint64_t qsbr_polls; + // number of times stop-the-world mechanism was used + uint64_t world_stops; +} FTStats; +#endif + typedef struct _uop_stats { uint64_t execution_count; uint64_t miss; @@ -173,22 +185,48 @@ typedef struct _stats { CallStats call_stats; ObjectStats object_stats; OptimizationStats optimization_stats; +#ifdef Py_GIL_DISABLED + FTStats ft_stats; +#endif RareEventStats rare_event_stats; - GCStats *gc_stats; + GCStats gc_stats[3]; // must match NUM_GENERATIONS } PyStats; +// Export for most shared extensions +PyAPI_FUNC(PyStats *) _PyStats_GetLocal(void); + +#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE) +// use inline function version defined in cpython/pystate.h +static inline PyStats *_PyThreadState_GetStatsFast(void); +#define _PyStats_GET _PyThreadState_GetStatsFast +#else +#define _PyStats_GET _PyStats_GetLocal +#endif -// Export for shared extensions like 'math' -PyAPI_DATA(PyStats*) _Py_stats; +#define _Py_STATS_EXPR(expr) \ + do { \ + PyStats *s = _PyStats_GET(); \ + if (s != NULL) { \ + s->expr; \ + } \ + } while (0) + +#define _Py_STATS_COND_EXPR(cond, expr) \ + do { \ + PyStats *s = _PyStats_GET(); \ + if (s != NULL && (cond)) { \ + s->expr; \ + } \ + } while (0) #ifdef _PY_INTERPRETER -# define _Py_INCREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_increfs++; } while (0) -# define _Py_DECREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_decrefs++; } while (0) -# define _Py_INCREF_IMMORTAL_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_immortal_increfs++; } while (0) -# define _Py_DECREF_IMMORTAL_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_immortal_decrefs++; } while (0) +# define _Py_INCREF_STAT_INC() _Py_STATS_EXPR(object_stats.interpreter_increfs++) +# define _Py_DECREF_STAT_INC() _Py_STATS_EXPR(object_stats.interpreter_decrefs++) +# define _Py_INCREF_IMMORTAL_STAT_INC() _Py_STATS_EXPR(object_stats.interpreter_immortal_increfs++) +# define _Py_DECREF_IMMORTAL_STAT_INC() _Py_STATS_EXPR(object_stats.interpreter_immortal_decrefs++) #else -# define _Py_INCREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.increfs++; } while (0) -# define _Py_DECREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.decrefs++; } while (0) -# define _Py_INCREF_IMMORTAL_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.immortal_increfs++; } while (0) -# define _Py_DECREF_IMMORTAL_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.immortal_decrefs++; } while (0) +# define _Py_INCREF_STAT_INC() _Py_STATS_EXPR(object_stats.increfs++) +# define _Py_DECREF_STAT_INC() _Py_STATS_EXPR(object_stats.decrefs++) +# define _Py_INCREF_IMMORTAL_STAT_INC() _Py_STATS_EXPR(object_stats.immortal_increfs++) +# define _Py_DECREF_IMMORTAL_STAT_INC() _Py_STATS_EXPR(object_stats.immortal_decrefs++) #endif diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 9cdaa950e3479a..e8cbe9d894e1c7 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -199,7 +199,7 @@ enum _GCPhase { }; /* If we change this, we 
need to change the default value in the - signature of gc.collect. */ + signature of gc.collect and change the size of PyStats.gc_stats */ #define NUM_GENERATIONS 3 struct _gc_runtime_state { @@ -963,6 +963,18 @@ struct _is { # ifdef Py_STACKREF_CLOSE_DEBUG _Py_hashtable_t *closed_stackrefs_table; # endif +#endif + +#ifdef Py_STATS + // true if recording of pystats is on, this is used when new threads + // are created to decide if recording should be on for them + int pystats_enabled; + // allocated when (and if) stats are first enabled + PyStats *pystats_struct; +#ifdef Py_GIL_DISABLED + // held when pystats related interpreter state is being updated + PyMutex pystats_mutex; +#endif #endif /* the initial PyInterpreterState.threads.head */ diff --git a/Include/internal/pycore_pystats.h b/Include/internal/pycore_pystats.h index f8af398a560586..50ab21aa0f1902 100644 --- a/Include/internal/pycore_pystats.h +++ b/Include/internal/pycore_pystats.h @@ -9,7 +9,7 @@ extern "C" { #endif #ifdef Py_STATS -extern void _Py_StatsOn(void); +extern int _Py_StatsOn(void); extern void _Py_StatsOff(void); extern void _Py_StatsClear(void); extern int _Py_PrintSpecializationStats(int to_file); diff --git a/Include/internal/pycore_stats.h b/Include/internal/pycore_stats.h index 24f239a2135b93..850e6ea455227c 100644 --- a/Include/internal/pycore_stats.h +++ b/Include/internal/pycore_stats.h @@ -15,39 +15,56 @@ extern "C" { #include "pycore_bitutils.h" // _Py_bit_length -#define STAT_INC(opname, name) do { if (_Py_stats) _Py_stats->opcode_stats[opname].specialization.name++; } while (0) -#define STAT_DEC(opname, name) do { if (_Py_stats) _Py_stats->opcode_stats[opname].specialization.name--; } while (0) -#define OPCODE_EXE_INC(opname) do { if (_Py_stats) _Py_stats->opcode_stats[opname].execution_count++; } while (0) -#define CALL_STAT_INC(name) do { if (_Py_stats) _Py_stats->call_stats.name++; } while (0) -#define OBJECT_STAT_INC(name) do { if (_Py_stats) _Py_stats->object_stats.name++; } while (0) -#define OBJECT_STAT_INC_COND(name, cond) \ - do { if (_Py_stats && cond) _Py_stats->object_stats.name++; } while (0) -#define EVAL_CALL_STAT_INC(name) do { if (_Py_stats) _Py_stats->call_stats.eval_calls[name]++; } while (0) -#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) \ - do { if (_Py_stats && PyFunction_Check(callable)) _Py_stats->call_stats.eval_calls[name]++; } while (0) -#define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) -#define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) -#define OPT_STAT_ADD(name, n) do { if (_Py_stats) _Py_stats->optimization_stats.name += (n); } while (0) -#define UOP_STAT_INC(opname, name) do { if (_Py_stats) { assert(opname < 512); _Py_stats->optimization_stats.opcode[opname].name++; } } while (0) -#define UOP_PAIR_INC(uopcode, lastuop) \ - do { \ - if (lastuop && _Py_stats) { \ - _Py_stats->optimization_stats.opcode[lastuop].pair_count[uopcode]++; \ - } \ - lastuop = uopcode; \ +#define STAT_INC(opname, name) _Py_STATS_EXPR(opcode_stats[opname].specialization.name++) +#define STAT_DEC(opname, name) _Py_STATS_EXPR(opcode_stats[opname].specialization.name--) +#define OPCODE_EXE_INC(opname) _Py_STATS_EXPR(opcode_stats[opname].execution_count++) +#define CALL_STAT_INC(name) _Py_STATS_EXPR(call_stats.name++) +#define OBJECT_STAT_INC(name) _Py_STATS_EXPR(object_stats.name++) +#define OBJECT_STAT_INC_COND(name, cond) _Py_STATS_COND_EXPR(cond, object_stats.name++) +#define 
EVAL_CALL_STAT_INC(name) _Py_STATS_EXPR(call_stats.eval_calls[name]++) +#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) _Py_STATS_COND_EXPR(PyFunction_Check(callable), call_stats.eval_calls[name]++) +#define GC_STAT_ADD(gen, name, n) _Py_STATS_EXPR(gc_stats[(gen)].name += (n)) +#define OPT_STAT_INC(name) _Py_STATS_EXPR(optimization_stats.name++) +#define OPT_STAT_ADD(name, n) _Py_STATS_EXPR(optimization_stats.name += (n)) +#define UOP_STAT_INC(opname, name) \ + do { \ + PyStats *s = _PyStats_GET(); \ + if (s) { \ + assert(opname < 512); \ + s->optimization_stats.opcode[opname].name++; \ + } \ + } while (0) +#define UOP_PAIR_INC(uopcode, lastuop) \ + do { \ + PyStats *s = _PyStats_GET(); \ + if (lastuop && s) { \ + s->optimization_stats.opcode[lastuop].pair_count[uopcode]++; \ + } \ + lastuop = uopcode; \ } while (0) -#define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) -#define OPT_ERROR_IN_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.error_in_opcode[opname]++; } while (0) +#define OPT_UNSUPPORTED_OPCODE(opname) _Py_STATS_EXPR(optimization_stats.unsupported_opcode[opname]++) +#define OPT_ERROR_IN_OPCODE(opname) _Py_STATS_EXPR(optimization_stats.error_in_opcode[opname]++) #define OPT_HIST(length, name) \ do { \ - if (_Py_stats) { \ + PyStats *s = _PyStats_GET(); \ + if (s) { \ int bucket = _Py_bit_length(length >= 1 ? length - 1 : 0); \ bucket = (bucket >= _Py_UOP_HIST_SIZE) ? _Py_UOP_HIST_SIZE - 1 : bucket; \ - _Py_stats->optimization_stats.name[bucket]++; \ + s->optimization_stats.name[bucket]++; \ } \ } while (0) -#define RARE_EVENT_STAT_INC(name) do { if (_Py_stats) _Py_stats->rare_event_stats.name++; } while (0) -#define OPCODE_DEFERRED_INC(opname) do { if (_Py_stats && opcode == opname) _Py_stats->opcode_stats[opname].specialization.deferred++; } while (0) +#define RARE_EVENT_STAT_INC(name) _Py_STATS_EXPR(rare_event_stats.name++) +#define OPCODE_DEFERRED_INC(opname) _Py_STATS_COND_EXPR(opcode==opname, opcode_stats[opname].specialization.deferred++) + +#ifdef Py_GIL_DISABLED +#define FT_STAT_MUTEX_SLEEP_INC() _Py_STATS_EXPR(ft_stats.mutex_sleeps++) +#define FT_STAT_QSBR_POLL_INC() _Py_STATS_EXPR(ft_stats.qsbr_polls++) +#define FT_STAT_WORLD_STOP_INC() _Py_STATS_EXPR(ft_stats.world_stops++) +#else +#define FT_STAT_MUTEX_SLEEP_INC() +#define FT_STAT_QSBR_POLL_INC() +#define FT_STAT_WORLD_STOP_INC() +#endif // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -71,6 +88,9 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define OPT_HIST(length, name) ((void)0) #define RARE_EVENT_STAT_INC(name) ((void)0) #define OPCODE_DEFERRED_INC(opname) ((void)0) +#define FT_STAT_MUTEX_SLEEP_INC() +#define FT_STAT_QSBR_POLL_INC() +#define FT_STAT_WORLD_STOP_INC() #endif // !Py_STATS @@ -90,6 +110,11 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); RARE_EVENT_INTERP_INC(interp, name); \ } while (0); \ +PyStatus _PyStats_InterpInit(PyInterpreterState *); +bool _PyStats_ThreadInit(PyInterpreterState *, _PyThreadStateImpl *); +void _PyStats_ThreadFini(_PyThreadStateImpl *); +void _PyStats_Attach(_PyThreadStateImpl *); +void _PyStats_Detach(_PyThreadStateImpl *); #ifdef __cplusplus } diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index bad968428c73a1..29ebdfd7e01613 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -70,8 +70,14 @@ typedef struct _PyThreadStateImpl { // 
When >1, code objects do not immortalize their non-string constants. int suppress_co_const_immortalization; + +#ifdef Py_STATS + // per-thread stats, will be merged into interp->pystats_struct + PyStats *pystats_struct; // allocated by _PyStats_ThreadInit() #endif +#endif // Py_GIL_DISABLED + #if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED) Py_ssize_t reftotal; // this thread's total refcount operations #endif diff --git a/Lib/test/test_pystats.py b/Lib/test/test_pystats.py new file mode 100644 index 00000000000000..c50cecfcfddfe6 --- /dev/null +++ b/Lib/test/test_pystats.py @@ -0,0 +1,215 @@ +import sys +import textwrap +import unittest +from test.support import script_helper + +# This function is available for the --enable-pystats config. +HAVE_PYSTATS = hasattr(sys, '_stats_on') + +TEST_TEMPLATE = """ + import sys + import threading + import time + + THREADS = 2 + + class A: + pass + + class B: + pass + + def modify_class(): + # This is used as a rare event we can assume doesn't happen unless we do it. + # It increments the "Rare event (set_class)" count. + a = A() + a.__class__ = B + + TURNED_ON = False + def stats_on(): + global TURNED_ON + sys._stats_on() + TURNED_ON = True + + TURNED_OFF = False + def stats_off(): + global TURNED_OFF + sys._stats_off() + TURNED_OFF = True + + CLEARED = False + def stats_clear(): + global CLEARED + sys._stats_clear() + CLEARED = True + + def func_start(): + pass + + def func_end(): + pass + + def func_test(thread_id): + pass + + _TEST_CODE_ + + func_start() + threads = [] + for i in range(THREADS): + t = threading.Thread(target=func_test, args=(i,)) + threads.append(t) + t.start() + for t in threads: + t.join() + func_end() + """ + + +def run_test_code( + test_code, + args=[], + env_vars=None, +): + """Run test code and return the value of the "set_class" stats counter. + """ + code = textwrap.dedent(TEST_TEMPLATE) + code = code.replace('_TEST_CODE_', textwrap.dedent(test_code)) + script_args = args + ['-c', code] + env_vars = env_vars or {} + res, _ = script_helper.run_python_until_end(*script_args, **env_vars) + stderr = res.err.decode("ascii", "backslashreplace") + for line in stderr.split('\n'): + if 'Rare event (set_class)' in line: + label, _, value = line.partition(':') + return value.strip() + return '' + + +@unittest.skipUnless(HAVE_PYSTATS, "requires pystats build option") +class TestPyStats(unittest.TestCase): + """Tests for pystats functionality (requires --enable-pystats build + option). + """ + + def test_stats_toggle_on(self): + """Check the toggle on functionality. + """ + code = """ + def func_start(): + modify_class() + """ + + # If turned on with command line flag, should get one count. + stat_count = run_test_code(code, args=['-X', 'pystats']) + self.assertEqual(stat_count, '1') + + # If turned on with env var, should get one count. + stat_count = run_test_code(code, env_vars={'PYTHONSTATS': '1'}) + self.assertEqual(stat_count, '1') + + # If not turned on, should be no counts. + stat_count = run_test_code(code) + self.assertEqual(stat_count, '') + + code = """ + def func_start(): + modify_class() + sys._stats_on() + modify_class() + """ + # Not initially turned on but enabled by sys._stats_on(), should get + # one count. + stat_count = run_test_code(code) + self.assertEqual(stat_count, '1') + + def test_stats_toggle_on_thread(self): + """Check the toggle on functionality when threads are used. 
+ """ + code = """ + def func_test(thread_id): + if thread_id == 0: + modify_class() + stats_on() + modify_class() + else: + while not TURNED_ON: + pass + modify_class() + """ + # Turning on in one thread will count in other thread. + stat_count = run_test_code(code) + self.assertEqual(stat_count, '2') + + code = """ + def func_test(thread_id): + if thread_id == 0: + modify_class() + stats_off() + modify_class() + else: + while not TURNED_OFF: + pass + modify_class() + """ + # Turning off in one thread will not count in other threads. + stat_count = run_test_code(code, args=['-X', 'pystats']) + self.assertEqual(stat_count, '1') + + def test_thread_exit_merge(self): + """Check that per-thread stats (when free-threading enabled) are merged. + """ + code = """ + def func_test(thread_id): + modify_class() + if thread_id == 0: + raise SystemExit + """ + # Stats from a thread exiting early should still be counted. + stat_count = run_test_code(code, args=['-X', 'pystats']) + self.assertEqual(stat_count, '2') + + def test_stats_dump(self): + """Check that sys._stats_dump() works. + """ + code = """ + def func_test(thread_id): + if thread_id == 0: + stats_on() + else: + while not TURNED_ON: + pass + modify_class() + sys._stats_dump() + stats_off() + """ + # Stats from a thread exiting early should still be counted. + stat_count = run_test_code(code) + self.assertEqual(stat_count, '1') + + def test_stats_clear(self): + """Check that sys._stats_clear() works. + """ + code = """ + ready = False + def func_test(thread_id): + global ready + if thread_id == 0: + stats_on() + modify_class() + while not ready: + pass # wait until other thread has called modify_class() + stats_clear() # clears stats for all threads + else: + while not TURNED_ON: + pass + modify_class() + ready = True + """ + # Clearing stats will clear for all threads + stat_count = run_test_code(code) + self.assertEqual(stat_count, '0') + + +if __name__ == "__main__": + unittest.main() diff --git a/Makefile.pre.in b/Makefile.pre.in index 656d9dacd962e3..dd28ff5d2a3ed1 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -483,6 +483,7 @@ PYTHON_OBJS= \ Python/pylifecycle.o \ Python/pymath.o \ Python/pystate.o \ + Python/pystats.o \ Python/pythonrun.o \ Python/pytime.o \ Python/qsbr.o \ diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-29-17-51-14.gh-issue-131253.GpRjWy.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-29-17-51-14.gh-issue-131253.GpRjWy.rst new file mode 100644 index 00000000000000..2826fad233058a --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-29-17-51-14.gh-issue-131253.GpRjWy.rst @@ -0,0 +1 @@ +Support the ``--enable-pystats`` build option for the free-threaded build. diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-19-10-32-28.gh-issue-136895.HfsEh0.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-19-10-32-28.gh-issue-136895.HfsEh0.rst new file mode 100644 index 00000000000000..fffc264a8650e0 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-19-10-32-28.gh-issue-136895.HfsEh0.rst @@ -0,0 +1 @@ +Update JIT compilation to use LLVM 20 at build time. diff --git a/Modules/_xxtestfuzz/fuzzer.c b/Modules/_xxtestfuzz/fuzzer.c index a04f1412eefda1..0cbe10c79ab4a6 100644 --- a/Modules/_xxtestfuzz/fuzzer.c +++ b/Modules/_xxtestfuzz/fuzzer.c @@ -10,8 +10,8 @@ See the source code for LLVMFuzzerTestOneInput for details. 
*/ -#ifndef Py_BUILD_CORE -# define Py_BUILD_CORE 1 +#ifndef Py_BUILD_CORE_MODULE +# define Py_BUILD_CORE_MODULE 1 #endif #include diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index e65f201623fbbe..605861ad3fd06c 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -257,6 +257,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index a9fb6f2328ad95..c67fe53363ee84 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -367,6 +367,9 @@ Source Files + + Source Files + Source Files diff --git a/PCbuild/get_external.py b/PCbuild/get_external.py index a78aa6a23041ad..07970624e8647e 100755 --- a/PCbuild/get_external.py +++ b/PCbuild/get_external.py @@ -3,6 +3,7 @@ import argparse import os import pathlib +import shutil import sys import time import urllib.error @@ -22,15 +23,13 @@ def retrieve_with_retries(download_location, output_path, reporthook, ) except (urllib.error.URLError, ConnectionError) as ex: if attempt == max_retries: - msg = f"Download from {download_location} failed." - raise OSError(msg) from ex + raise OSError(f'Download from {download_location} failed.') from ex time.sleep(2.25**attempt) else: return resp - def fetch_zip(commit_hash, zip_dir, *, org='python', binary=False, verbose): - repo = f'cpython-{"bin" if binary else "source"}-deps' + repo = 'cpython-bin-deps' if binary else 'cpython-source-deps' url = f'https://github.com/{org}/{repo}/archive/{commit_hash}.zip' reporthook = None if verbose: @@ -44,6 +43,23 @@ def fetch_zip(commit_hash, zip_dir, *, org='python', binary=False, verbose): return filename +def fetch_release(tag, tarball_dir, *, org='python', verbose=False): + url = f'https://github.com/{org}/cpython-bin-deps/releases/download/{tag}/{tag}.tar.xz' + reporthook = None + if verbose: + reporthook = print + tarball_dir.mkdir(parents=True, exist_ok=True) + output_path = tarball_dir / f'{tag}.tar.xz' + retrieve_with_retries(url, output_path, reporthook) + return output_path + + +def extract_tarball(externals_dir, tarball_path, tag): + output_path = externals_dir / tag + shutil.unpack_archive(os.fspath(tarball_path), os.fspath(output_path)) + return output_path + + def extract_zip(externals_dir, zip_path): with zipfile.ZipFile(os.fspath(zip_path)) as zf: zf.extractall(os.fspath(externals_dir)) @@ -55,6 +71,8 @@ def parse_args(): p.add_argument('-v', '--verbose', action='store_true') p.add_argument('-b', '--binary', action='store_true', help='Is the dependency in the binary repo?') + p.add_argument('-r', '--release', action='store_true', + help='Download from GitHub release assets instead of branch') p.add_argument('-O', '--organization', help='Organization owning the deps repos', default='python') p.add_argument('-e', '--externals-dir', type=pathlib.Path, @@ -67,15 +85,36 @@ def parse_args(): def main(): args = parse_args() - zip_path = fetch_zip( - args.tag, - args.externals_dir / 'zips', - org=args.organization, - binary=args.binary, - verbose=args.verbose, - ) final_name = args.externals_dir / args.tag - extracted = extract_zip(args.externals_dir, zip_path) + + # Check if the dependency already exists in externals/ directory + # (either already downloaded/extracted, or checked into the git tree) + if final_name.exists(): + if args.verbose: + print(f'{args.tag} already exists at {final_name}, skipping download.') + return + + # Determine download method: release artifacts for large deps (like LLVM), + # 
otherwise zip download from GitHub branches + if args.release: + tarball_path = fetch_release( + args.tag, + args.externals_dir / 'tarballs', + org=args.organization, + verbose=args.verbose, + ) + extracted = extract_tarball(args.externals_dir, tarball_path, args.tag) + else: + # Use zip download from GitHub branches + # (cpython-bin-deps if --binary, cpython-source-deps otherwise) + zip_path = fetch_zip( + args.tag, + args.externals_dir / 'zips', + org=args.organization, + binary=args.binary, + verbose=args.verbose, + ) + extracted = extract_zip(args.externals_dir, zip_path) for wait in [1, 2, 3, 5, 8, 0]: try: extracted.replace(final_name) diff --git a/PCbuild/get_externals.bat b/PCbuild/get_externals.bat index 50a227b563a7c0..319024e0f50f46 100644 --- a/PCbuild/get_externals.bat +++ b/PCbuild/get_externals.bat @@ -82,7 +82,7 @@ if NOT "%IncludeLibffi%"=="false" set binaries=%binaries% libffi-3.4.4 if NOT "%IncludeSSL%"=="false" set binaries=%binaries% openssl-bin-3.0.18 if NOT "%IncludeTkinter%"=="false" set binaries=%binaries% tcltk-8.6.15.0 if NOT "%IncludeSSLSrc%"=="false" set binaries=%binaries% nasm-2.11.06 -if NOT "%IncludeLLVM%"=="false" set binaries=%binaries% llvm-19.1.7.0 +if NOT "%IncludeLLVM%"=="false" set binaries=%binaries% llvm-20.1.8.0 for %%b in (%binaries%) do ( if exist "%EXTERNALS_DIR%\%%b" ( @@ -92,7 +92,11 @@ for %%b in (%binaries%) do ( git clone --depth 1 https://github.com/%ORG%/cpython-bin-deps --branch %%b "%EXTERNALS_DIR%\%%b" ) else ( echo.Fetching %%b... - %PYTHON% -E "%PCBUILD%\get_external.py" -b -O %ORG% -e "%EXTERNALS_DIR%" %%b + if "%%b"=="llvm-20.1.8.0" ( + %PYTHON% -E "%PCBUILD%\get_external.py" --release --organization %ORG% --externals-dir "%EXTERNALS_DIR%" %%b + ) else ( + %PYTHON% -E "%PCBUILD%\get_external.py" --binary --organization %ORG% --externals-dir "%EXTERNALS_DIR%" %%b + ) ) ) diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index a50ffb120bc013..359a47fbfc4fe2 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -658,6 +658,7 @@ + diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 868ab6f755874f..afdcbc563b2c60 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -62,8 +62,9 @@ #ifdef Py_STATS #define INSTRUCTION_STATS(op) \ do { \ + PyStats *s = _PyStats_GET(); \ OPCODE_EXE_INC(op); \ - if (_Py_stats) _Py_stats->opcode_stats[lastopcode].pair_count[op]++; \ + if (s) s->opcode_stats[lastopcode].pair_count[op]++; \ lastopcode = op; \ } while (0) #else diff --git a/Python/gc.c b/Python/gc.c index a1f3d86d91036b..03a5d7366ea6c9 100644 --- a/Python/gc.c +++ b/Python/gc.c @@ -2111,10 +2111,11 @@ _PyGC_Collect(PyThreadState *tstate, int generation, _PyGC_Reason reason) _PyErr_SetRaisedException(tstate, exc); GC_STAT_ADD(generation, objects_collected, stats.collected); #ifdef Py_STATS - if (_Py_stats) { + PyStats *s = _PyStats_GET(); + if (s) { GC_STAT_ADD(generation, object_visits, - _Py_stats->object_stats.object_visits); - _Py_stats->object_stats.object_visits = 0; + s->object_stats.object_visits); + s->object_stats.object_visits = 0; } #endif validate_spaces(gcstate); diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 842aa3401548c9..f39793c3eeb532 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -2362,8 +2362,9 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) assert(generation >= 0 && generation < NUM_GENERATIONS); #ifdef Py_STATS - if (_Py_stats) { - _Py_stats->object_stats.object_visits = 
0; + PyStats *s = _PyStats_GET(); + if (s) { + s->object_stats.object_visits = 0; } #endif GC_STAT_ADD(generation, collections, 1); @@ -2426,10 +2427,13 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) GC_STAT_ADD(generation, objects_collected, m); #ifdef Py_STATS - if (_Py_stats) { - GC_STAT_ADD(generation, object_visits, - _Py_stats->object_stats.object_visits); - _Py_stats->object_stats.object_visits = 0; + { + PyStats *s = _PyStats_GET(); + if (s) { + GC_STAT_ADD(generation, object_visits, + s->object_stats.object_visits); + s->object_stats.object_visits = 0; + } } #endif diff --git a/Python/initconfig.c b/Python/initconfig.c index 5dc68eb4ec2cca..7176670c110d69 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -2810,12 +2810,6 @@ _PyConfig_Write(const PyConfig *config, _PyRuntimeState *runtime) return _PyStatus_NO_MEMORY(); } -#ifdef Py_STATS - if (config->_pystats) { - _Py_StatsOn(); - } -#endif - return _PyStatus_OK(); } diff --git a/Python/jit.c b/Python/jit.c index c3f3d686013fe4..279e1ce6a0d2e5 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -444,17 +444,42 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value) } void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state); +void patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *state); #include "jit_stencils.h" #if defined(__aarch64__) || defined(_M_ARM64) #define TRAMPOLINE_SIZE 16 #define DATA_ALIGN 8 +#elif defined(__x86_64__) && defined(__APPLE__) + // LLVM 20 on macOS x86_64 debug builds: GOT entries may exceed ±2GB PC-relative + // range. + #define TRAMPOLINE_SIZE 16 // 14 bytes + 2 bytes padding for alignment + #define DATA_ALIGN 8 #else #define TRAMPOLINE_SIZE 0 #define DATA_ALIGN 1 #endif +// Get the trampoline memory location for a given symbol ordinal. +static unsigned char * +get_trampoline_slot(int ordinal, jit_state *state) +{ + const uint32_t symbol_mask = 1 << (ordinal % 32); + const uint32_t trampoline_mask = state->trampolines.mask[ordinal / 32]; + assert(symbol_mask & trampoline_mask); + + // Count the number of set bits in the trampoline mask lower than ordinal + int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1)); + for (int i = 0; i < ordinal / 32; i++) { + index += _Py_popcount32(state->trampolines.mask[i]); + } + + unsigned char *trampoline = state->trampolines.mem + index * TRAMPOLINE_SIZE; + assert((size_t)(index + 1) * TRAMPOLINE_SIZE <= state->trampolines.size); + return trampoline; +} + // Generate and patch AArch64 trampolines. The symbols to jump to are stored // in the jit_stencils.h in the symbols_map. void @@ -471,20 +496,8 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) return; } - // Masking is done modulo 32 as the mask is stored as an array of uint32_t - const uint32_t symbol_mask = 1 << (ordinal % 32); - const uint32_t trampoline_mask = state->trampolines.mask[ordinal / 32]; - assert(symbol_mask & trampoline_mask); - - // Count the number of set bits in the trampoline mask lower than ordinal, - // this gives the index into the array of trampolines. 
- int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1)); - for (int i = 0; i < ordinal / 32; i++) { - index += _Py_popcount32(state->trampolines.mask[i]); - } - - uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE); - assert((size_t)(index + 1) * TRAMPOLINE_SIZE <= state->trampolines.size); + // Out of range - need a trampoline + uint32_t *p = (uint32_t *)get_trampoline_slot(ordinal, state); /* Generate the trampoline @@ -501,6 +514,37 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) patch_aarch64_26r(location, (uintptr_t)p); } +// Generate and patch x86_64 trampolines. +void +patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *state) +{ + uint64_t value = (uintptr_t)symbols_map[ordinal]; + int64_t range = (int64_t)value - 4 - (int64_t)location; + + // If we are in range of 32 signed bits, we can patch directly + if (range >= -(1LL << 31) && range < (1LL << 31)) { + patch_32r(location, value - 4); + return; + } + + // Out of range - need a trampoline + unsigned char *trampoline = get_trampoline_slot(ordinal, state); + + /* Generate the trampoline (14 bytes, padded to 16): + 0: ff 25 00 00 00 00 jmp *(%rip) + 6: XX XX XX XX XX XX XX XX (64-bit target address) + + Reference: https://wiki.osdev.org/X86-64_Instruction_Encoding#FF (JMP r/m64) + */ + trampoline[0] = 0xFF; + trampoline[1] = 0x25; + memset(trampoline + 2, 0, 4); + memcpy(trampoline + 6, &value, 8); + + // Patch the call site to call the trampoline instead + patch_32r(location, (uintptr_t)trampoline - 4); +} + static void combine_symbol_mask(const symbol_mask src, symbol_mask dest) { diff --git a/Python/lock.c b/Python/lock.c index 98f3f89c201fef..789065d8162792 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -6,6 +6,7 @@ #include "pycore_parking_lot.h" #include "pycore_semaphore.h" #include "pycore_time.h" // _PyTime_Add() +#include "pycore_stats.h" // FT_STAT_MUTEX_SLEEP_INC() #ifdef MS_WINDOWS # ifndef WIN32_LEAN_AND_MEAN @@ -62,6 +63,8 @@ _PyMutex_LockTimed(PyMutex *m, PyTime_t timeout, _PyLockFlags flags) return PY_LOCK_FAILURE; } + FT_STAT_MUTEX_SLEEP_INC(); + PyTime_t now; // silently ignore error: cannot report error to the caller (void)PyTime_MonotonicRaw(&now); diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 8fcb31cfd12299..805805ef188e83 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -26,6 +26,7 @@ #include "pycore_runtime.h" // _Py_ID() #include "pycore_runtime_init.h" // _PyRuntimeState_INIT #include "pycore_setobject.h" // _PySet_NextEntry() +#include "pycore_stats.h" // _PyStats_InterpInit() #include "pycore_sysmodule.h" // _PySys_ClearAttrString() #include "pycore_traceback.h" // _Py_DumpTracebackThreads() #include "pycore_typeobject.h" // _PyTypes_InitTypes() @@ -656,6 +657,14 @@ pycore_create_interpreter(_PyRuntimeState *runtime, return status; } +#ifdef Py_STATS + // initialize pystats. This must be done after the settings are loaded. + status = _PyStats_InterpInit(interp); + if (_PyStatus_EXCEPTION(status)) { + return status; + } +#endif + // initialize the interp->obmalloc state. This must be done after // the settings are loaded (so that feature_flags are set) but before // any calls are made to obmalloc functions. @@ -2469,6 +2478,14 @@ new_interpreter(PyThreadState **tstate_p, return status; } +#ifdef Py_STATS + // initialize pystats. This must be done after the settings are loaded. 
+ status = _PyStats_InterpInit(interp); + if (_PyStatus_EXCEPTION(status)) { + return status; + } +#endif + // initialize the interp->obmalloc state. This must be done after // the settings are loaded (so that feature_flags are set) but before // any calls are made to obmalloc functions. diff --git a/Python/pystate.c b/Python/pystate.c index 24681536797f94..cf251c120d75af 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -21,6 +21,7 @@ #include "pycore_runtime.h" // _PyRuntime #include "pycore_runtime_init.h" // _PyRuntimeState_INIT #include "pycore_stackref.h" // Py_STACKREF_DEBUG +#include "pycore_stats.h" // FT_STAT_WORLD_STOP_INC() #include "pycore_time.h" // _PyTime_Init() #include "pycore_uop.h" // UOP_BUFFER_SIZE #include "pycore_uniqueid.h" // _PyObject_FinalizePerThreadRefcounts() @@ -465,6 +466,12 @@ alloc_interpreter(void) static void free_interpreter(PyInterpreterState *interp) { +#ifdef Py_STATS + if (interp->pystats_struct) { + PyMem_RawFree(interp->pystats_struct); + interp->pystats_struct = NULL; + } +#endif // The main interpreter is statically allocated so // should not be freed. if (interp != &_PyRuntime._main_interpreter) { @@ -1407,6 +1414,9 @@ static void free_threadstate(_PyThreadStateImpl *tstate) { PyInterpreterState *interp = tstate->base.interp; +#ifdef Py_STATS + _PyStats_ThreadFini(tstate); +#endif // The initial thread state of the interpreter is allocated // as part of the interpreter state so should not be freed. if (tstate == &interp->_initial_thread) { @@ -1535,6 +1545,13 @@ new_threadstate(PyInterpreterState *interp, int whence) return NULL; } #endif +#ifdef Py_STATS + // The PyStats structure is quite large and is allocated separated from tstate. + if (!_PyStats_ThreadInit(interp, tstate)) { + free_threadstate(tstate); + return NULL; + } +#endif /* We serialize concurrent creation to protect global state. */ HEAD_LOCK(interp->runtime); @@ -1846,6 +1863,9 @@ _PyThreadState_DeleteCurrent(PyThreadState *tstate) _Py_EnsureTstateNotNULL(tstate); #ifdef Py_GIL_DISABLED _Py_qsbr_detach(((_PyThreadStateImpl *)tstate)->qsbr); +#endif +#ifdef Py_STATS + _PyStats_Detach((_PyThreadStateImpl *)tstate); #endif current_fast_clear(tstate->interp->runtime); tstate_delete_common(tstate, 1); // release GIL as part of call @@ -2020,6 +2040,10 @@ tstate_deactivate(PyThreadState *tstate) assert(tstate_is_bound(tstate)); assert(tstate->_status.active); +#if Py_STATS + _PyStats_Detach((_PyThreadStateImpl *)tstate); +#endif + tstate->_status.active = 0; // We do not unbind the gilstate tstate here. 
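The pystate.c hunks above wire stats recording into the thread-state lifecycle: _PyStats_ThreadInit() on creation, _PyStats_Attach()/_PyStats_Detach() around attach/deactivate, and _PyStats_ThreadFini() on deletion. A rough sketch of the attach behaviour those call sites assume, based on the comments added to cpython/pystate.h and pycore_interp_structs.h (the real _PyStats_Attach lives in the new Python/pystats.c and may differ in detail; assumes a Py_STATS build):

    static void
    example_stats_attach(_PyThreadStateImpl *tstate)
    {
        PyInterpreterState *interp = tstate->base.interp;
        if (!interp->pystats_enabled) {
            tstate->base.pystats = NULL;   /* recording off: the macros see NULL and skip */
            return;
        }
    #ifdef Py_GIL_DISABLED
        /* free-threaded build: each thread records into its own buffer,
           merged into interp->pystats_struct when the thread exits */
        tstate->base.pystats = tstate->pystats_struct;
    #else
        /* default build: all threads share the per-interpreter structure */
        tstate->base.pystats = interp->pystats_struct;
    #endif
    }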
@@ -2123,6 +2147,10 @@ _PyThreadState_Attach(PyThreadState *tstate) _PyCriticalSection_Resume(tstate); } +#ifdef Py_STATS + _PyStats_Attach((_PyThreadStateImpl *)tstate); +#endif + #if defined(Py_DEBUG) errno = err; #endif @@ -2272,6 +2300,7 @@ stop_the_world(struct _stoptheworld_state *stw) stw->thread_countdown = 0; stw->stop_event = (PyEvent){0}; // zero-initialize (unset) stw->requester = _PyThreadState_GET(); // may be NULL + FT_STAT_WORLD_STOP_INC(); _Py_FOR_EACH_STW_INTERP(stw, i) { _Py_FOR_EACH_TSTATE_UNLOCKED(i, t) { diff --git a/Python/pystats.c b/Python/pystats.c new file mode 100644 index 00000000000000..2e377b8e6da3be --- /dev/null +++ b/Python/pystats.c @@ -0,0 +1,819 @@ +#include "Python.h" + +#include "pycore_opcode_metadata.h" // _PyOpcode_Caches +#include "pycore_pyatomic_ft_wrappers.h" +#include "pycore_pylifecycle.h" // _PyOS_URandomNonblock() +#include "pycore_tstate.h" +#include "pycore_initconfig.h" // _PyStatus_OK() +#include "pycore_uop_metadata.h" // _PyOpcode_uop_name +#include "pycore_uop_ids.h" // MAX_UOP_ID +#include "pycore_pystate.h" // _PyThreadState_GET() +#include "pycore_runtime.h" // NUM_GENERATIONS + +#include // rand() + +#ifdef Py_STATS + +PyStats * +_PyStats_GetLocal(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate) { + return tstate->pystats; + } + return NULL; +} + +#ifdef Py_GIL_DISABLED +#define STATS_LOCK(interp) PyMutex_Lock(&interp->pystats_mutex) +#define STATS_UNLOCK(interp) PyMutex_Unlock(&interp->pystats_mutex) +#else +#define STATS_LOCK(interp) +#define STATS_UNLOCK(interp) +#endif + + +#if PYSTATS_MAX_UOP_ID < MAX_UOP_ID +#error "Not enough space allocated for pystats. Increase PYSTATS_MAX_UOP_ID to at least MAX_UOP_ID" +#endif + +#define ADD_STAT_TO_DICT(res, field) \ + do { \ + PyObject *val = PyLong_FromUnsignedLongLong(stats->field); \ + if (val == NULL) { \ + Py_DECREF(res); \ + return NULL; \ + } \ + if (PyDict_SetItemString(res, #field, val) == -1) { \ + Py_DECREF(res); \ + Py_DECREF(val); \ + return NULL; \ + } \ + Py_DECREF(val); \ + } while(0); + +static PyObject* +stats_to_dict(SpecializationStats *stats) +{ + PyObject *res = PyDict_New(); + if (res == NULL) { + return NULL; + } + ADD_STAT_TO_DICT(res, success); + ADD_STAT_TO_DICT(res, failure); + ADD_STAT_TO_DICT(res, hit); + ADD_STAT_TO_DICT(res, deferred); + ADD_STAT_TO_DICT(res, miss); + ADD_STAT_TO_DICT(res, deopt); + PyObject *failure_kinds = PyTuple_New(SPECIALIZATION_FAILURE_KINDS); + if (failure_kinds == NULL) { + Py_DECREF(res); + return NULL; + } + for (int i = 0; i < SPECIALIZATION_FAILURE_KINDS; i++) { + PyObject *stat = PyLong_FromUnsignedLongLong(stats->failure_kinds[i]); + if (stat == NULL) { + Py_DECREF(res); + Py_DECREF(failure_kinds); + return NULL; + } + PyTuple_SET_ITEM(failure_kinds, i, stat); + } + if (PyDict_SetItemString(res, "failure_kinds", failure_kinds)) { + Py_DECREF(res); + Py_DECREF(failure_kinds); + return NULL; + } + Py_DECREF(failure_kinds); + return res; +} +#undef ADD_STAT_TO_DICT + +static int +add_stat_dict( + PyStats *src, + PyObject *res, + int opcode, + const char *name) { + + SpecializationStats *stats = &src->opcode_stats[opcode].specialization; + PyObject *d = stats_to_dict(stats); + if (d == NULL) { + return -1; + } + int err = PyDict_SetItemString(res, name, d); + Py_DECREF(d); + return err; +} + +PyObject* +_Py_GetSpecializationStats(void) { + PyThreadState *tstate = _PyThreadState_GET(); + PyStats *src = FT_ATOMIC_LOAD_PTR_RELAXED(tstate->interp->pystats_struct); + if (src == NULL) { + Py_RETURN_NONE; + } + 
PyObject *stats = PyDict_New(); + if (stats == NULL) { + return NULL; + } + int err = 0; + err += add_stat_dict(src, stats, CONTAINS_OP, "contains_op"); + err += add_stat_dict(src, stats, LOAD_SUPER_ATTR, "load_super_attr"); + err += add_stat_dict(src, stats, LOAD_ATTR, "load_attr"); + err += add_stat_dict(src, stats, LOAD_GLOBAL, "load_global"); + err += add_stat_dict(src, stats, STORE_SUBSCR, "store_subscr"); + err += add_stat_dict(src, stats, STORE_ATTR, "store_attr"); + err += add_stat_dict(src, stats, JUMP_BACKWARD, "jump_backward"); + err += add_stat_dict(src, stats, CALL, "call"); + err += add_stat_dict(src, stats, CALL_KW, "call_kw"); + err += add_stat_dict(src, stats, BINARY_OP, "binary_op"); + err += add_stat_dict(src, stats, COMPARE_OP, "compare_op"); + err += add_stat_dict(src, stats, UNPACK_SEQUENCE, "unpack_sequence"); + err += add_stat_dict(src, stats, FOR_ITER, "for_iter"); + err += add_stat_dict(src, stats, TO_BOOL, "to_bool"); + err += add_stat_dict(src, stats, SEND, "send"); + if (err < 0) { + Py_DECREF(stats); + return NULL; + } + return stats; +} + + +#define PRINT_STAT(i, field) \ + if (stats[i].field) { \ + fprintf(out, " opcode[%s]." #field " : %" PRIu64 "\n", _PyOpcode_OpName[i], stats[i].field); \ + } + +static void +print_spec_stats(FILE *out, OpcodeStats *stats) +{ + /* Mark some opcodes as specializable for stats, + * even though we don't specialize them yet. */ + fprintf(out, "opcode[BINARY_SLICE].specializable : 1\n"); + fprintf(out, "opcode[STORE_SLICE].specializable : 1\n"); + fprintf(out, "opcode[GET_ITER].specializable : 1\n"); + for (int i = 0; i < 256; i++) { + if (_PyOpcode_Caches[i]) { + /* Ignore jumps as they cannot be specialized */ + switch (i) { + case POP_JUMP_IF_FALSE: + case POP_JUMP_IF_TRUE: + case POP_JUMP_IF_NONE: + case POP_JUMP_IF_NOT_NONE: + case JUMP_BACKWARD: + break; + default: + fprintf(out, "opcode[%s].specializable : 1\n", _PyOpcode_OpName[i]); + } + } + PRINT_STAT(i, specialization.success); + PRINT_STAT(i, specialization.failure); + PRINT_STAT(i, specialization.hit); + PRINT_STAT(i, specialization.deferred); + PRINT_STAT(i, specialization.miss); + PRINT_STAT(i, specialization.deopt); + PRINT_STAT(i, execution_count); + for (int j = 0; j < SPECIALIZATION_FAILURE_KINDS; j++) { + uint64_t val = stats[i].specialization.failure_kinds[j]; + if (val) { + fprintf(out, " opcode[%s].specialization.failure_kinds[%d] : %" + PRIu64 "\n", _PyOpcode_OpName[i], j, val); + } + } + for (int j = 0; j < 256; j++) { + if (stats[i].pair_count[j]) { + fprintf(out, "opcode[%s].pair_count[%s] : %" PRIu64 "\n", + _PyOpcode_OpName[i], _PyOpcode_OpName[j], stats[i].pair_count[j]); + } + } + } +} +#undef PRINT_STAT + + +static void +print_call_stats(FILE *out, CallStats *stats) +{ + fprintf(out, "Calls to PyEval_EvalDefault: %" PRIu64 "\n", stats->pyeval_calls); + fprintf(out, "Calls to Python functions inlined: %" PRIu64 "\n", stats->inlined_py_calls); + fprintf(out, "Frames pushed: %" PRIu64 "\n", stats->frames_pushed); + fprintf(out, "Frame objects created: %" PRIu64 "\n", stats->frame_objects_created); + for (int i = 0; i < EVAL_CALL_KINDS; i++) { + fprintf(out, "Calls via PyEval_EvalFrame[%d] : %" PRIu64 "\n", i, stats->eval_calls[i]); + } +} + +static void +print_object_stats(FILE *out, ObjectStats *stats) +{ + fprintf(out, "Object allocations from freelist: %" PRIu64 "\n", stats->from_freelist); + fprintf(out, "Object frees to freelist: %" PRIu64 "\n", stats->to_freelist); + fprintf(out, "Object allocations: %" PRIu64 "\n", stats->allocations); + 
fprintf(out, "Object allocations to 512 bytes: %" PRIu64 "\n", stats->allocations512); + fprintf(out, "Object allocations to 4 kbytes: %" PRIu64 "\n", stats->allocations4k); + fprintf(out, "Object allocations over 4 kbytes: %" PRIu64 "\n", stats->allocations_big); + fprintf(out, "Object frees: %" PRIu64 "\n", stats->frees); + fprintf(out, "Object inline values: %" PRIu64 "\n", stats->inline_values); + fprintf(out, "Object interpreter mortal increfs: %" PRIu64 "\n", stats->interpreter_increfs); + fprintf(out, "Object interpreter mortal decrefs: %" PRIu64 "\n", stats->interpreter_decrefs); + fprintf(out, "Object mortal increfs: %" PRIu64 "\n", stats->increfs); + fprintf(out, "Object mortal decrefs: %" PRIu64 "\n", stats->decrefs); + fprintf(out, "Object interpreter immortal increfs: %" PRIu64 "\n", stats->interpreter_immortal_increfs); + fprintf(out, "Object interpreter immortal decrefs: %" PRIu64 "\n", stats->interpreter_immortal_decrefs); + fprintf(out, "Object immortal increfs: %" PRIu64 "\n", stats->immortal_increfs); + fprintf(out, "Object immortal decrefs: %" PRIu64 "\n", stats->immortal_decrefs); + fprintf(out, "Object materialize dict (on request): %" PRIu64 "\n", stats->dict_materialized_on_request); + fprintf(out, "Object materialize dict (new key): %" PRIu64 "\n", stats->dict_materialized_new_key); + fprintf(out, "Object materialize dict (too big): %" PRIu64 "\n", stats->dict_materialized_too_big); + fprintf(out, "Object materialize dict (str subclass): %" PRIu64 "\n", stats->dict_materialized_str_subclass); + fprintf(out, "Object method cache hits: %" PRIu64 "\n", stats->type_cache_hits); + fprintf(out, "Object method cache misses: %" PRIu64 "\n", stats->type_cache_misses); + fprintf(out, "Object method cache collisions: %" PRIu64 "\n", stats->type_cache_collisions); + fprintf(out, "Object method cache dunder hits: %" PRIu64 "\n", stats->type_cache_dunder_hits); + fprintf(out, "Object method cache dunder misses: %" PRIu64 "\n", stats->type_cache_dunder_misses); +} + +static void +print_gc_stats(FILE *out, GCStats *stats) +{ + for (int i = 0; i < NUM_GENERATIONS; i++) { + fprintf(out, "GC[%d] collections: %" PRIu64 "\n", i, stats[i].collections); + fprintf(out, "GC[%d] object visits: %" PRIu64 "\n", i, stats[i].object_visits); + fprintf(out, "GC[%d] objects collected: %" PRIu64 "\n", i, stats[i].objects_collected); + fprintf(out, "GC[%d] objects reachable from roots: %" PRIu64 "\n", i, stats[i].objects_transitively_reachable); + fprintf(out, "GC[%d] objects not reachable from roots: %" PRIu64 "\n", i, stats[i].objects_not_transitively_reachable); + } +} + +#ifdef _Py_TIER2 +static void +print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) +{ + for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { + fprintf(out, "%s[%" PRIu64"]: %" PRIu64 "\n", name, (uint64_t)1 << i, hist[i]); + } +} + +extern const char *_PyUOpName(int index); + +static void +print_optimization_stats(FILE *out, OptimizationStats *stats) +{ + fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->attempts); + fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); + fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->traces_executed); + fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); + fprintf(out, "Optimization trace stack overflow: %" PRIu64 "\n", stats->trace_stack_overflow); + fprintf(out, "Optimization trace stack underflow: %" PRIu64 "\n", stats->trace_stack_underflow); + fprintf(out, "Optimization trace too 
long: %" PRIu64 "\n", stats->trace_too_long); + fprintf(out, "Optimization trace too short: %" PRIu64 "\n", stats->trace_too_short); + fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); + fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); + fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence); + fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee); + fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated); + + print_histogram(out, "Trace length", stats->trace_length_hist); + print_histogram(out, "Trace run length", stats->trace_run_length_hist); + print_histogram(out, "Optimized trace length", stats->optimized_trace_length_hist); + + fprintf(out, "Optimization optimizer attempts: %" PRIu64 "\n", stats->optimizer_attempts); + fprintf(out, "Optimization optimizer successes: %" PRIu64 "\n", stats->optimizer_successes); + fprintf(out, "Optimization optimizer failure no memory: %" PRIu64 "\n", + stats->optimizer_failure_reason_no_memory); + fprintf(out, "Optimizer remove globals builtins changed: %" PRIu64 "\n", stats->remove_globals_builtins_changed); + fprintf(out, "Optimizer remove globals incorrect keys: %" PRIu64 "\n", stats->remove_globals_incorrect_keys); + for (int i = 0; i <= MAX_UOP_ID; i++) { + if (stats->opcode[i].execution_count) { + fprintf(out, "uops[%s].execution_count : %" PRIu64 "\n", _PyUOpName(i), stats->opcode[i].execution_count); + } + if (stats->opcode[i].miss) { + fprintf(out, "uops[%s].specialization.miss : %" PRIu64 "\n", _PyUOpName(i), stats->opcode[i].miss); + } + } + for (int i = 0; i < 256; i++) { + if (stats->unsupported_opcode[i]) { + fprintf( + out, + "unsupported_opcode[%s].count : %" PRIu64 "\n", + _PyOpcode_OpName[i], + stats->unsupported_opcode[i] + ); + } + } + + for (int i = 1; i <= MAX_UOP_ID; i++){ + for (int j = 1; j <= MAX_UOP_ID; j++) { + if (stats->opcode[i].pair_count[j]) { + fprintf(out, "uop[%s].pair_count[%s] : %" PRIu64 "\n", + _PyOpcode_uop_name[i], _PyOpcode_uop_name[j], stats->opcode[i].pair_count[j]); + } + } + } + for (int i = 0; i < MAX_UOP_ID; i++) { + if (stats->error_in_opcode[i]) { + fprintf( + out, + "error_in_opcode[%s].count : %" PRIu64 "\n", + _PyUOpName(i), + stats->error_in_opcode[i] + ); + } + } + fprintf(out, "JIT total memory size: %" PRIu64 "\n", stats->jit_total_memory_size); + fprintf(out, "JIT code size: %" PRIu64 "\n", stats->jit_code_size); + fprintf(out, "JIT trampoline size: %" PRIu64 "\n", stats->jit_trampoline_size); + fprintf(out, "JIT data size: %" PRIu64 "\n", stats->jit_data_size); + fprintf(out, "JIT padding size: %" PRIu64 "\n", stats->jit_padding_size); + fprintf(out, "JIT freed memory size: %" PRIu64 "\n", stats->jit_freed_memory_size); + + print_histogram(out, "Trace total memory size", stats->trace_total_memory_hist); +} +#endif + +#ifdef Py_GIL_DISABLED +static void +print_ft_stats(FILE *out, FTStats *stats) +{ + fprintf(out, "Mutex sleeps (mutex_sleeps): %" PRIu64 "\n", stats->mutex_sleeps); + fprintf(out, "QSBR polls (qsbr_polls): %" PRIu64 "\n", stats->qsbr_polls); + fprintf(out, "World stops (world_stops): %" PRIu64 "\n", stats->world_stops); +} +#endif + +static void +print_rare_event_stats(FILE *out, RareEventStats *stats) +{ + fprintf(out, "Rare event (set_class): %" PRIu64 "\n", stats->set_class); + fprintf(out, "Rare event (set_bases): %" PRIu64 "\n", stats->set_bases); + fprintf(out, "Rare event (set_eval_frame_func): %" PRIu64 "\n", 
stats->set_eval_frame_func); + fprintf(out, "Rare event (builtin_dict): %" PRIu64 "\n", stats->builtin_dict); + fprintf(out, "Rare event (func_modification): %" PRIu64 "\n", stats->func_modification); + fprintf(out, "Rare event (watched_dict_modification): %" PRIu64 "\n", stats->watched_dict_modification); + fprintf(out, "Rare event (watched_globals_modification): %" PRIu64 "\n", stats->watched_globals_modification); +} + +static void +print_stats(FILE *out, PyStats *stats) +{ + print_spec_stats(out, stats->opcode_stats); + print_call_stats(out, &stats->call_stats); + print_object_stats(out, &stats->object_stats); + print_gc_stats(out, stats->gc_stats); +#ifdef _Py_TIER2 + print_optimization_stats(out, &stats->optimization_stats); +#endif +#ifdef Py_GIL_DISABLED + print_ft_stats(out, &stats->ft_stats); +#endif + print_rare_event_stats(out, &stats->rare_event_stats); +} + +#ifdef Py_GIL_DISABLED + +static void +merge_specialization_stats(SpecializationStats *dest, const SpecializationStats *src) +{ + dest->success += src->success; + dest->failure += src->failure; + dest->hit += src->hit; + dest->deferred += src->deferred; + dest->miss += src->miss; + dest->deopt += src->deopt; + for (int i = 0; i < SPECIALIZATION_FAILURE_KINDS; i++) { + dest->failure_kinds[i] += src->failure_kinds[i]; + } +} + +static void +merge_opcode_stats_array(OpcodeStats *dest, const OpcodeStats *src) +{ + for (int i = 0; i < 256; i++) { + merge_specialization_stats(&dest[i].specialization, &src[i].specialization); + dest[i].execution_count += src[i].execution_count; + for (int j = 0; j < 256; j++) { + dest[i].pair_count[j] += src[i].pair_count[j]; + } + } +} + +static void +merge_call_stats(CallStats *dest, const CallStats *src) +{ + dest->inlined_py_calls += src->inlined_py_calls; + dest->pyeval_calls += src->pyeval_calls; + dest->frames_pushed += src->frames_pushed; + dest->frame_objects_created += src->frame_objects_created; + for (int i = 0; i < EVAL_CALL_KINDS; i++) { + dest->eval_calls[i] += src->eval_calls[i]; + } +} + +static void +merge_object_stats(ObjectStats *dest, const ObjectStats *src) +{ + dest->increfs += src->increfs; + dest->decrefs += src->decrefs; + dest->interpreter_increfs += src->interpreter_increfs; + dest->interpreter_decrefs += src->interpreter_decrefs; + dest->immortal_increfs += src->immortal_increfs; + dest->immortal_decrefs += src->immortal_decrefs; + dest->interpreter_immortal_increfs += src->interpreter_immortal_increfs; + dest->interpreter_immortal_decrefs += src->interpreter_immortal_decrefs; + dest->allocations += src->allocations; + dest->allocations512 += src->allocations512; + dest->allocations4k += src->allocations4k; + dest->allocations_big += src->allocations_big; + dest->frees += src->frees; + dest->to_freelist += src->to_freelist; + dest->from_freelist += src->from_freelist; + dest->inline_values += src->inline_values; + dest->dict_materialized_on_request += src->dict_materialized_on_request; + dest->dict_materialized_new_key += src->dict_materialized_new_key; + dest->dict_materialized_too_big += src->dict_materialized_too_big; + dest->dict_materialized_str_subclass += src->dict_materialized_str_subclass; + dest->type_cache_hits += src->type_cache_hits; + dest->type_cache_misses += src->type_cache_misses; + dest->type_cache_dunder_hits += src->type_cache_dunder_hits; + dest->type_cache_dunder_misses += src->type_cache_dunder_misses; + dest->type_cache_collisions += src->type_cache_collisions; + dest->object_visits += src->object_visits; +} + +static void 
+merge_uop_stats_array(UOpStats *dest, const UOpStats *src)
+{
+    for (int i = 0; i <= PYSTATS_MAX_UOP_ID; i++) {
+        dest[i].execution_count += src[i].execution_count;
+        dest[i].miss += src[i].miss;
+        for (int j = 0; j <= PYSTATS_MAX_UOP_ID; j++) {
+            dest[i].pair_count[j] += src[i].pair_count[j];
+        }
+    }
+}
+
+static void
+merge_optimization_stats(OptimizationStats *dest, const OptimizationStats *src)
+{
+    dest->attempts += src->attempts;
+    dest->traces_created += src->traces_created;
+    dest->traces_executed += src->traces_executed;
+    dest->uops_executed += src->uops_executed;
+    dest->trace_stack_overflow += src->trace_stack_overflow;
+    dest->trace_stack_underflow += src->trace_stack_underflow;
+    dest->trace_too_long += src->trace_too_long;
+    dest->trace_too_short += src->trace_too_short;
+    dest->inner_loop += src->inner_loop;
+    dest->recursive_call += src->recursive_call;
+    dest->low_confidence += src->low_confidence;
+    dest->unknown_callee += src->unknown_callee;
+    dest->executors_invalidated += src->executors_invalidated;
+    dest->optimizer_attempts += src->optimizer_attempts;
+    dest->optimizer_successes += src->optimizer_successes;
+    dest->optimizer_failure_reason_no_memory += src->optimizer_failure_reason_no_memory;
+    dest->remove_globals_builtins_changed += src->remove_globals_builtins_changed;
+    dest->remove_globals_incorrect_keys += src->remove_globals_incorrect_keys;
+    dest->jit_total_memory_size += src->jit_total_memory_size;
+    dest->jit_code_size += src->jit_code_size;
+    dest->jit_trampoline_size += src->jit_trampoline_size;
+    dest->jit_data_size += src->jit_data_size;
+    dest->jit_padding_size += src->jit_padding_size;
+    dest->jit_freed_memory_size += src->jit_freed_memory_size;
+
+    merge_uop_stats_array(dest->opcode, src->opcode);
+
+    for (int i = 0; i < 256; i++) {
+        dest->unsupported_opcode[i] += src->unsupported_opcode[i];
+    }
+    for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) {
+        dest->trace_length_hist[i] += src->trace_length_hist[i];
+        dest->trace_run_length_hist[i] += src->trace_run_length_hist[i];
+        dest->optimized_trace_length_hist[i] += src->optimized_trace_length_hist[i];
+        dest->trace_total_memory_hist[i] += src->trace_total_memory_hist[i];
+    }
+    for (int i = 0; i <= PYSTATS_MAX_UOP_ID; i++) {
+        dest->error_in_opcode[i] += src->error_in_opcode[i];
+    }
+}
+
+static void
+merge_ft_stats(FTStats *dest, const FTStats *src)
+{
+    dest->mutex_sleeps += src->mutex_sleeps;
+    dest->qsbr_polls += src->qsbr_polls;
+    dest->world_stops += src->world_stops;
+}
+
+static void
+merge_rare_event_stats(RareEventStats *dest, const RareEventStats *src)
+{
+    dest->set_class += src->set_class;
+    dest->set_bases += src->set_bases;
+    dest->set_eval_frame_func += src->set_eval_frame_func;
+    dest->builtin_dict += src->builtin_dict;
+    dest->func_modification += src->func_modification;
+    dest->watched_dict_modification += src->watched_dict_modification;
+    dest->watched_globals_modification += src->watched_globals_modification;
+}
+
+static void
+merge_gc_stats_array(GCStats *dest, const GCStats *src)
+{
+    for (int i = 0; i < NUM_GENERATIONS; i++) {
+        dest[i].collections += src[i].collections;
+        dest[i].object_visits += src[i].object_visits;
+        dest[i].objects_collected += src[i].objects_collected;
+        dest[i].objects_transitively_reachable += src[i].objects_transitively_reachable;
+        dest[i].objects_not_transitively_reachable += src[i].objects_not_transitively_reachable;
+    }
+}
+
+void
+stats_zero_thread(_PyThreadStateImpl *tstate)
+{
+    // Zero the thread local stat counters
+    if (tstate->pystats_struct) {
+
memset(tstate->pystats_struct, 0, sizeof(PyStats)); + } +} + +// merge stats for a single thread into the global structure +void +stats_merge_thread(_PyThreadStateImpl *tstate) +{ + PyStats *src = tstate->pystats_struct; + PyStats *dest = ((PyThreadState *)tstate)->interp->pystats_struct; + + if (src == NULL || dest == NULL) { + return; + } + + // Merge each category of stats using the helper functions. + merge_opcode_stats_array(dest->opcode_stats, src->opcode_stats); + merge_call_stats(&dest->call_stats, &src->call_stats); + merge_object_stats(&dest->object_stats, &src->object_stats); + merge_optimization_stats(&dest->optimization_stats, &src->optimization_stats); + merge_ft_stats(&dest->ft_stats, &src->ft_stats); + merge_rare_event_stats(&dest->rare_event_stats, &src->rare_event_stats); + merge_gc_stats_array(dest->gc_stats, src->gc_stats); +} +#endif // Py_GIL_DISABLED + +// toggle stats collection on or off for all threads +static int +stats_toggle_on_off(PyThreadState *tstate, int on) +{ + bool changed = false; + PyInterpreterState *interp = tstate->interp; + STATS_LOCK(interp); + if (on && interp->pystats_struct == NULL) { + PyStats *s = PyMem_RawCalloc(1, sizeof(PyStats)); + if (s == NULL) { + STATS_UNLOCK(interp); + return -1; + } + FT_ATOMIC_STORE_PTR_RELAXED(interp->pystats_struct, s); + } + if (tstate->interp->pystats_enabled != on) { + FT_ATOMIC_STORE_INT_RELAXED(interp->pystats_enabled, on); + changed = true; + } + STATS_UNLOCK(interp); + if (!changed) { + return 0; + } + _PyEval_StopTheWorld(interp); + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, ts) { + PyStats *s = NULL; + if (interp->pystats_enabled) { +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *ts_impl = (_PyThreadStateImpl *)ts; + if (ts_impl->pystats_struct == NULL) { + // first activation for this thread, allocate structure + ts_impl->pystats_struct = PyMem_RawCalloc(1, sizeof(PyStats)); + } + s = ts_impl->pystats_struct; +#else + s = ts->interp->pystats_struct; +#endif + } + ts->pystats = s; + } + _PyEval_StartTheWorld(interp); + return 0; +} + +// zero stats for all threads and for the interpreter +static void +stats_zero_all(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate == NULL) { + return; + } + if (FT_ATOMIC_LOAD_PTR_RELAXED(tstate->interp->pystats_struct) == NULL) { + return; + } + PyInterpreterState *interp = tstate->interp; + _PyEval_StopTheWorld(interp); +#ifdef Py_GIL_DISABLED + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, ts) { + stats_zero_thread((_PyThreadStateImpl *)ts); + } +#endif + if (interp->pystats_struct) { + memset(interp->pystats_struct, 0, sizeof(PyStats)); + } + _PyEval_StartTheWorld(interp); +} + +// merge stats for all threads into the per-interpreter structure +static void +stats_merge_all(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate == NULL) { + return; + } + if (FT_ATOMIC_LOAD_PTR_RELAXED(tstate->interp->pystats_struct) == NULL) { + return; + } + PyInterpreterState *interp = tstate->interp; + _PyEval_StopTheWorld(interp); +#ifdef Py_GIL_DISABLED + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, ts) { + stats_merge_thread((_PyThreadStateImpl *)ts); + stats_zero_thread((_PyThreadStateImpl *)ts); + } +#endif + _PyEval_StartTheWorld(interp); +} + +int +_Py_StatsOn(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + return stats_toggle_on_off(tstate, 1); +} + +void +_Py_StatsOff(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + stats_toggle_on_off(tstate, 0); +} + +void +_Py_StatsClear(void) +{ + stats_zero_all(); +} + +static int +mem_is_zero(unsigned 
char *ptr, size_t size) +{ + for (size_t i=0; i < size; i++) { + if (*ptr != 0) { + return 0; + } + ptr++; + } + return 1; +} + +int +_Py_PrintSpecializationStats(int to_file) +{ + assert(to_file); + stats_merge_all(); + PyThreadState *tstate = _PyThreadState_GET(); + STATS_LOCK(tstate->interp); + PyStats *stats = tstate->interp->pystats_struct; + if (stats == NULL) { + STATS_UNLOCK(tstate->interp); + return 0; + } +#define MEM_IS_ZERO(DATA) mem_is_zero((unsigned char*)DATA, sizeof(*(DATA))) + int is_zero = ( + MEM_IS_ZERO(stats->gc_stats) // is a pointer + && MEM_IS_ZERO(&stats->opcode_stats) + && MEM_IS_ZERO(&stats->call_stats) + && MEM_IS_ZERO(&stats->object_stats) + ); +#undef MEM_IS_ZERO + STATS_UNLOCK(tstate->interp); + if (is_zero) { + // gh-108753: -X pystats command line was used, but then _stats_off() + // and _stats_clear() have been called: in this case, avoid printing + // useless "all zeros" statistics. + return 0; + } + + FILE *out = stderr; + if (to_file) { + /* Write to a file instead of stderr. */ +# ifdef MS_WINDOWS + const char *dirname = "c:\\temp\\py_stats\\"; +# else + const char *dirname = "/tmp/py_stats/"; +# endif + /* Use random 160 bit number as file name, + * to avoid both accidental collisions and + * symlink attacks. */ + unsigned char rand[20]; + char hex_name[41]; + _PyOS_URandomNonblock(rand, 20); + for (int i = 0; i < 20; i++) { + hex_name[2*i] = Py_hexdigits[rand[i]&15]; + hex_name[2*i+1] = Py_hexdigits[(rand[i]>>4)&15]; + } + hex_name[40] = '\0'; + char buf[64]; + assert(strlen(dirname) + 40 + strlen(".txt") < 64); + sprintf(buf, "%s%s.txt", dirname, hex_name); + FILE *fout = fopen(buf, "w"); + if (fout) { + out = fout; + } + } + else { + fprintf(out, "Specialization stats:\n"); + } + STATS_LOCK(tstate->interp); + print_stats(out, stats); + STATS_UNLOCK(tstate->interp); + if (out != stderr) { + fclose(out); + } + return 1; +} + +PyStatus +_PyStats_InterpInit(PyInterpreterState *interp) +{ + if (interp->config._pystats) { + // start with pystats enabled, can be disabled via sys._stats_off() + // this needs to be set before the first tstate is created + interp->pystats_enabled = 1; + interp->pystats_struct = PyMem_RawCalloc(1, sizeof(PyStats)); + if (interp->pystats_struct == NULL) { + return _PyStatus_ERR("out-of-memory while initializing interpreter"); + } + } + return _PyStatus_OK(); +} + +bool +_PyStats_ThreadInit(PyInterpreterState *interp, _PyThreadStateImpl *tstate) +{ +#ifdef Py_GIL_DISABLED + if (FT_ATOMIC_LOAD_INT_RELAXED(interp->pystats_enabled)) { + assert(interp->pystats_struct != NULL); + tstate->pystats_struct = PyMem_RawCalloc(1, sizeof(PyStats)); + if (tstate->pystats_struct == NULL) { + return false; + } + } +#endif + return true; +} + +void +_PyStats_ThreadFini(_PyThreadStateImpl *tstate) +{ +#ifdef Py_GIL_DISABLED + STATS_LOCK(((PyThreadState *)tstate)->interp); + stats_merge_thread(tstate); + STATS_UNLOCK(((PyThreadState *)tstate)->interp); + PyMem_RawFree(tstate->pystats_struct); +#endif +} + +void +_PyStats_Attach(_PyThreadStateImpl *tstate_impl) +{ + PyStats *s; + PyThreadState *tstate = (PyThreadState *)tstate_impl; + PyInterpreterState *interp = tstate->interp; + if (FT_ATOMIC_LOAD_INT_RELAXED(interp->pystats_enabled)) { +#ifdef Py_GIL_DISABLED + s = ((_PyThreadStateImpl *)tstate)->pystats_struct; +#else + s = tstate->interp->pystats_struct; +#endif + } + else { + s = NULL; + } + tstate->pystats = s; +} + +void +_PyStats_Detach(_PyThreadStateImpl *tstate_impl) +{ + ((PyThreadState *)tstate_impl)->pystats = NULL; +} + +#endif // 
Py_STATS diff --git a/Python/qsbr.c b/Python/qsbr.c index c992c285cb13e4..b2153bf9d67230 100644 --- a/Python/qsbr.c +++ b/Python/qsbr.c @@ -36,6 +36,7 @@ #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_qsbr.h" #include "pycore_tstate.h" // _PyThreadStateImpl +#include "pycore_stats.h" // FT_STAT_QSBR_POLL_INC() // Starting size of the array of qsbr thread states @@ -158,7 +159,7 @@ _Py_qsbr_poll(struct _qsbr_thread_state *qsbr, uint64_t goal) if (_Py_qbsr_goal_reached(qsbr, goal)) { return true; } - + FT_STAT_QSBR_POLL_INC(); uint64_t rd_seq = qsbr_poll_scan(qsbr->shared); return QSBR_LEQ(goal, rd_seq); } diff --git a/Python/specialize.c b/Python/specialize.c index a1c5dedd61563b..2193596a331d3c 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -22,437 +22,23 @@ #include // rand() -extern const char *_PyUOpName(int index); - /* For guidance on adding or extending families of instructions see * InternalDocs/interpreter.md `Specialization` section. */ -#ifdef Py_STATS -GCStats _py_gc_stats[NUM_GENERATIONS] = { 0 }; -static PyStats _Py_stats_struct = { .gc_stats = _py_gc_stats }; -PyStats *_Py_stats = NULL; - -#if PYSTATS_MAX_UOP_ID < MAX_UOP_ID -#error "Not enough space allocated for pystats. Increase PYSTATS_MAX_UOP_ID to at least MAX_UOP_ID" -#endif - -#define ADD_STAT_TO_DICT(res, field) \ - do { \ - PyObject *val = PyLong_FromUnsignedLongLong(stats->field); \ - if (val == NULL) { \ - Py_DECREF(res); \ - return NULL; \ - } \ - if (PyDict_SetItemString(res, #field, val) == -1) { \ - Py_DECREF(res); \ - Py_DECREF(val); \ - return NULL; \ - } \ - Py_DECREF(val); \ - } while(0); - -static PyObject* -stats_to_dict(SpecializationStats *stats) -{ - PyObject *res = PyDict_New(); - if (res == NULL) { - return NULL; - } - ADD_STAT_TO_DICT(res, success); - ADD_STAT_TO_DICT(res, failure); - ADD_STAT_TO_DICT(res, hit); - ADD_STAT_TO_DICT(res, deferred); - ADD_STAT_TO_DICT(res, miss); - ADD_STAT_TO_DICT(res, deopt); - PyObject *failure_kinds = PyTuple_New(SPECIALIZATION_FAILURE_KINDS); - if (failure_kinds == NULL) { - Py_DECREF(res); - return NULL; - } - for (int i = 0; i < SPECIALIZATION_FAILURE_KINDS; i++) { - PyObject *stat = PyLong_FromUnsignedLongLong(stats->failure_kinds[i]); - if (stat == NULL) { - Py_DECREF(res); - Py_DECREF(failure_kinds); - return NULL; - } - PyTuple_SET_ITEM(failure_kinds, i, stat); - } - if (PyDict_SetItemString(res, "failure_kinds", failure_kinds)) { - Py_DECREF(res); - Py_DECREF(failure_kinds); - return NULL; - } - Py_DECREF(failure_kinds); - return res; -} -#undef ADD_STAT_TO_DICT - -static int -add_stat_dict( - PyObject *res, - int opcode, - const char *name) { - - SpecializationStats *stats = &_Py_stats_struct.opcode_stats[opcode].specialization; - PyObject *d = stats_to_dict(stats); - if (d == NULL) { - return -1; - } - int err = PyDict_SetItemString(res, name, d); - Py_DECREF(d); - return err; -} - -PyObject* -_Py_GetSpecializationStats(void) { - PyObject *stats = PyDict_New(); - if (stats == NULL) { - return NULL; - } - int err = 0; - err += add_stat_dict(stats, CONTAINS_OP, "contains_op"); - err += add_stat_dict(stats, LOAD_SUPER_ATTR, "load_super_attr"); - err += add_stat_dict(stats, LOAD_ATTR, "load_attr"); - err += add_stat_dict(stats, LOAD_GLOBAL, "load_global"); - err += add_stat_dict(stats, STORE_SUBSCR, "store_subscr"); - err += add_stat_dict(stats, STORE_ATTR, "store_attr"); - err += add_stat_dict(stats, JUMP_BACKWARD, "jump_backward"); - err += add_stat_dict(stats, CALL, "call"); - err += add_stat_dict(stats, CALL_KW, 
"call_kw"); - err += add_stat_dict(stats, BINARY_OP, "binary_op"); - err += add_stat_dict(stats, COMPARE_OP, "compare_op"); - err += add_stat_dict(stats, UNPACK_SEQUENCE, "unpack_sequence"); - err += add_stat_dict(stats, FOR_ITER, "for_iter"); - err += add_stat_dict(stats, TO_BOOL, "to_bool"); - err += add_stat_dict(stats, SEND, "send"); - if (err < 0) { - Py_DECREF(stats); - return NULL; - } - return stats; -} - - -#define PRINT_STAT(i, field) \ - if (stats[i].field) { \ - fprintf(out, " opcode[%s]." #field " : %" PRIu64 "\n", _PyOpcode_OpName[i], stats[i].field); \ - } - -static void -print_spec_stats(FILE *out, OpcodeStats *stats) -{ - /* Mark some opcodes as specializable for stats, - * even though we don't specialize them yet. */ - fprintf(out, "opcode[BINARY_SLICE].specializable : 1\n"); - fprintf(out, "opcode[STORE_SLICE].specializable : 1\n"); - fprintf(out, "opcode[GET_ITER].specializable : 1\n"); - for (int i = 0; i < 256; i++) { - if (_PyOpcode_Caches[i]) { - /* Ignore jumps as they cannot be specialized */ - switch (i) { - case POP_JUMP_IF_FALSE: - case POP_JUMP_IF_TRUE: - case POP_JUMP_IF_NONE: - case POP_JUMP_IF_NOT_NONE: - case JUMP_BACKWARD: - break; - default: - fprintf(out, "opcode[%s].specializable : 1\n", _PyOpcode_OpName[i]); - } - } - PRINT_STAT(i, specialization.success); - PRINT_STAT(i, specialization.failure); - PRINT_STAT(i, specialization.hit); - PRINT_STAT(i, specialization.deferred); - PRINT_STAT(i, specialization.miss); - PRINT_STAT(i, specialization.deopt); - PRINT_STAT(i, execution_count); - for (int j = 0; j < SPECIALIZATION_FAILURE_KINDS; j++) { - uint64_t val = stats[i].specialization.failure_kinds[j]; - if (val) { - fprintf(out, " opcode[%s].specialization.failure_kinds[%d] : %" - PRIu64 "\n", _PyOpcode_OpName[i], j, val); - } - } - for (int j = 0; j < 256; j++) { - if (stats[i].pair_count[j]) { - fprintf(out, "opcode[%s].pair_count[%s] : %" PRIu64 "\n", - _PyOpcode_OpName[i], _PyOpcode_OpName[j], stats[i].pair_count[j]); - } - } - } -} -#undef PRINT_STAT - - -static void -print_call_stats(FILE *out, CallStats *stats) -{ - fprintf(out, "Calls to PyEval_EvalDefault: %" PRIu64 "\n", stats->pyeval_calls); - fprintf(out, "Calls to Python functions inlined: %" PRIu64 "\n", stats->inlined_py_calls); - fprintf(out, "Frames pushed: %" PRIu64 "\n", stats->frames_pushed); - fprintf(out, "Frame objects created: %" PRIu64 "\n", stats->frame_objects_created); - for (int i = 0; i < EVAL_CALL_KINDS; i++) { - fprintf(out, "Calls via PyEval_EvalFrame[%d] : %" PRIu64 "\n", i, stats->eval_calls[i]); - } -} - -static void -print_object_stats(FILE *out, ObjectStats *stats) -{ - fprintf(out, "Object allocations from freelist: %" PRIu64 "\n", stats->from_freelist); - fprintf(out, "Object frees to freelist: %" PRIu64 "\n", stats->to_freelist); - fprintf(out, "Object allocations: %" PRIu64 "\n", stats->allocations); - fprintf(out, "Object allocations to 512 bytes: %" PRIu64 "\n", stats->allocations512); - fprintf(out, "Object allocations to 4 kbytes: %" PRIu64 "\n", stats->allocations4k); - fprintf(out, "Object allocations over 4 kbytes: %" PRIu64 "\n", stats->allocations_big); - fprintf(out, "Object frees: %" PRIu64 "\n", stats->frees); - fprintf(out, "Object inline values: %" PRIu64 "\n", stats->inline_values); - fprintf(out, "Object interpreter mortal increfs: %" PRIu64 "\n", stats->interpreter_increfs); - fprintf(out, "Object interpreter mortal decrefs: %" PRIu64 "\n", stats->interpreter_decrefs); - fprintf(out, "Object mortal increfs: %" PRIu64 "\n", stats->increfs); - 
fprintf(out, "Object mortal decrefs: %" PRIu64 "\n", stats->decrefs); - fprintf(out, "Object interpreter immortal increfs: %" PRIu64 "\n", stats->interpreter_immortal_increfs); - fprintf(out, "Object interpreter immortal decrefs: %" PRIu64 "\n", stats->interpreter_immortal_decrefs); - fprintf(out, "Object immortal increfs: %" PRIu64 "\n", stats->immortal_increfs); - fprintf(out, "Object immortal decrefs: %" PRIu64 "\n", stats->immortal_decrefs); - fprintf(out, "Object materialize dict (on request): %" PRIu64 "\n", stats->dict_materialized_on_request); - fprintf(out, "Object materialize dict (new key): %" PRIu64 "\n", stats->dict_materialized_new_key); - fprintf(out, "Object materialize dict (too big): %" PRIu64 "\n", stats->dict_materialized_too_big); - fprintf(out, "Object materialize dict (str subclass): %" PRIu64 "\n", stats->dict_materialized_str_subclass); - fprintf(out, "Object method cache hits: %" PRIu64 "\n", stats->type_cache_hits); - fprintf(out, "Object method cache misses: %" PRIu64 "\n", stats->type_cache_misses); - fprintf(out, "Object method cache collisions: %" PRIu64 "\n", stats->type_cache_collisions); - fprintf(out, "Object method cache dunder hits: %" PRIu64 "\n", stats->type_cache_dunder_hits); - fprintf(out, "Object method cache dunder misses: %" PRIu64 "\n", stats->type_cache_dunder_misses); -} - -static void -print_gc_stats(FILE *out, GCStats *stats) -{ - for (int i = 0; i < NUM_GENERATIONS; i++) { - fprintf(out, "GC[%d] collections: %" PRIu64 "\n", i, stats[i].collections); - fprintf(out, "GC[%d] object visits: %" PRIu64 "\n", i, stats[i].object_visits); - fprintf(out, "GC[%d] objects collected: %" PRIu64 "\n", i, stats[i].objects_collected); - fprintf(out, "GC[%d] objects reachable from roots: %" PRIu64 "\n", i, stats[i].objects_transitively_reachable); - fprintf(out, "GC[%d] objects not reachable from roots: %" PRIu64 "\n", i, stats[i].objects_not_transitively_reachable); - } -} - -#ifdef _Py_TIER2 -static void -print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) -{ - for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { - fprintf(out, "%s[%" PRIu64"]: %" PRIu64 "\n", name, (uint64_t)1 << i, hist[i]); - } -} - -static void -print_optimization_stats(FILE *out, OptimizationStats *stats) -{ - fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->attempts); - fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); - fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->traces_executed); - fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); - fprintf(out, "Optimization trace stack overflow: %" PRIu64 "\n", stats->trace_stack_overflow); - fprintf(out, "Optimization trace stack underflow: %" PRIu64 "\n", stats->trace_stack_underflow); - fprintf(out, "Optimization trace too long: %" PRIu64 "\n", stats->trace_too_long); - fprintf(out, "Optimization trace too short: %" PRIu64 "\n", stats->trace_too_short); - fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); - fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); - fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence); - fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee); - fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated); - - print_histogram(out, "Trace length", stats->trace_length_hist); - print_histogram(out, "Trace run length", stats->trace_run_length_hist); - print_histogram(out, 
"Optimized trace length", stats->optimized_trace_length_hist); - - fprintf(out, "Optimization optimizer attempts: %" PRIu64 "\n", stats->optimizer_attempts); - fprintf(out, "Optimization optimizer successes: %" PRIu64 "\n", stats->optimizer_successes); - fprintf(out, "Optimization optimizer failure no memory: %" PRIu64 "\n", - stats->optimizer_failure_reason_no_memory); - fprintf(out, "Optimizer remove globals builtins changed: %" PRIu64 "\n", stats->remove_globals_builtins_changed); - fprintf(out, "Optimizer remove globals incorrect keys: %" PRIu64 "\n", stats->remove_globals_incorrect_keys); - for (int i = 0; i <= MAX_UOP_ID; i++) { - if (stats->opcode[i].execution_count) { - fprintf(out, "uops[%s].execution_count : %" PRIu64 "\n", _PyUOpName(i), stats->opcode[i].execution_count); - } - if (stats->opcode[i].miss) { - fprintf(out, "uops[%s].specialization.miss : %" PRIu64 "\n", _PyUOpName(i), stats->opcode[i].miss); - } - } - for (int i = 0; i < 256; i++) { - if (stats->unsupported_opcode[i]) { - fprintf( - out, - "unsupported_opcode[%s].count : %" PRIu64 "\n", - _PyOpcode_OpName[i], - stats->unsupported_opcode[i] - ); - } - } - - for (int i = 1; i <= MAX_UOP_ID; i++){ - for (int j = 1; j <= MAX_UOP_ID; j++) { - if (stats->opcode[i].pair_count[j]) { - fprintf(out, "uop[%s].pair_count[%s] : %" PRIu64 "\n", - _PyOpcode_uop_name[i], _PyOpcode_uop_name[j], stats->opcode[i].pair_count[j]); - } - } - } - for (int i = 0; i < MAX_UOP_ID; i++) { - if (stats->error_in_opcode[i]) { - fprintf( - out, - "error_in_opcode[%s].count : %" PRIu64 "\n", - _PyUOpName(i), - stats->error_in_opcode[i] - ); - } - } - fprintf(out, "JIT total memory size: %" PRIu64 "\n", stats->jit_total_memory_size); - fprintf(out, "JIT code size: %" PRIu64 "\n", stats->jit_code_size); - fprintf(out, "JIT trampoline size: %" PRIu64 "\n", stats->jit_trampoline_size); - fprintf(out, "JIT data size: %" PRIu64 "\n", stats->jit_data_size); - fprintf(out, "JIT padding size: %" PRIu64 "\n", stats->jit_padding_size); - fprintf(out, "JIT freed memory size: %" PRIu64 "\n", stats->jit_freed_memory_size); - - print_histogram(out, "Trace total memory size", stats->trace_total_memory_hist); -} -#endif - -static void -print_rare_event_stats(FILE *out, RareEventStats *stats) -{ - fprintf(out, "Rare event (set_class): %" PRIu64 "\n", stats->set_class); - fprintf(out, "Rare event (set_bases): %" PRIu64 "\n", stats->set_bases); - fprintf(out, "Rare event (set_eval_frame_func): %" PRIu64 "\n", stats->set_eval_frame_func); - fprintf(out, "Rare event (builtin_dict): %" PRIu64 "\n", stats->builtin_dict); - fprintf(out, "Rare event (func_modification): %" PRIu64 "\n", stats->func_modification); - fprintf(out, "Rare event (watched_dict_modification): %" PRIu64 "\n", stats->watched_dict_modification); - fprintf(out, "Rare event (watched_globals_modification): %" PRIu64 "\n", stats->watched_globals_modification); -} - -static void -print_stats(FILE *out, PyStats *stats) -{ - print_spec_stats(out, stats->opcode_stats); - print_call_stats(out, &stats->call_stats); - print_object_stats(out, &stats->object_stats); - print_gc_stats(out, stats->gc_stats); -#ifdef _Py_TIER2 - print_optimization_stats(out, &stats->optimization_stats); -#endif - print_rare_event_stats(out, &stats->rare_event_stats); -} - -void -_Py_StatsOn(void) -{ - _Py_stats = &_Py_stats_struct; -} - -void -_Py_StatsOff(void) -{ - _Py_stats = NULL; -} - -void -_Py_StatsClear(void) -{ - memset(&_py_gc_stats, 0, sizeof(_py_gc_stats)); - memset(&_Py_stats_struct, 0, sizeof(_Py_stats_struct)); - 
_Py_stats_struct.gc_stats = _py_gc_stats; -} - -static int -mem_is_zero(unsigned char *ptr, size_t size) -{ - for (size_t i=0; i < size; i++) { - if (*ptr != 0) { - return 0; - } - ptr++; - } - return 1; -} - -int -_Py_PrintSpecializationStats(int to_file) -{ - PyStats *stats = &_Py_stats_struct; -#define MEM_IS_ZERO(DATA) mem_is_zero((unsigned char*)DATA, sizeof(*(DATA))) - int is_zero = ( - MEM_IS_ZERO(stats->gc_stats) // is a pointer - && MEM_IS_ZERO(&stats->opcode_stats) - && MEM_IS_ZERO(&stats->call_stats) - && MEM_IS_ZERO(&stats->object_stats) - ); -#undef MEM_IS_ZERO - if (is_zero) { - // gh-108753: -X pystats command line was used, but then _stats_off() - // and _stats_clear() have been called: in this case, avoid printing - // useless "all zeros" statistics. - return 0; - } - - FILE *out = stderr; - if (to_file) { - /* Write to a file instead of stderr. */ -# ifdef MS_WINDOWS - const char *dirname = "c:\\temp\\py_stats\\"; -# else - const char *dirname = "/tmp/py_stats/"; -# endif - /* Use random 160 bit number as file name, - * to avoid both accidental collisions and - * symlink attacks. */ - unsigned char rand[20]; - char hex_name[41]; - _PyOS_URandomNonblock(rand, 20); - for (int i = 0; i < 20; i++) { - hex_name[2*i] = Py_hexdigits[rand[i]&15]; - hex_name[2*i+1] = Py_hexdigits[(rand[i]>>4)&15]; - } - hex_name[40] = '\0'; - char buf[64]; - assert(strlen(dirname) + 40 + strlen(".txt") < 64); - sprintf(buf, "%s%s.txt", dirname, hex_name); - FILE *fout = fopen(buf, "w"); - if (fout) { - out = fout; - } - } - else { - fprintf(out, "Specialization stats:\n"); - } - print_stats(out, stats); - if (out != stderr) { - fclose(out); - } - return 1; -} - +#if Py_STATS #define SPECIALIZATION_FAIL(opcode, kind) \ do { \ - if (_Py_stats) { \ + PyStats *s = _PyStats_GET(); \ + if (s) { \ int _kind = (kind); \ assert(_kind < SPECIALIZATION_FAILURE_KINDS); \ - _Py_stats->opcode_stats[opcode].specialization.failure_kinds[_kind]++; \ + s->opcode_stats[opcode].specialization.failure_kinds[_kind]++; \ } \ } while (0) - -#endif // Py_STATS - - -#ifndef SPECIALIZATION_FAIL +#else # define SPECIALIZATION_FAIL(opcode, kind) ((void)0) -#endif +#endif // Py_STATS // Initialize warmup counters and optimize instructions. This cannot fail. void diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 59baca26793f6c..86dd1395cae4a8 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2281,7 +2281,9 @@ static PyObject * sys__stats_on_impl(PyObject *module) /*[clinic end generated code: output=aca53eafcbb4d9fe input=43b5bfe145299e55]*/ { - _Py_StatsOn(); + if (_Py_StatsOn() < 0) { + return NULL; + } Py_RETURN_NONE; } diff --git a/Tools/jit/README.md b/Tools/jit/README.md index 35c7ffd7a283f8..d83b09aab59f8c 100644 --- a/Tools/jit/README.md +++ b/Tools/jit/README.md @@ -9,32 +9,32 @@ Python 3.11 or newer is required to build the JIT. The JIT compiler does not require end users to install any third-party dependencies, but part of it must be *built* using LLVM[^why-llvm]. You are *not* required to build the rest of CPython using LLVM, or even the same version of LLVM (in fact, this is uncommon). -LLVM version 19 is the officially supported version. You can modify if needed using the `LLVM_VERSION` env var during configure. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-19`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code. 
+LLVM version 20 is the officially supported version. You can override this if needed by setting the `LLVM_VERSION` env var during configure. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-20`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code.
 It's easy to install all of the required tools:
 ### Linux
-Install LLVM 19 on Ubuntu/Debian:
+Install LLVM 20 on Ubuntu/Debian:
 ```sh
 wget https://apt.llvm.org/llvm.sh
 chmod +x llvm.sh
-sudo ./llvm.sh 19
+sudo ./llvm.sh 20
 ```
-Install LLVM 19 on Fedora Linux 40 or newer:
+Install LLVM 20 on Fedora Linux 40 or newer:
 ```sh
-sudo dnf install 'clang(major) = 19' 'llvm(major) = 19'
+sudo dnf install 'clang(major) = 20' 'llvm(major) = 20'
 ```
 ### macOS
-Install LLVM 19 with [Homebrew](https://brew.sh):
+Install LLVM 20 with [Homebrew](https://brew.sh):
 ```sh
-brew install llvm@19
+brew install llvm@20
 ```
 Homebrew won't add any of the tools to your `$PATH`. That's okay; the build script knows how to find them.
@@ -43,18 +43,18 @@ Homebrew won't add any of the tools to your `$PATH`. That's okay; the build scri
 LLVM is downloaded automatically (along with other external binary dependencies) by `PCbuild\build.bat`.
-Otherwise, you can install LLVM 19 [by searching for it on LLVM's GitHub releases page](https://github.com/llvm/llvm-project/releases?q=19), clicking on "Assets", downloading the appropriate Windows installer for your platform (likely the file ending with `-win64.exe`), and running it. **When installing, be sure to select the option labeled "Add LLVM to the system PATH".**
+Otherwise, you can install LLVM 20 [by searching for it on LLVM's GitHub releases page](https://github.com/llvm/llvm-project/releases?q=20), clicking on "Assets", downloading the appropriate Windows installer for your platform (likely the file ending with `-win64.exe`), and running it. **When installing, be sure to select the option labeled "Add LLVM to the system PATH".**
 Alternatively, you can use [chocolatey](https://chocolatey.org):
 ```sh
-choco install llvm --version=19.1.0
+choco install llvm --version=20.1.8
 ```
 ### Dev Containers
 If you are working on CPython in a [Codespaces instance](https://devguide.python.org/getting-started/setup-building/#using-codespaces), there's no
-need to install LLVM as the Fedora 41 base image includes LLVM 19 out of the box.
+need to install LLVM as the Fedora 42 base image includes LLVM 20 out of the box.
## Building diff --git a/Tools/jit/_llvm.py b/Tools/jit/_llvm.py index bc3b50ffe61634..54c2bf86a36ed6 100644 --- a/Tools/jit/_llvm.py +++ b/Tools/jit/_llvm.py @@ -11,8 +11,8 @@ import _targets -_LLVM_VERSION = "19" -_EXTERNALS_LLVM_TAG = "llvm-19.1.7.0" +_LLVM_VERSION = "20" +_EXTERNALS_LLVM_TAG = "llvm-20.1.8.0" _P = typing.ParamSpec("_P") _R = typing.TypeVar("_R") diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 777db7366b186b..e717365b6b9785 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -253,6 +253,23 @@ def process_relocations(self, known_symbols: dict[str, int]) -> None: self._trampolines.add(ordinal) hole.addend = ordinal hole.symbol = None + # x86_64 Darwin trampolines for external symbols + elif ( + hole.kind == "X86_64_RELOC_BRANCH" + and hole.value is HoleValue.ZERO + and hole.symbol not in self.symbols + ): + hole.func = "patch_x86_64_trampoline" + hole.need_state = True + assert hole.symbol is not None + if hole.symbol in known_symbols: + ordinal = known_symbols[hole.symbol] + else: + ordinal = len(known_symbols) + known_symbols[hole.symbol] = ordinal + self._trampolines.add(ordinal) + hole.addend = ordinal + hole.symbol = None self.data.pad(8) for stencil in [self.code, self.data]: for hole in stencil.holes: diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index dcc0abaf23f16d..a76d8ff2792602 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -166,10 +166,6 @@ async def _compile( "-fno-asynchronous-unwind-tables", # Don't call built-in functions that we can't find or patch: "-fno-builtin", - # Emit relaxable 64-bit calls/jumps, so we don't have to worry about - # about emitting in-range trampolines for out-of-range targets. - # We can probably remove this and emit trampolines in the future: - "-fno-plt", # Don't call stack-smashing canaries that we can't find or patch: "-fno-stack-protector", "-std=c11", @@ -571,14 +567,14 @@ def get_target(host: str) -> _COFF32 | _COFF64 | _ELF | _MachO: elif re.fullmatch(r"aarch64-pc-windows-msvc", host): host = "aarch64-pc-windows-msvc" condition = "defined(_M_ARM64)" - args = ["-fms-runtime-lib=dll", "-fplt"] + args = ["-fms-runtime-lib=dll"] optimizer = _optimizers.OptimizerAArch64 target = _COFF64(host, condition, args=args, optimizer=optimizer) elif re.fullmatch(r"aarch64-.*-linux-gnu", host): host = "aarch64-unknown-linux-gnu" condition = "defined(__aarch64__) && defined(__linux__)" # -mno-outline-atomics: Keep intrinsics from being emitted. 
- args = ["-fpic", "-mno-outline-atomics"] + args = ["-fpic", "-mno-outline-atomics", "-fno-plt"] optimizer = _optimizers.OptimizerAArch64 target = _ELF(host, condition, args=args, optimizer=optimizer) elif re.fullmatch(r"i686-pc-windows-msvc", host): @@ -602,7 +598,7 @@ def get_target(host: str) -> _COFF32 | _COFF64 | _ELF | _MachO: elif re.fullmatch(r"x86_64-.*-linux-gnu", host): host = "x86_64-unknown-linux-gnu" condition = "defined(__x86_64__) && defined(__linux__)" - args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0"] + args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0", "-fno-plt"] optimizer = _optimizers.OptimizerX86 target = _ELF(host, condition, args=args, optimizer=optimizer) else: diff --git a/Tools/jit/trampoline.c b/Tools/jit/trampoline.c index 79d6af97961fc9..fdc63b7fc40a89 100644 --- a/Tools/jit/trampoline.c +++ b/Tools/jit/trampoline.c @@ -10,7 +10,7 @@ _Py_CODEUNIT * _JIT_ENTRY( _PyExecutorObject *exec, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate ) { - typedef DECLARE_TARGET((*jit_func)); - jit_func jitted = (jit_func)exec->jit_code; + // Note that this is *not* a tail call + jit_func_preserve_none jitted = (jit_func_preserve_none)exec->jit_code; return jitted(frame, stack_pointer, tstate); }
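The `Tools/jit/trampoline.c` change above hinges on Clang's `preserve_none` calling convention: the JIT entry point keeps the default C convention so the interpreter can call it normally, and only the indirect call through the `jit_func_preserve_none` cast switches conventions; making that an ordinary call rather than a tail call keeps the trampoline's own frame live around the JIT-compiled code. The following is a minimal standalone sketch of that pattern, not CPython's actual implementation: `PRESERVE_NONE`, `jitted_body`, and `enter_jit` are hypothetical names, the call is direct rather than through a cast pointer to keep the sketch short, and the attribute is assumed to be supported only by recent Clang on x86-64/AArch64 (elsewhere the macro falls back to the default convention).

```c
/* Minimal sketch (hypothetical names, not CPython's actual code) of calling
 * preserve_none code from a trampoline that keeps the default convention. */
#include <stdio.h>

#ifndef __has_attribute
#  define __has_attribute(x) 0   /* older compilers without __has_attribute */
#endif
#if __has_attribute(preserve_none)
#  define PRESERVE_NONE __attribute__((preserve_none))
#else
#  define PRESERVE_NONE          /* fall back to the default calling convention */
#endif

/* Stand-in for a JIT-compiled continuation. */
static PRESERVE_NONE int
jitted_body(int frame, int stack)
{
    return frame + stack;
}

/* The trampoline is compiled with the default convention, so ordinary code can
 * call it.  The call below is a plain (non-tail) call, so this frame remains
 * live around the preserve_none code; the real trampoline makes the same kind
 * of call, but indirectly through a pointer cast to jit_func_preserve_none. */
static int
enter_jit(int frame, int stack)
{
    return jitted_body(frame, stack);
}

int
main(void)
{
    printf("%d\n", enter_jit(2, 3));   /* prints 5 */
    return 0;
}
```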