diff --git a/Doc/c-api/code.rst b/Doc/c-api/code.rst index 6eae24b38fae48..42594f063b0709 100644 --- a/Doc/c-api/code.rst +++ b/Doc/c-api/code.rst @@ -182,7 +182,7 @@ bound into a function. Type of a code object watcher callback function. If *event* is ``PY_CODE_EVENT_CREATE``, then the callback is invoked - after `co` has been fully initialized. Otherwise, the callback is invoked + after *co* has been fully initialized. Otherwise, the callback is invoked before the destruction of *co* takes place, so the prior state of *co* can be inspected. diff --git a/Doc/c-api/function.rst b/Doc/c-api/function.rst index 58792edeed25e3..63b78f677674e9 100644 --- a/Doc/c-api/function.rst +++ b/Doc/c-api/function.rst @@ -169,7 +169,7 @@ There are a few functions specific to Python functions. unpredictable effects, including infinite recursion. If *event* is ``PyFunction_EVENT_CREATE``, then the callback is invoked - after `func` has been fully initialized. Otherwise, the callback is invoked + after *func* has been fully initialized. Otherwise, the callback is invoked before the modification to *func* takes place, so the prior state of *func* can be inspected. The runtime is permitted to optimize away the creation of function objects when possible. In such cases no event will be emitted. diff --git a/Doc/c-api/typeobj.rst b/Doc/c-api/typeobj.rst index 5df0c0fe608e53..91046c0e6f18ae 100644 --- a/Doc/c-api/typeobj.rst +++ b/Doc/c-api/typeobj.rst @@ -1238,7 +1238,7 @@ and :c:data:`PyType_Type` effectively act as defaults.) .. c:macro:: Py_TPFLAGS_MANAGED_DICT - This bit indicates that instances of the class have a `~object.__dict__` + This bit indicates that instances of the class have a :attr:`~object.__dict__` attribute, and that the space for the dictionary is managed by the VM. If this flag is set, :c:macro:`Py_TPFLAGS_HAVE_GC` should also be set. diff --git a/Doc/library/annotationlib.rst b/Doc/library/annotationlib.rst index 41c9ce479ff0f8..7dfc11449a6cbc 100644 --- a/Doc/library/annotationlib.rst +++ b/Doc/library/annotationlib.rst @@ -211,6 +211,10 @@ Classes means may not have any information about their scope, so passing arguments to this method may be necessary to evaluate them successfully. + If no *owner*, *globals*, *locals*, or *type_params* are provided and the + :class:`~ForwardRef` does not contain information about its origin, + empty globals and locals dictionaries are used. + .. versionadded:: 3.14 diff --git a/Doc/library/typing.rst b/Doc/library/typing.rst index 54cc3ea3311adf..dd8ea3c364f49a 100644 --- a/Doc/library/typing.rst +++ b/Doc/library/typing.rst @@ -3500,20 +3500,11 @@ Introspection helpers Evaluate an :class:`annotationlib.ForwardRef` as a :term:`type hint`. This is similar to calling :meth:`annotationlib.ForwardRef.evaluate`, - but unlike that method, :func:`!evaluate_forward_ref` also: - - * Recursively evaluates forward references nested within the type hint. - * Raises :exc:`TypeError` when it encounters certain objects that are - not valid type hints. - * Replaces type hints that evaluate to :const:`!None` with - :class:`types.NoneType`. - * Supports the :attr:`~annotationlib.Format.FORWARDREF` and - :attr:`~annotationlib.Format.STRING` formats. + but unlike that method, :func:`!evaluate_forward_ref` also + recursively evaluates forward references nested within the type hint. See the documentation for :meth:`annotationlib.ForwardRef.evaluate` for - the meaning of the *owner*, *globals*, *locals*, and *type_params* parameters. 
- *format* specifies the format of the annotation and is a member of - the :class:`annotationlib.Format` enum. + the meaning of the *owner*, *globals*, *locals*, *type_params*, and *format* parameters. .. versionadded:: 3.14 diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index 7f1bc363861ddf..54d7e62292966e 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -61,6 +61,8 @@ typedef struct _stack_chunk { PyObject * data[1]; /* Variable sized */ } _PyStackChunk; +/* Minimum size of data stack chunk */ +#define _PY_DATA_STACK_CHUNK_SIZE (16*1024) struct _ts { /* See Python/ceval.c for comments explaining most fields */ diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index 1a265c59ff8c08..ce3fcb109f49f7 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -54,11 +54,13 @@ extern "C" { # define _Py_Debug_Free_Threaded 1 # define _Py_Debug_code_object_co_tlbc offsetof(PyCodeObject, co_tlbc) # define _Py_Debug_interpreter_frame_tlbc_index offsetof(_PyInterpreterFrame, tlbc_index) +# define _Py_Debug_interpreter_state_tlbc_generation offsetof(PyInterpreterState, tlbc_indices.tlbc_generation) #else # define _Py_Debug_gilruntimestate_enabled 0 # define _Py_Debug_Free_Threaded 0 # define _Py_Debug_code_object_co_tlbc 0 # define _Py_Debug_interpreter_frame_tlbc_index 0 +# define _Py_Debug_interpreter_state_tlbc_generation 0 #endif @@ -89,6 +91,8 @@ typedef struct _Py_DebugOffsets { uint64_t gil_runtime_state_enabled; uint64_t gil_runtime_state_locked; uint64_t gil_runtime_state_holder; + uint64_t code_object_generation; + uint64_t tlbc_generation; } interpreter_state; // Thread state offset; @@ -216,6 +220,11 @@ typedef struct _Py_DebugOffsets { uint64_t gi_frame_state; } gen_object; + struct _llist_node { + uint64_t next; + uint64_t prev; + } llist_node; + struct _debugger_support { uint64_t eval_breaker; uint64_t remote_debugger_support; @@ -251,6 +260,8 @@ typedef struct _Py_DebugOffsets { .gil_runtime_state_enabled = _Py_Debug_gilruntimestate_enabled, \ .gil_runtime_state_locked = offsetof(PyInterpreterState, _gil.locked), \ .gil_runtime_state_holder = offsetof(PyInterpreterState, _gil.last_holder), \ + .code_object_generation = offsetof(PyInterpreterState, _code_object_generation), \ + .tlbc_generation = _Py_Debug_interpreter_state_tlbc_generation, \ }, \ .thread_state = { \ .size = sizeof(PyThreadState), \ @@ -347,6 +358,10 @@ typedef struct _Py_DebugOffsets { .gi_iframe = offsetof(PyGenObject, gi_iframe), \ .gi_frame_state = offsetof(PyGenObject, gi_frame_state), \ }, \ + .llist_node = { \ + .next = offsetof(struct llist_node, next), \ + .prev = offsetof(struct llist_node, prev), \ + }, \ .debugger_support = { \ .eval_breaker = offsetof(PyThreadState, eval_breaker), \ .remote_debugger_support = offsetof(PyThreadState, remote_debugger_support), \ diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index d896e870630418..356bcaa7c350a1 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -795,6 +795,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(alias)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(align)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all_threads)); 
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(allow_code)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(any)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(append)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index a06d7495bab8e7..aebe798031ce4f 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -286,6 +286,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(alias) STRUCT_FOR_ID(align) STRUCT_FOR_ID(all) + STRUCT_FOR_ID(all_threads) STRUCT_FOR_ID(allow_code) STRUCT_FOR_ID(any) STRUCT_FOR_ID(append) diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index c3e6c77405bfe7..8a29c533b99058 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -726,6 +726,10 @@ typedef struct _PyIndexPool { // Next index to allocate if no free indices are available int32_t next_index; + + // Generation counter incremented on thread creation/destruction + // Used for TLBC cache invalidation in remote debugging + uint32_t tlbc_generation; } _PyIndexPool; typedef union _Py_unique_id_entry { @@ -843,6 +847,8 @@ struct _is { /* The per-interpreter GIL, which might not be used. */ struct _gil_runtime_state _gil; + uint64_t _code_object_generation; + /* ---------- IMPORTANT --------------------------- The fields above this line are declared as early as possible to facilitate out-of-process observability diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index 83301d8aef7697..0fa1fa5af99a92 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -793,6 +793,7 @@ extern "C" { INIT_ID(alias), \ INIT_ID(align), \ INIT_ID(all), \ + INIT_ID(all_threads), \ INIT_ID(allow_code), \ INIT_ID(any), \ INIT_ID(append), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index c0f5f2b17f6609..4982c4532afd89 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -932,6 +932,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(all_threads); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(allow_code); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/Lib/asyncio/tools.py b/Lib/asyncio/tools.py index b2da7d2f6ba10c..3fc4524c008db6 100644 --- a/Lib/asyncio/tools.py +++ b/Lib/asyncio/tools.py @@ -1,11 +1,10 @@ """Tools to analyze tasks running in asyncio programs.""" -from dataclasses import dataclass from collections import defaultdict from itertools import count from enum import Enum import sys -from _remote_debugging import get_all_awaited_by +from _remote_debugging import RemoteUnwinder class NodeType(Enum): @@ -118,6 +117,11 @@ def dfs(v): # ─── PRINT TREE FUNCTION ─────────────────────────────────────── +def get_all_awaited_by(pid): + unwinder = RemoteUnwinder(pid) + return unwinder.get_all_awaited_by() + + def build_async_tree(result, task_emoji="(T)", cor_emoji=""): """ Build a list of strings for pretty-print an async call tree. 
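A minimal usage sketch of the get_all_awaited_by() wrapper added to Lib/asyncio/tools.py above; the PID value (1234) and the printing loop are illustrative assumptions, not part of the patch:

    # Sketch: attach to a running asyncio program by PID (1234 is a placeholder).
    from asyncio.tools import get_all_awaited_by

    awaited_info = get_all_awaited_by(1234)
    for entry in awaited_info:
        # Per-thread awaited-by data, as consumed by helpers such as build_async_tree().
        print(entry)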
diff --git a/Lib/test/.ruff.toml b/Lib/test/.ruff.toml index 7aa8a4785d6844..f1a967203ce4ba 100644 --- a/Lib/test/.ruff.toml +++ b/Lib/test/.ruff.toml @@ -19,5 +19,12 @@ extend-exclude = [ [lint] select = [ + "F401", # Unused import "F811", # Redefinition of unused variable (useful for finding test methods with the same name) ] + +[lint.per-file-ignores] +"*/**/__main__.py" = ["F401"] # Unused import +"test_import/*.py" = ["F401"] # Unused import +"test_importlib/*.py" = ["F401"] # Unused import +"typinganndata/partialexecution/*.py" = ["F401"] # Unused import diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index b7cd7940eb15b3..351d832a26d1df 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -1101,7 +1101,6 @@ def __init__(self): self.started = False def start(self): - import warnings try: f = open(self.procfile, 'r') except OSError as e: @@ -2728,7 +2727,7 @@ def iter_builtin_types(): # Fall back to making a best-effort guess. if hasattr(object, '__flags__'): # Look for any type object with the Py_TPFLAGS_STATIC_BUILTIN flag set. - import datetime + import datetime # noqa: F401 seen = set() for cls, subs in walk_class_hierarchy(object): if cls in seen: diff --git a/Lib/test/support/interpreters/channels.py b/Lib/test/support/interpreters/channels.py index 7a2bd7d63f808f..b25a17b1aabb93 100644 --- a/Lib/test/support/interpreters/channels.py +++ b/Lib/test/support/interpreters/channels.py @@ -6,8 +6,8 @@ # aliases: from _interpchannels import ( - ChannelError, ChannelNotFoundError, ChannelClosedError, - ChannelEmptyError, ChannelNotEmptyError, + ChannelError, ChannelNotFoundError, ChannelClosedError, # noqa: F401 + ChannelEmptyError, ChannelNotEmptyError, # noqa: F401 ) from ._crossinterp import ( UNBOUND_ERROR, UNBOUND_REMOVE, diff --git a/Lib/test/support/interpreters/queues.py b/Lib/test/support/interpreters/queues.py index d6a3197d9e0e26..99987f2f6926b0 100644 --- a/Lib/test/support/interpreters/queues.py +++ b/Lib/test/support/interpreters/queues.py @@ -1,6 +1,5 @@ """Cross-interpreter Queues High Level Module.""" -import pickle import queue import time import weakref diff --git a/Lib/test/test_capi/test_config.py b/Lib/test/test_capi/test_config.py index a2d70dd3af482d..04a27de8d84994 100644 --- a/Lib/test/test_capi/test_config.py +++ b/Lib/test/test_capi/test_config.py @@ -3,7 +3,6 @@ """ import os import sys -import sysconfig import types import unittest from test import support diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index a767f67a02cf56..65d54d1004d647 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -2,7 +2,6 @@ import codecs import html.entities import itertools -import re import sys import unicodedata import unittest diff --git a/Lib/test/test_crossinterp.py b/Lib/test/test_crossinterp.py index c54635eaeab3f9..2fa0077a09bbbb 100644 --- a/Lib/test/test_crossinterp.py +++ b/Lib/test/test_crossinterp.py @@ -1,6 +1,4 @@ import contextlib -import importlib -import importlib.util import itertools import sys import types diff --git a/Lib/test/test_ctypes/_support.py b/Lib/test/test_ctypes/_support.py index 946d654a19aff8..700657a4e41f74 100644 --- a/Lib/test/test_ctypes/_support.py +++ b/Lib/test/test_ctypes/_support.py @@ -3,7 +3,6 @@ import ctypes from _ctypes import Structure, Union, _Pointer, Array, _SimpleCData, CFuncPtr import sys -from test import support _CData = Structure.__base__ diff --git a/Lib/test/test_ctypes/test_byteswap.py 
b/Lib/test/test_ctypes/test_byteswap.py index ea5951603f9324..f14e1aa32e17ab 100644 --- a/Lib/test/test_ctypes/test_byteswap.py +++ b/Lib/test/test_ctypes/test_byteswap.py @@ -1,5 +1,4 @@ import binascii -import ctypes import math import struct import sys diff --git a/Lib/test/test_ctypes/test_generated_structs.py b/Lib/test/test_ctypes/test_generated_structs.py index aa448fad5bbae6..1cb46a82701553 100644 --- a/Lib/test/test_ctypes/test_generated_structs.py +++ b/Lib/test/test_ctypes/test_generated_structs.py @@ -10,7 +10,7 @@ """ import unittest -from test.support import import_helper, verbose +from test.support import import_helper import re from dataclasses import dataclass from functools import cached_property diff --git a/Lib/test/test_decimal.py b/Lib/test/test_decimal.py index 9e298401dc3dcc..c0a1e378583ba8 100644 --- a/Lib/test/test_decimal.py +++ b/Lib/test/test_decimal.py @@ -28,7 +28,6 @@ import math import os, sys import operator -import warnings import pickle, copy import unittest import numbers diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index ad3f669a03043e..291c419066ac5b 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -4,6 +4,7 @@ import importlib import sys import socket +import threading from asyncio import staggered, taskgroups from unittest.mock import ANY from test.support import os_helper, SHORT_TIMEOUT, busy_retry @@ -16,9 +17,7 @@ try: from _remote_debugging import PROCESS_VM_READV_SUPPORTED - from _remote_debugging import get_stack_trace - from _remote_debugging import get_async_stack_trace - from _remote_debugging import get_all_awaited_by + from _remote_debugging import RemoteUnwinder except ImportError: raise unittest.SkipTest("Test only runs when _remote_debugging is available") @@ -34,7 +33,23 @@ def _make_test_script(script_dir, script_basename, source): ) +def get_stack_trace(pid): + unwinder = RemoteUnwinder(pid, all_threads=True) + return unwinder.get_stack_trace() + + +def get_async_stack_trace(pid): + unwinder = RemoteUnwinder(pid) + return unwinder.get_async_stack_trace() + + +def get_all_awaited_by(pid): + unwinder = RemoteUnwinder(pid) + return unwinder.get_all_awaited_by() + + class TestGetStackTrace(unittest.TestCase): + maxDiff = None @skip_if_not_supported @unittest.skipIf( @@ -46,7 +61,7 @@ def test_remote_stack_trace(self): port = find_unused_port() script = textwrap.dedent( f"""\ - import time, sys, socket + import time, sys, socket, threading # Connect to the test process sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) @@ -55,13 +70,16 @@ def bar(): for x in range(100): if x == 50: baz() + def baz(): foo() def foo(): - sock.sendall(b"ready"); time.sleep(10_000) # same line number + sock.sendall(b"ready:thread\\n"); time.sleep(10_000) # same line number - bar() + t = threading.Thread(target=bar) + t.start() + sock.sendall(b"ready:main\\n"); t.join() # same line number """ ) stack_trace = None @@ -82,8 +100,9 @@ def foo(): p = subprocess.Popen([sys.executable, script_name]) client_socket, _ = server_socket.accept() server_socket.close() - response = client_socket.recv(1024) - self.assertEqual(response, b"ready") + response = b"" + while b"ready:main" not in response or b"ready:thread" not in response: + response += client_socket.recv(1024) stack_trace = get_stack_trace(p.pid) except PermissionError: self.skipTest("Insufficient permissions to read the stack trace") @@ -94,13 +113,23 @@ def foo(): p.terminate() 
p.wait(timeout=SHORT_TIMEOUT) - expected_stack_trace = [ - ("foo", script_name, 14), - ("baz", script_name, 11), + thread_expected_stack_trace = [ + ("foo", script_name, 15), + ("baz", script_name, 12), ("bar", script_name, 9), - ("", script_name, 16), + ('Thread.run', threading.__file__, ANY) ] - self.assertEqual(stack_trace, expected_stack_trace) + # Is possible that there are more threads, so we check that the + # expected stack traces are in the result (looking at you Windows!) + self.assertIn((ANY, thread_expected_stack_trace), stack_trace) + + # Check that the main thread stack trace is in the result + frame = ("", script_name, 19) + for _, stack in stack_trace: + if frame in stack: + break + else: + self.fail("Main thread stack trace not found in result") @skip_if_not_supported @unittest.skipIf( @@ -700,13 +729,28 @@ async def main(): ) def test_self_trace(self): stack_trace = get_stack_trace(os.getpid()) + # Is possible that there are more threads, so we check that the + # expected stack traces are in the result (looking at you Windows!) + this_tread_stack = None + for thread_id, stack in stack_trace: + if thread_id == threading.get_native_id(): + this_tread_stack = stack + break + self.assertIsNotNone(this_tread_stack) self.assertEqual( - stack_trace[0], - ( - "TestGetStackTrace.test_self_trace", - __file__, - self.test_self_trace.__code__.co_firstlineno + 6, - ), + stack[:2], + [ + ( + "get_stack_trace", + __file__, + get_stack_trace.__code__.co_firstlineno + 2, + ), + ( + "TestGetStackTrace.test_self_trace", + __file__, + self.test_self_trace.__code__.co_firstlineno + 6, + ), + ] ) diff --git a/Lib/test/test_generated_cases.py b/Lib/test/test_generated_cases.py index a71ddc01d1c045..37046d8e1c02b7 100644 --- a/Lib/test/test_generated_cases.py +++ b/Lib/test/test_generated_cases.py @@ -1,11 +1,9 @@ import contextlib import os -import re import sys import tempfile import unittest -from io import StringIO from test import support from test import test_tools @@ -31,12 +29,11 @@ def skip_if_different_mount_drives(): test_tools.skip_if_missing("cases_generator") with test_tools.imports_under_tool("cases_generator"): - from analyzer import analyze_forest, StackItem + from analyzer import StackItem from cwriter import CWriter import parser from stack import Local, Stack import tier1_generator - import opcode_metadata_generator import optimizer_generator diff --git a/Lib/test/test_genericpath.py b/Lib/test/test_genericpath.py index df07af01fc7540..16c3268fefb034 100644 --- a/Lib/test/test_genericpath.py +++ b/Lib/test/test_genericpath.py @@ -8,7 +8,7 @@ import unittest import warnings from test.support import ( - is_apple, is_emscripten, os_helper, warnings_helper + is_apple, os_helper, warnings_helper ) from test.support.script_helper import assert_python_ok from test.support.os_helper import FakePath diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py index ccbacc7c19b6e6..a12ff5662a73db 100644 --- a/Lib/test/test_gzip.py +++ b/Lib/test/test_gzip.py @@ -9,7 +9,6 @@ import struct import sys import unittest -import warnings from subprocess import PIPE, Popen from test.support import catch_unraisable_exception from test.support import import_helper diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py index de4c8a1670f591..161c7652d7ab11 100644 --- a/Lib/test/test_hashlib.py +++ b/Lib/test/test_hashlib.py @@ -17,7 +17,6 @@ import tempfile import threading import unittest -import warnings from test import support from test.support import _4G, bigmemtest from test.support 
import hashlib_helper diff --git a/Lib/test/test_hmac.py b/Lib/test/test_hmac.py index e898644dd8a552..ff6e1bce0ef801 100644 --- a/Lib/test/test_hmac.py +++ b/Lib/test/test_hmac.py @@ -21,7 +21,6 @@ import hmac import hashlib import random -import test.support import test.support.hashlib_helper as hashlib_helper import types import unittest diff --git a/Lib/test/test_idle.py b/Lib/test/test_idle.py index 3d8b7ecc0ecb6d..ebf572ac5caac1 100644 --- a/Lib/test/test_idle.py +++ b/Lib/test/test_idle.py @@ -16,7 +16,7 @@ # Unittest.main and test.libregrtest.runtest.runtest_inner # call load_tests, when present here, to discover tests to run. -from idlelib.idle_test import load_tests +from idlelib.idle_test import load_tests # noqa: F401 if __name__ == '__main__': tk.NoDefaultRoot() diff --git a/Lib/test/test_interpreters/test_queues.py b/Lib/test/test_interpreters/test_queues.py index 64a2db1230d023..757373904d7a43 100644 --- a/Lib/test/test_interpreters/test_queues.py +++ b/Lib/test/test_interpreters/test_queues.py @@ -9,7 +9,6 @@ _queues = import_helper.import_module('_interpqueues') from test.support import interpreters from test.support.interpreters import queues, _crossinterp -import test._crossinterp_definitions as defs from .utils import _run_output, TestBase as _TestBase diff --git a/Lib/test/test_interpreters/utils.py b/Lib/test/test_interpreters/utils.py index fc4ad662e03b66..c25e0fb7475e7e 100644 --- a/Lib/test/test_interpreters/utils.py +++ b/Lib/test/test_interpreters/utils.py @@ -12,7 +12,6 @@ import threading import types import unittest -import warnings from test import support diff --git a/Lib/test/test_ntpath.py b/Lib/test/test_ntpath.py index f83ef225a6e48e..c3b0bdaebc2329 100644 --- a/Lib/test/test_ntpath.py +++ b/Lib/test/test_ntpath.py @@ -6,8 +6,7 @@ import sys import unittest import warnings -from test.support import cpython_only, os_helper -from test.support import TestFailed, is_emscripten +from test.support import TestFailed, cpython_only, os_helper from test.support.os_helper import FakePath from test import test_genericpath from tempfile import TemporaryFile diff --git a/Lib/test/test_peepholer.py b/Lib/test/test_peepholer.py index 0a9ba578673b39..f33de3d420ca34 100644 --- a/Lib/test/test_peepholer.py +++ b/Lib/test/test_peepholer.py @@ -12,7 +12,7 @@ from test import support from test.support.bytecode_helper import ( - BytecodeTestCase, CfgOptimizationTestCase, CompilationStepTestCase) + BytecodeTestCase, CfgOptimizationTestCase) def compile_pattern_with_fast_locals(pattern): diff --git a/Lib/test/test_pty.py b/Lib/test/test_pty.py index c1728f5019d042..4836f38c388c05 100644 --- a/Lib/test/test_pty.py +++ b/Lib/test/test_pty.py @@ -20,7 +20,6 @@ import signal import socket import io # readline -import warnings TEST_STRING_1 = b"I wish to buy a fish license.\n" TEST_STRING_2 = b"For my pet fish, Eric.\n" diff --git a/Lib/test/test_pydoc/test_pydoc.py b/Lib/test/test_pydoc/test_pydoc.py index 281b24eaa36b80..d1d6f4987def0c 100644 --- a/Lib/test/test_pydoc/test_pydoc.py +++ b/Lib/test/test_pydoc/test_pydoc.py @@ -553,7 +553,7 @@ class object # of the known subclasses of object. (doc.docclass() used to # fail if HeapType was imported before running this test, like # when running tests sequentially.) 
- from _testcapi import HeapType + from _testcapi import HeapType # noqa: F401 except ImportError: pass text = doc.docclass(object) diff --git a/Lib/test/test_pyrepl/test_windows_console.py b/Lib/test/test_pyrepl/test_windows_console.py index a52ae96a83ddde..f9607e02c604ff 100644 --- a/Lib/test/test_pyrepl/test_windows_console.py +++ b/Lib/test/test_pyrepl/test_windows_console.py @@ -386,6 +386,7 @@ def get_event(self, input_records, **kwargs) -> Console: self.console._read_input = self.mock self.console._WindowsConsole__vt_support = kwargs.get("vt_support", False) + self.console.wait = MagicMock(return_value=True) event = self.console.get_event(block=False) return event diff --git a/Lib/test/test_remote_pdb.py b/Lib/test/test_remote_pdb.py index aef8a6b0129092..a1c50af15f3dd2 100644 --- a/Lib/test/test_remote_pdb.py +++ b/Lib/test/test_remote_pdb.py @@ -1,5 +1,4 @@ import io -import time import itertools import json import os @@ -8,16 +7,13 @@ import socket import subprocess import sys -import tempfile import textwrap -import threading import unittest import unittest.mock from contextlib import closing, contextmanager, redirect_stdout, redirect_stderr, ExitStack -from pathlib import Path from test.support import is_wasi, cpython_only, force_color, requires_subprocess, SHORT_TIMEOUT -from test.support.os_helper import temp_dir, TESTFN, unlink -from typing import Dict, List, Optional, Tuple, Union, Any +from test.support.os_helper import TESTFN, unlink +from typing import List import pdb from pdb import _PdbServer, _PdbClient @@ -1434,7 +1430,6 @@ def test_multi_line_commands(self): def _supports_remote_attaching(): - from contextlib import suppress PROCESS_VM_READV_SUPPORTED = False try: diff --git a/Lib/test/test_shutil.py b/Lib/test/test_shutil.py index 62c80aab4b3305..ebb6cf88336249 100644 --- a/Lib/test/test_shutil.py +++ b/Lib/test/test_shutil.py @@ -3492,7 +3492,7 @@ def test_module_all_attribute(self): target_api.append('disk_usage') self.assertEqual(set(shutil.__all__), set(target_api)) with self.assertWarns(DeprecationWarning): - from shutil import ExecError + from shutil import ExecError # noqa: F401 if __name__ == '__main__': diff --git a/Lib/test/test_string/_support.py b/Lib/test/test_string/_support.py index eaa3354a559246..abdddaf187b4fe 100644 --- a/Lib/test/test_string/_support.py +++ b/Lib/test/test_string/_support.py @@ -1,4 +1,3 @@ -import unittest from string.templatelib import Interpolation diff --git a/Lib/test/test_sysconfig.py b/Lib/test/test_sysconfig.py index d30f69ded6643a..2c0df9376abfc6 100644 --- a/Lib/test/test_sysconfig.py +++ b/Lib/test/test_sysconfig.py @@ -32,7 +32,6 @@ from sysconfig.__main__ import _main, _parse_makefile, _get_pybuilddir, _get_json_data_name import _imp import _osx_support -import _sysconfig HAS_USER_BASE = sysconfig._HAS_USER_BASE diff --git a/Lib/test/test_threading.py b/Lib/test/test_threading.py index 0e51e7fc8c5a76..59b3a749d2fffa 100644 --- a/Lib/test/test_threading.py +++ b/Lib/test/test_threading.py @@ -1253,7 +1253,7 @@ def test_start_new_thread_failed(self): # its state should be removed from interpreter' thread states list # to avoid its double cleanup try: - from resource import setrlimit, RLIMIT_NPROC + from resource import setrlimit, RLIMIT_NPROC # noqa: F401 except ImportError as err: self.skipTest(err) # RLIMIT_NPROC is specific to Linux and BSD code = """if 1: diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 2d41a5e5ac0697..e6b19fe1812d44 100644 --- a/Lib/test/test_tokenize.py +++ 
b/Lib/test/test_tokenize.py @@ -1975,6 +1975,10 @@ def test_roundtrip(self): for case in cases: self.check_roundtrip(case) + self.check_roundtrip(r"t'{ {}}'") + self.check_roundtrip(r"t'{f'{ {}}'}{ {}}'") + self.check_roundtrip(r"f'{t'{ {}}'}{ {}}'") + def test_continuation(self): # Balancing continuation diff --git a/Lib/test/test_tools/i18n_data/docstrings.py b/Lib/test/test_tools/i18n_data/docstrings.py index 151a55a4b56ba6..14559a632da158 100644 --- a/Lib/test/test_tools/i18n_data/docstrings.py +++ b/Lib/test/test_tools/i18n_data/docstrings.py @@ -1,7 +1,7 @@ """Module docstring""" # Test docstring extraction -from gettext import gettext as _ +from gettext import gettext as _ # noqa: F401 # Empty docstring diff --git a/Lib/test/test_types.py b/Lib/test/test_types.py index 3097c7ddf05901..9011e0e1962820 100644 --- a/Lib/test/test_types.py +++ b/Lib/test/test_types.py @@ -2516,7 +2516,7 @@ def setUpClass(cls): from test.support import interpreters except ModuleNotFoundError: raise unittest.SkipTest('subinterpreters required') - import test.support.interpreters.channels + import test.support.interpreters.channels # noqa: F401 @cpython_only @no_rerun('channels (and queues) might have a refleak; see gh-122199') diff --git a/Lib/test/test_typing.py b/Lib/test/test_typing.py index 246be22a0d8ec4..ef02e8202fc829 100644 --- a/Lib/test/test_typing.py +++ b/Lib/test/test_typing.py @@ -46,11 +46,10 @@ import textwrap import typing import weakref -import warnings import types from test.support import ( - captured_stderr, cpython_only, infinite_recursion, requires_docstrings, import_helper, run_code, + captured_stderr, cpython_only, requires_docstrings, import_helper, run_code, EqualToForwardRef, ) from test.typinganndata import ( @@ -6859,12 +6858,10 @@ def test_forward_ref_and_final(self): self.assertEqual(hints, {'value': Final}) def test_top_level_class_var(self): - # https://bugs.python.org/issue45166 - with self.assertRaisesRegex( - TypeError, - r'typing.ClassVar\[int\] is not valid as type argument', - ): - get_type_hints(ann_module6) + # This is not meaningful but we don't raise for it. 
+ # https://github.com/python/cpython/issues/133959 + hints = get_type_hints(ann_module6) + self.assertEqual(hints, {'wrong': ClassVar[int]}) def test_get_type_hints_typeddict(self): self.assertEqual(get_type_hints(TotalMovie), {'title': str, 'year': int}) @@ -6967,6 +6964,11 @@ def foo(a: 'Callable[..., T]'): self.assertEqual(get_type_hints(foo, globals(), locals()), {'a': Callable[..., T]}) + def test_special_forms_no_forward(self): + def f(x: ClassVar[int]): + pass + self.assertEqual(get_type_hints(f), {'x': ClassVar[int]}) + def test_special_forms_forward(self): class C: @@ -6982,8 +6984,9 @@ class CF: self.assertEqual(get_type_hints(C, globals())['b'], Final[int]) self.assertEqual(get_type_hints(C, globals())['x'], ClassVar) self.assertEqual(get_type_hints(C, globals())['y'], Final) - with self.assertRaises(TypeError): - get_type_hints(CF, globals()), + lfi = get_type_hints(CF, globals())['b'] + self.assertIs(get_origin(lfi), list) + self.assertEqual(get_args(lfi), (Final[int],)) def test_union_forward_recursion(self): ValueList = List['Value'] @@ -7216,33 +7219,113 @@ class C(Generic[T]): pass class EvaluateForwardRefTests(BaseTestCase): def test_evaluate_forward_ref(self): int_ref = ForwardRef('int') - missing = ForwardRef('missing') + self.assertIs(typing.evaluate_forward_ref(int_ref), int) self.assertIs( typing.evaluate_forward_ref(int_ref, type_params=()), int, ) + self.assertIs( + typing.evaluate_forward_ref(int_ref, format=annotationlib.Format.VALUE), + int, + ) self.assertIs( typing.evaluate_forward_ref( - int_ref, type_params=(), format=annotationlib.Format.FORWARDREF, + int_ref, format=annotationlib.Format.FORWARDREF, ), int, ) + self.assertEqual( + typing.evaluate_forward_ref( + int_ref, format=annotationlib.Format.STRING, + ), + 'int', + ) + + def test_evaluate_forward_ref_undefined(self): + missing = ForwardRef('missing') + with self.assertRaises(NameError): + typing.evaluate_forward_ref(missing) self.assertIs( typing.evaluate_forward_ref( - missing, type_params=(), format=annotationlib.Format.FORWARDREF, + missing, format=annotationlib.Format.FORWARDREF, ), missing, ) self.assertEqual( typing.evaluate_forward_ref( - int_ref, type_params=(), format=annotationlib.Format.STRING, + missing, format=annotationlib.Format.STRING, ), - 'int', + "missing", ) - def test_evaluate_forward_ref_no_type_params(self): - ref = ForwardRef('int') - self.assertIs(typing.evaluate_forward_ref(ref), int) + def test_evaluate_forward_ref_nested(self): + ref = ForwardRef("int | list['str']") + self.assertEqual( + typing.evaluate_forward_ref(ref), + int | list[str], + ) + self.assertEqual( + typing.evaluate_forward_ref(ref, format=annotationlib.Format.FORWARDREF), + int | list[str], + ) + self.assertEqual( + typing.evaluate_forward_ref(ref, format=annotationlib.Format.STRING), + "int | list['str']", + ) + + why = ForwardRef('"\'str\'"') + self.assertIs(typing.evaluate_forward_ref(why), str) + + def test_evaluate_forward_ref_none(self): + none_ref = ForwardRef('None') + self.assertIs(typing.evaluate_forward_ref(none_ref), None) + + def test_globals(self): + A = "str" + ref = ForwardRef('list[A]') + with self.assertRaises(NameError): + typing.evaluate_forward_ref(ref) + self.assertEqual( + typing.evaluate_forward_ref(ref, globals={'A': A}), + list[str], + ) + + def test_owner(self): + ref = ForwardRef("A") + + with self.assertRaises(NameError): + typing.evaluate_forward_ref(ref) + + # We default to the globals of `owner`, + # so it no longer raises `NameError` + self.assertIs( + 
typing.evaluate_forward_ref(ref, owner=Loop), A + ) + + def test_inherited_owner(self): + # owner passed to evaluate_forward_ref + ref = ForwardRef("list['A']") + self.assertEqual( + typing.evaluate_forward_ref(ref, owner=Loop), + list[A], + ) + + # owner set on the ForwardRef + ref = ForwardRef("list['A']", owner=Loop) + self.assertEqual( + typing.evaluate_forward_ref(ref), + list[A], + ) + + def test_partial_evaluation(self): + ref = ForwardRef("list[A]") + with self.assertRaises(NameError): + typing.evaluate_forward_ref(ref) + + self.assertEqual( + typing.evaluate_forward_ref(ref, format=annotationlib.Format.FORWARDREF), + list[EqualToForwardRef('A')], + ) class CollectionsAbcTests(BaseTestCase): diff --git a/Lib/test/test_venv.py b/Lib/test/test_venv.py index 12c30e178aeb51..d62f3fba2d1a94 100644 --- a/Lib/test/test_venv.py +++ b/Lib/test/test_venv.py @@ -1008,7 +1008,7 @@ def do_test_with_pip(self, system_site_packages): err, flags=re.MULTILINE) # Ignore warning about missing optional module: try: - import ssl + import ssl # noqa: F401 except ImportError: err = re.sub( "^WARNING: Disabling truststore since ssl support is missing$", diff --git a/Lib/test/test_webbrowser.py b/Lib/test/test_webbrowser.py index 4c3ea1cd8df13e..6b577ae100e419 100644 --- a/Lib/test/test_webbrowser.py +++ b/Lib/test/test_webbrowser.py @@ -6,7 +6,6 @@ import sys import unittest import webbrowser -from functools import partial from test import support from test.support import import_helper from test.support import is_apple_mobile diff --git a/Lib/test/test_zipfile/__main__.py b/Lib/test/test_zipfile/__main__.py index e25ac946edffe4..90da74ade38c69 100644 --- a/Lib/test/test_zipfile/__main__.py +++ b/Lib/test/test_zipfile/__main__.py @@ -1,6 +1,6 @@ import unittest -from . import load_tests # noqa: F401 +from . 
import load_tests if __name__ == "__main__": diff --git a/Lib/test/test_zstd.py b/Lib/test/test_zstd.py index 34c7c721b1ad32..bc809603cbc629 100644 --- a/Lib/test/test_zstd.py +++ b/Lib/test/test_zstd.py @@ -12,7 +12,6 @@ from test.support.import_helper import import_module from test.support import threading_helper from test.support import _1M -from test.support import Py_GIL_DISABLED _zstd = import_module("_zstd") zstd = import_module("compression.zstd") diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 8d01fd7bce41b0..559a7aecbde2d1 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -274,7 +274,7 @@ def compat(self, token, iterable): toks_append = self.tokens.append startline = token[0] in (NEWLINE, NL) prevstring = False - in_fstring = 0 + in_fstring_or_tstring = 0 for tok in _itertools.chain([token], iterable): toknum, tokval = tok[:2] @@ -293,10 +293,10 @@ def compat(self, token, iterable): else: prevstring = False - if toknum == FSTRING_START: - in_fstring += 1 - elif toknum == FSTRING_END: - in_fstring -= 1 + if toknum in {FSTRING_START, TSTRING_START}: + in_fstring_or_tstring += 1 + elif toknum in {FSTRING_END, TSTRING_END}: + in_fstring_or_tstring -= 1 if toknum == INDENT: indents.append(tokval) continue @@ -311,8 +311,8 @@ def compat(self, token, iterable): elif toknum in {FSTRING_MIDDLE, TSTRING_MIDDLE}: tokval = self.escape_brackets(tokval) - # Insert a space between two consecutive brackets if we are in an f-string - if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring: + # Insert a space between two consecutive brackets if we are in an f-string or t-string + if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring_or_tstring: tokval = ' ' + tokval # Insert a space between two consecutive f-strings diff --git a/Lib/typing.py b/Lib/typing.py index 98af61be8b0716..ed1dd4fc6413a5 100644 --- a/Lib/typing.py +++ b/Lib/typing.py @@ -956,12 +956,8 @@ def evaluate_forward_ref( """Evaluate a forward reference as a type hint. This is similar to calling the ForwardRef.evaluate() method, - but unlike that method, evaluate_forward_ref() also: - - * Recursively evaluates forward references nested within the type hint. - * Rejects certain objects that are not valid type hints. - * Replaces type hints that evaluate to None with types.NoneType. - * Supports the *FORWARDREF* and *STRING* formats. + but unlike that method, evaluate_forward_ref() also + recursively evaluates forward references nested within the type hint. *forward_ref* must be an instance of ForwardRef. 
*owner*, if given, should be the object that holds the annotations that the forward reference @@ -981,23 +977,24 @@ def evaluate_forward_ref( if forward_ref.__forward_arg__ in _recursive_guard: return forward_ref - try: - value = forward_ref.evaluate(globals=globals, locals=locals, - type_params=type_params, owner=owner) - except NameError: - if format == _lazy_annotationlib.Format.FORWARDREF: - return forward_ref - else: - raise - - type_ = _type_check( - value, - "Forward references must evaluate to types.", - is_argument=forward_ref.__forward_is_argument__, - allow_special_forms=forward_ref.__forward_is_class__, - ) + if format is None: + format = _lazy_annotationlib.Format.VALUE + value = forward_ref.evaluate(globals=globals, locals=locals, + type_params=type_params, owner=owner, format=format) + + if (isinstance(value, _lazy_annotationlib.ForwardRef) + and format == _lazy_annotationlib.Format.FORWARDREF): + return value + + if isinstance(value, str): + value = _make_forward_ref(value, module=forward_ref.__forward_module__, + owner=owner or forward_ref.__owner__, + is_argument=forward_ref.__forward_is_argument__, + is_class=forward_ref.__forward_is_class__) + if owner is None: + owner = forward_ref.__owner__ return _eval_type( - type_, + value, globals, locals, type_params, @@ -2338,12 +2335,12 @@ def get_type_hints(obj, globalns=None, localns=None, include_extras=False, # This only affects ForwardRefs. base_globals, base_locals = base_locals, base_globals for name, value in ann.items(): - if value is None: - value = type(None) if isinstance(value, str): value = _make_forward_ref(value, is_argument=False, is_class=True) value = _eval_type(value, base_globals, base_locals, base.__type_params__, format=format, owner=obj) + if value is None: + value = type(None) hints[name] = value if include_extras or format == Format.STRING: return hints @@ -2377,8 +2374,6 @@ def get_type_hints(obj, globalns=None, localns=None, include_extras=False, localns = globalns type_params = getattr(obj, "__type_params__", ()) for name, value in hints.items(): - if value is None: - value = type(None) if isinstance(value, str): # class-level forward refs were handled above, this must be either # a module-level annotation or a function argument annotation @@ -2387,7 +2382,10 @@ def get_type_hints(obj, globalns=None, localns=None, include_extras=False, is_argument=not isinstance(obj, types.ModuleType), is_class=False, ) - hints[name] = _eval_type(value, globalns, localns, type_params, format=format, owner=obj) + value = _eval_type(value, globalns, localns, type_params, format=format, owner=obj) + if value is None: + value = type(None) + hints[name] = value return hints if include_extras else {k: _strip_annotations(t) for k, t in hints.items()} diff --git a/Makefile.pre.in b/Makefile.pre.in index 3ab7c3d6c48ad9..b5703fbe6ae974 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1206,6 +1206,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/unicodeobject.h \ $(srcdir)/Include/warnings.h \ $(srcdir)/Include/weakrefobject.h \ + $(srcdir)/Python/remote_debug.h \ \ pyconfig.h \ $(PARSER_HEADERS) \ diff --git a/Misc/NEWS.d/next/Library/2025-05-12-20-38-57.gh-issue-133960.Aee79f.rst b/Misc/NEWS.d/next/Library/2025-05-12-20-38-57.gh-issue-133960.Aee79f.rst new file mode 100644 index 00000000000000..66e8483b25bc37 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-12-20-38-57.gh-issue-133960.Aee79f.rst @@ -0,0 +1,3 @@ +Simplify and improve :func:`typing.evaluate_forward_ref`. 
It now no longer +raises errors on certain invalid types. In several situations, it is now +able to evaluate forward references that were previously unsupported. diff --git a/Misc/NEWS.d/next/Library/2025-05-23-23-43-39.gh-issue-134582.9POq3l.rst b/Misc/NEWS.d/next/Library/2025-05-23-23-43-39.gh-issue-134582.9POq3l.rst new file mode 100644 index 00000000000000..23e1d5891b685f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-23-23-43-39.gh-issue-134582.9POq3l.rst @@ -0,0 +1 @@ +Fix tokenize.untokenize() round-trip errors related to t-strings braces escaping diff --git a/Modules/_remote_debugging_module.c b/Modules/_remote_debugging_module.c index 8c0f40f835c36e..a13cbd63ad3bd8 100644 --- a/Modules/_remote_debugging_module.c +++ b/Modules/_remote_debugging_module.c @@ -1,5 +1,16 @@ +/****************************************************************************** + * Python Remote Debugging Module + * + * This module provides functionality to debug Python processes remotely by + * reading their memory and reconstructing stack traces and asyncio task states. + ******************************************************************************/ + #define _GNU_SOURCE +/* ============================================================================ + * HEADERS AND INCLUDES + * ============================================================================ */ + #include #include #include @@ -23,6 +34,47 @@ # define HAVE_PROCESS_VM_READV 0 #endif +/* ============================================================================ + * TYPE DEFINITIONS AND STRUCTURES + * ============================================================================ */ + +#define GET_MEMBER(type, obj, offset) (*(type*)((char*)(obj) + (offset))) + +/* Size macros for opaque buffers */ +#define SIZEOF_BYTES_OBJ sizeof(PyBytesObject) +#define SIZEOF_CODE_OBJ sizeof(PyCodeObject) +#define SIZEOF_GEN_OBJ sizeof(PyGenObject) +#define SIZEOF_INTERP_FRAME sizeof(_PyInterpreterFrame) +#define SIZEOF_LLIST_NODE sizeof(struct llist_node) +#define SIZEOF_PAGE_CACHE_ENTRY sizeof(page_cache_entry_t) +#define SIZEOF_PYOBJECT sizeof(PyObject) +#define SIZEOF_SET_OBJ sizeof(PySetObject) +#define SIZEOF_TASK_OBJ 4096 +#define SIZEOF_THREAD_STATE sizeof(PyThreadState) +#define SIZEOF_TYPE_OBJ sizeof(PyTypeObject) +#define SIZEOF_UNICODE_OBJ sizeof(PyUnicodeObject) +#define SIZEOF_LONG_OBJ sizeof(PyLongObject) + +// Calculate the minimum buffer size needed to read interpreter state fields +// We need to read code_object_generation and potentially tlbc_generation +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#endif + +#ifdef Py_GIL_DISABLED +#define INTERP_STATE_MIN_SIZE MAX(MAX(offsetof(PyInterpreterState, _code_object_generation) + sizeof(uint64_t), \ + offsetof(PyInterpreterState, tlbc_indices.tlbc_generation) + sizeof(uint32_t)), \ + offsetof(PyInterpreterState, threads.head) + sizeof(void*)) +#else +#define INTERP_STATE_MIN_SIZE MAX(offsetof(PyInterpreterState, _code_object_generation) + sizeof(uint64_t), \ + offsetof(PyInterpreterState, threads.head) + sizeof(void*)) +#endif +#define INTERP_STATE_BUFFER_SIZE MAX(INTERP_STATE_MIN_SIZE, 256) + + + +// Copied from Modules/_asynciomodule.c because it's not exported + struct _Py_AsyncioModuleDebugOffsets { struct _asyncio_task_object { uint64_t size; @@ -45,6 +97,127 @@ struct _Py_AsyncioModuleDebugOffsets { } asyncio_thread_state; }; +typedef struct { + PyObject_HEAD + proc_handle_t handle; + uintptr_t runtime_start_address; + struct _Py_DebugOffsets debug_offsets; + int async_debug_offsets_available; + struct _Py_AsyncioModuleDebugOffsets async_debug_offsets; + uintptr_t interpreter_addr; + uintptr_t tstate_addr; + uint64_t code_object_generation; + _Py_hashtable_t *code_object_cache; +#ifdef Py_GIL_DISABLED + // TLBC cache invalidation tracking + uint32_t tlbc_generation; // Track TLBC index pool changes + _Py_hashtable_t *tlbc_cache; // Cache of TLBC arrays by code object address +#endif +} RemoteUnwinderObject; + +typedef struct { + PyObject *func_name; + PyObject *file_name; + int first_lineno; + PyObject *linetable; // bytes + uintptr_t addr_code_adaptive; +} CachedCodeMetadata; + +typedef struct { + /* Types */ + PyTypeObject *RemoteDebugging_Type; +} RemoteDebuggingState; + +typedef struct +{ + int lineno; + int end_lineno; + int column; + int end_column; +} LocationInfo; + +typedef struct { + uintptr_t remote_addr; + size_t size; + void *local_copy; +} StackChunkInfo; + +typedef struct { + StackChunkInfo *chunks; + size_t count; +} StackChunkList; + +#include "clinic/_remote_debugging_module.c.h" + +/*[clinic input] +module _remote_debugging +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=5f507d5b2e76a7f7]*/ + + +/* ============================================================================ + * FORWARD DECLARATIONS + * ============================================================================ */ + +static int +parse_tasks_in_set( + RemoteUnwinderObject *unwinder, + uintptr_t set_addr, + PyObject *awaited_by, + int recurse_task +); + +static int +parse_task( + RemoteUnwinderObject *unwinder, + uintptr_t task_address, + PyObject *render_to, + int recurse_task +); + +static int +parse_coro_chain( + RemoteUnwinderObject *unwinder, + uintptr_t coro_address, + PyObject *render_to +); + +/* Forward declarations for task parsing functions */ +static int parse_frame_object( + RemoteUnwinderObject *unwinder, + PyObject** result, + uintptr_t address, + uintptr_t* previous_frame +); + +/* ============================================================================ + * UTILITY FUNCTIONS AND HELPERS + * ============================================================================ */ + +static void +cached_code_metadata_destroy(void *ptr) +{ + CachedCodeMetadata *meta = (CachedCodeMetadata *)ptr; + Py_DECREF(meta->func_name); + Py_DECREF(meta->file_name); + Py_DECREF(meta->linetable); + PyMem_RawFree(meta); +} + +static inline RemoteDebuggingState * +RemoteDebugging_GetState(PyObject *module) +{ + void *state = _PyModule_GetState(module); + assert(state != NULL); + return 
(RemoteDebuggingState *)state; +} + +static inline int +RemoteDebugging_InitState(RemoteDebuggingState *st) +{ + return 0; +} + // Helper to chain exceptions and avoid repetitions static void chain_exceptions(PyObject *exception, const char *string) @@ -54,36 +227,14 @@ chain_exceptions(PyObject *exception, const char *string) _PyErr_ChainExceptions1(exc); } -// Get the PyAsyncioDebug section address for any platform -static uintptr_t -_Py_RemoteDebug_GetAsyncioDebugAddress(proc_handle_t* handle) -{ - uintptr_t address; - -#ifdef MS_WINDOWS - // On Windows, search for asyncio debug in executable or DLL - address = search_windows_map_for_section(handle, "AsyncioD", L"_asyncio"); -#elif defined(__linux__) - // On Linux, search for asyncio debug in executable or DLL - address = search_linux_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); -#elif defined(__APPLE__) && TARGET_OS_OSX - // On macOS, try libpython first, then fall back to python - address = search_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); - if (address == 0) { - PyErr_Clear(); - address = search_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); - } -#else - Py_UNREACHABLE(); -#endif - - return address; -} +/* ============================================================================ + * MEMORY READING FUNCTIONS + * ============================================================================ */ static inline int read_ptr(proc_handle_t *handle, uintptr_t address, uintptr_t *ptr_addr) { - int result = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(void*), ptr_addr); + int result = _Py_RemoteDebug_PagedReadRemoteMemory(handle, address, sizeof(void*), ptr_addr); if (result < 0) { return -1; } @@ -93,7 +244,7 @@ read_ptr(proc_handle_t *handle, uintptr_t address, uintptr_t *ptr_addr) static inline int read_Py_ssize_t(proc_handle_t *handle, uintptr_t address, Py_ssize_t *size) { - int result = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(Py_ssize_t), size); + int result = _Py_RemoteDebug_PagedReadRemoteMemory(handle, address, sizeof(Py_ssize_t), size); if (result < 0) { return -1; } @@ -113,72 +264,53 @@ read_py_ptr(proc_handle_t *handle, uintptr_t address, uintptr_t *ptr_addr) static int read_char(proc_handle_t *handle, uintptr_t address, char *result) { - int res = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(char), result); - if (res < 0) { - return -1; - } - return 0; -} - -static int -read_sized_int(proc_handle_t *handle, uintptr_t address, void *result, size_t size) -{ - int res = _Py_RemoteDebug_ReadRemoteMemory(handle, address, size, result); - if (res < 0) { - return -1; - } - return 0; -} - -static int -read_unsigned_long(proc_handle_t *handle, uintptr_t address, unsigned long *result) -{ - int res = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(unsigned long), result); + int res = _Py_RemoteDebug_PagedReadRemoteMemory(handle, address, sizeof(char), result); if (res < 0) { return -1; } return 0; } -static int -read_pyobj(proc_handle_t *handle, uintptr_t address, PyObject *ptr_addr) -{ - int res = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(PyObject), ptr_addr); - if (res < 0) { - return -1; - } - return 0; -} +/* ============================================================================ + * PYTHON OBJECT READING FUNCTIONS + * ============================================================================ */ static PyObject * read_py_str( - proc_handle_t *handle, - _Py_DebugOffsets* debug_offsets, + RemoteUnwinderObject *unwinder, 
uintptr_t address, Py_ssize_t max_len ) { PyObject *result = NULL; char *buf = NULL; - Py_ssize_t len; - int res = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + debug_offsets->unicode_object.length, - sizeof(Py_ssize_t), - &len + // Read the entire PyUnicodeObject at once + char unicode_obj[SIZEOF_UNICODE_OBJ]; + int res = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_UNICODE_OBJ, + unicode_obj ); if (res < 0) { goto err; } + Py_ssize_t len = GET_MEMBER(Py_ssize_t, unicode_obj, unwinder->debug_offsets.unicode_object.length); + if (len < 0 || len > max_len) { + PyErr_Format(PyExc_RuntimeError, + "Invalid string length (%zd) at 0x%lx", len, address); + return NULL; + } + buf = (char *)PyMem_RawMalloc(len+1); if (buf == NULL) { PyErr_NoMemory(); return NULL; } - size_t offset = debug_offsets->unicode_object.asciiobject_size; - res = _Py_RemoteDebug_ReadRemoteMemory(handle, address + offset, len, buf); + size_t offset = unwinder->debug_offsets.unicode_object.asciiobject_size; + res = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, address + offset, len, buf); if (res < 0) { goto err; } @@ -202,32 +334,40 @@ read_py_str( static PyObject * read_py_bytes( - proc_handle_t *handle, - _Py_DebugOffsets* debug_offsets, - uintptr_t address + RemoteUnwinderObject *unwinder, + uintptr_t address, + Py_ssize_t max_len ) { PyObject *result = NULL; char *buf = NULL; - Py_ssize_t len; - int res = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + debug_offsets->bytes_object.ob_size, - sizeof(Py_ssize_t), - &len + // Read the entire PyBytesObject at once + char bytes_obj[SIZEOF_BYTES_OBJ]; + int res = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_BYTES_OBJ, + bytes_obj ); if (res < 0) { goto err; } + Py_ssize_t len = GET_MEMBER(Py_ssize_t, bytes_obj, unwinder->debug_offsets.bytes_object.ob_size); + if (len < 0 || len > max_len) { + PyErr_Format(PyExc_RuntimeError, + "Invalid string length (%zd) at 0x%lx", len, address); + return NULL; + } + buf = (char *)PyMem_RawMalloc(len+1); if (buf == NULL) { PyErr_NoMemory(); return NULL; } - size_t offset = debug_offsets->bytes_object.ob_sval; - res = _Py_RemoteDebug_ReadRemoteMemory(handle, address + offset, len, buf); + size_t offset = unwinder->debug_offsets.bytes_object.ob_sval; + res = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, address + offset, len, buf); if (res < 0) { goto err; } @@ -249,45 +389,60 @@ read_py_bytes( return NULL; } - - static long -read_py_long(proc_handle_t *handle, _Py_DebugOffsets* offsets, uintptr_t address) +read_py_long( + RemoteUnwinderObject *unwinder, + uintptr_t address +) { unsigned int shift = PYLONG_BITS_IN_DIGIT; - Py_ssize_t size; - uintptr_t lv_tag; - - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, address + offsets->long_object.lv_tag, - sizeof(uintptr_t), - &lv_tag); + // Read the entire PyLongObject at once + char long_obj[SIZEOF_LONG_OBJ]; + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + unwinder->debug_offsets.long_object.size, + long_obj); if (bytes_read < 0) { return -1; } + uintptr_t lv_tag = GET_MEMBER(uintptr_t, long_obj, unwinder->debug_offsets.long_object.lv_tag); int negative = (lv_tag & 3) == 2; - size = lv_tag >> 3; + Py_ssize_t size = lv_tag >> 3; if (size == 0) { return 0; } - digit *digits = (digit *)PyMem_RawMalloc(size * sizeof(digit)); - if (!digits) { - PyErr_NoMemory(); - return -1; - } + // If the long object has inline digits, use them directly + 
digit *digits; + if (size <= _PY_NSMALLNEGINTS + _PY_NSMALLPOSINTS) { + // For small integers, digits are inline in the long_value.ob_digit array + digits = (digit *)PyMem_RawMalloc(size * sizeof(digit)); + if (!digits) { + PyErr_NoMemory(); + return -1; + } + memcpy(digits, long_obj + unwinder->debug_offsets.long_object.ob_digit, size * sizeof(digit)); + } else { + // For larger integers, we need to read the digits separately + digits = (digit *)PyMem_RawMalloc(size * sizeof(digit)); + if (!digits) { + PyErr_NoMemory(); + return -1; + } - bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + offsets->long_object.ob_digit, - sizeof(digit) * size, - digits - ); - if (bytes_read < 0) { - goto error; + bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address + unwinder->debug_offsets.long_object.ob_digit, + sizeof(digit) * size, + digits + ); + if (bytes_read < 0) { + goto error; + } } long long value = 0; @@ -310,44 +465,115 @@ read_py_long(proc_handle_t *handle, _Py_DebugOffsets* offsets, uintptr_t address return -1; } +/* ============================================================================ + * ASYNCIO DEBUG FUNCTIONS + * ============================================================================ */ + +// Get the PyAsyncioDebug section address for any platform +static uintptr_t +_Py_RemoteDebug_GetAsyncioDebugAddress(proc_handle_t* handle) +{ + uintptr_t address; + +#ifdef MS_WINDOWS + // On Windows, search for asyncio debug in executable or DLL + address = search_windows_map_for_section(handle, "AsyncioD", L"_asyncio"); + if (address == 0) { + // Error out: 'python' substring covers both executable and DLL + PyObject *exc = PyErr_GetRaisedException(); + PyErr_SetString(PyExc_RuntimeError, "Failed to find the AsyncioDebug section in the process."); + _PyErr_ChainExceptions1(exc); + } +#elif defined(__linux__) + // On Linux, search for asyncio debug in executable or DLL + address = search_linux_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); + if (address == 0) { + // Error out: 'python' substring covers both executable and DLL + PyObject *exc = PyErr_GetRaisedException(); + PyErr_SetString(PyExc_RuntimeError, "Failed to find the AsyncioDebug section in the process."); + _PyErr_ChainExceptions1(exc); + } +#elif defined(__APPLE__) && TARGET_OS_OSX + // On macOS, try libpython first, then fall back to python + address = search_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); + if (address == 0) { + PyErr_Clear(); + address = search_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); + } + if (address == 0) { + // Error out: 'python' substring covers both executable and DLL + PyObject *exc = PyErr_GetRaisedException(); + PyErr_SetString(PyExc_RuntimeError, "Failed to find the AsyncioDebug section in the process."); + _PyErr_ChainExceptions1(exc); + } +#else + Py_UNREACHABLE(); +#endif + + return address; +} + +static int +read_async_debug( + RemoteUnwinderObject *unwinder +) { + uintptr_t async_debug_addr = _Py_RemoteDebug_GetAsyncioDebugAddress(&unwinder->handle); + if (!async_debug_addr) { + return -1; + } + + size_t size = sizeof(struct _Py_AsyncioModuleDebugOffsets); + int result = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, async_debug_addr, size, &unwinder->async_debug_offsets); + return result; +} + +/* ============================================================================ + * ASYNCIO TASK PARSING FUNCTIONS + * ============================================================================ 
*/ + static PyObject * parse_task_name( - proc_handle_t *handle, - _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, + RemoteUnwinderObject *unwinder, uintptr_t task_address ) { - uintptr_t task_name_addr; - int err = read_py_ptr( - handle, - task_address + async_offsets->asyncio_task_object.task_name, - &task_name_addr); - if (err) { + // Read the entire TaskObj at once + char task_obj[SIZEOF_TASK_OBJ]; + int err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + task_address, + unwinder->async_debug_offsets.asyncio_task_object.size, + task_obj); + if (err < 0) { return NULL; } - // The task name can be a long or a string so we need to check the type + uintptr_t task_name_addr = GET_MEMBER(uintptr_t, task_obj, unwinder->async_debug_offsets.asyncio_task_object.task_name); + task_name_addr &= ~Py_TAG_BITS; - PyObject task_name_obj; - err = read_pyobj( - handle, + // The task name can be a long or a string so we need to check the type + char task_name_obj[SIZEOF_PYOBJECT]; + err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, task_name_addr, - &task_name_obj); - if (err) { + SIZEOF_PYOBJECT, + task_name_obj); + if (err < 0) { return NULL; } - unsigned long flags; - err = read_unsigned_long( - handle, - (uintptr_t)task_name_obj.ob_type + offsets->type_object.tp_flags, - &flags); - if (err) { + // Now read the type object to get the flags + char type_obj[SIZEOF_TYPE_OBJ]; + err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + GET_MEMBER(uintptr_t, task_name_obj, unwinder->debug_offsets.pyobject.ob_type), + SIZEOF_TYPE_OBJ, + type_obj); + if (err < 0) { return NULL; } - if ((flags & Py_TPFLAGS_LONG_SUBCLASS)) { - long res = read_py_long(handle, offsets, task_name_addr); + if ((GET_MEMBER(unsigned long, type_obj, unwinder->debug_offsets.type_object.tp_flags) & Py_TPFLAGS_LONG_SUBCLASS)) { + long res = read_py_long(unwinder, task_name_addr); if (res == -1) { chain_exceptions(PyExc_RuntimeError, "Failed to get task name"); return NULL; @@ -355,355 +581,375 @@ parse_task_name( return PyUnicode_FromFormat("Task-%d", res); } - if(!(flags & Py_TPFLAGS_UNICODE_SUBCLASS)) { + if(!(GET_MEMBER(unsigned long, type_obj, unwinder->debug_offsets.type_object.tp_flags) & Py_TPFLAGS_UNICODE_SUBCLASS)) { PyErr_SetString(PyExc_RuntimeError, "Invalid task name object"); return NULL; } return read_py_str( - handle, - offsets, + unwinder, task_name_addr, 255 ); } -static int -parse_frame_object( - proc_handle_t *handle, - PyObject** result, - struct _Py_DebugOffsets* offsets, - uintptr_t address, - uintptr_t* previous_frame -); - -static int -parse_coro_chain( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, - uintptr_t coro_address, - PyObject *render_to +static int parse_task_awaited_by( + RemoteUnwinderObject *unwinder, + uintptr_t task_address, + PyObject *awaited_by, + int recurse_task ) { - assert((void*)coro_address != NULL); - - uintptr_t gen_type_addr; - int err = read_ptr( - handle, - coro_address + offsets->pyobject.ob_type, - &gen_type_addr); - if (err) { + // Read the entire TaskObj at once + char task_obj[SIZEOF_TASK_OBJ]; + if (_Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, task_address, + unwinder->async_debug_offsets.asyncio_task_object.size, + task_obj) < 0) { return -1; } - PyObject* name = NULL; - uintptr_t prev_frame; - if (parse_frame_object( - handle, - &name, - offsets, - coro_address + offsets->gen_object.gi_iframe, - &prev_frame) - < 0) - { - 
return -1; - } + uintptr_t task_ab_addr = GET_MEMBER(uintptr_t, task_obj, unwinder->async_debug_offsets.asyncio_task_object.task_awaited_by); + task_ab_addr &= ~Py_TAG_BITS; - if (PyList_Append(render_to, name)) { - Py_DECREF(name); - return -1; + if ((void*)task_ab_addr == NULL) { + return 0; } - Py_DECREF(name); - int8_t gi_frame_state; - err = read_sized_int( - handle, - coro_address + offsets->gen_object.gi_frame_state, - &gi_frame_state, - sizeof(int8_t) - ); - if (err) { - return -1; - } + char awaited_by_is_a_set = GET_MEMBER(char, task_obj, unwinder->async_debug_offsets.asyncio_task_object.task_awaited_by_is_set); - if (gi_frame_state == FRAME_SUSPENDED_YIELD_FROM) { - char owner; - err = read_char( - handle, - coro_address + offsets->gen_object.gi_iframe + - offsets->interpreter_frame.owner, - &owner - ); - if (err) { + if (awaited_by_is_a_set) { + if (parse_tasks_in_set(unwinder, task_ab_addr, awaited_by, recurse_task)) { return -1; } - if (owner != FRAME_OWNED_BY_GENERATOR) { - PyErr_SetString( - PyExc_RuntimeError, - "generator doesn't own its frame \\_o_/"); + } else { + if (parse_task(unwinder, task_ab_addr, awaited_by, recurse_task)) { return -1; } + } + + return 0; +} + +static int +handle_yield_from_frame( + RemoteUnwinderObject *unwinder, + uintptr_t gi_iframe_addr, + uintptr_t gen_type_addr, + PyObject *render_to +) { + // Read the entire interpreter frame at once + char iframe[SIZEOF_INTERP_FRAME]; + int err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + gi_iframe_addr, + SIZEOF_INTERP_FRAME, + iframe); + if (err < 0) { + return -1; + } + + if (GET_MEMBER(char, iframe, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_GENERATOR) { + PyErr_SetString( + PyExc_RuntimeError, + "generator doesn't own its frame \\_o_/"); + return -1; + } + + uintptr_t stackpointer_addr = GET_MEMBER(uintptr_t, iframe, unwinder->debug_offsets.interpreter_frame.stackpointer); + stackpointer_addr &= ~Py_TAG_BITS; - uintptr_t stackpointer_addr; + if ((void*)stackpointer_addr != NULL) { + uintptr_t gi_await_addr; err = read_py_ptr( - handle, - coro_address + offsets->gen_object.gi_iframe + - offsets->interpreter_frame.stackpointer, - &stackpointer_addr); + &unwinder->handle, + stackpointer_addr - sizeof(void*), + &gi_await_addr); if (err) { return -1; } - if ((void*)stackpointer_addr != NULL) { - uintptr_t gi_await_addr; - err = read_py_ptr( - handle, - stackpointer_addr - sizeof(void*), - &gi_await_addr); + if ((void*)gi_await_addr != NULL) { + uintptr_t gi_await_addr_type_addr; + err = read_ptr( + &unwinder->handle, + gi_await_addr + unwinder->debug_offsets.pyobject.ob_type, + &gi_await_addr_type_addr); if (err) { return -1; } - if ((void*)gi_await_addr != NULL) { - uintptr_t gi_await_addr_type_addr; - int err = read_ptr( - handle, - gi_await_addr + offsets->pyobject.ob_type, - &gi_await_addr_type_addr); + if (gen_type_addr == gi_await_addr_type_addr) { + /* This needs an explanation. We always start with parsing + native coroutine / generator frames. Ultimately they + are awaiting on something. That something can be + a native coroutine frame or... an iterator. + If it's the latter -- we can't continue building + our chain. So the condition to bail out of this is + to do that when the type of the current coroutine + doesn't match the type of whatever it points to + in its cr_await. + */ + err = parse_coro_chain(unwinder, gi_await_addr, render_to); if (err) { return -1; } - - if (gen_type_addr == gi_await_addr_type_addr) { - /* This needs an explanation. 
We always start with parsing - native coroutine / generator frames. Ultimately they - are awaiting on something. That something can be - a native coroutine frame or... an iterator. - If it's the latter -- we can't continue building - our chain. So the condition to bail out of this is - to do that when the type of the current coroutine - doesn't match the type of whatever it points to - in its cr_await. - */ - err = parse_coro_chain( - handle, - offsets, - async_offsets, - gi_await_addr, - render_to - ); - if (err) { - return -1; - } - } } } - } return 0; } - static int -parse_task_awaited_by( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, - uintptr_t task_address, - PyObject *awaited_by, - int recurse_task -); +parse_coro_chain( + RemoteUnwinderObject *unwinder, + uintptr_t coro_address, + PyObject *render_to +) { + assert((void*)coro_address != NULL); + + // Read the entire generator object at once + char gen_object[SIZEOF_GEN_OBJ]; + int err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + coro_address, + SIZEOF_GEN_OBJ, + gen_object); + if (err < 0) { + return -1; + } + uintptr_t gen_type_addr = GET_MEMBER(uintptr_t, gen_object, unwinder->debug_offsets.pyobject.ob_type); -static int -parse_task( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, + PyObject* name = NULL; + + // Parse the previous frame using the gi_iframe from local copy + uintptr_t prev_frame; + uintptr_t gi_iframe_addr = coro_address + unwinder->debug_offsets.gen_object.gi_iframe; + if (parse_frame_object(unwinder, &name, gi_iframe_addr, &prev_frame) < 0) { + return -1; + } + + if (PyList_Append(render_to, name)) { + Py_DECREF(name); + return -1; + } + Py_DECREF(name); + + if (GET_MEMBER(int8_t, gen_object, unwinder->debug_offsets.gen_object.gi_frame_state) == FRAME_SUSPENDED_YIELD_FROM) { + return handle_yield_from_frame(unwinder, gi_iframe_addr, gen_type_addr, render_to); + } + + return 0; +} + +static PyObject* +create_task_result( + RemoteUnwinderObject *unwinder, uintptr_t task_address, - PyObject *render_to, int recurse_task ) { - char is_task; - int err = read_char( - handle, - task_address + async_offsets->asyncio_task_object.task_is_task, - &is_task); - if (err) { - return -1; - } + PyObject* result = NULL; + PyObject *call_stack = NULL; + PyObject *tn = NULL; + char task_obj[SIZEOF_TASK_OBJ]; + uintptr_t coro_addr; - PyObject* result = PyList_New(0); + result = PyList_New(0); if (result == NULL) { - return -1; + goto error; } - PyObject *call_stack = PyList_New(0); + call_stack = PyList_New(0); if (call_stack == NULL) { - goto err; + goto error; } + if (PyList_Append(result, call_stack)) { - Py_DECREF(call_stack); - goto err; + goto error; } - /* we can operate on a borrowed one to simplify cleanup */ - Py_DECREF(call_stack); + Py_CLEAR(call_stack); - if (is_task) { - PyObject *tn = NULL; - if (recurse_task) { - tn = parse_task_name( - handle, offsets, async_offsets, task_address); - } else { - tn = PyLong_FromUnsignedLongLong(task_address); + if (recurse_task) { + tn = parse_task_name(unwinder, task_address); + } else { + tn = PyLong_FromUnsignedLongLong(task_address); + } + if (tn == NULL) { + goto error; + } + + if (PyList_Append(result, tn)) { + goto error; + } + Py_CLEAR(tn); + + // Parse coroutine chain + if (_Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, task_address, + unwinder->async_debug_offsets.asyncio_task_object.size, + task_obj) < 0) { + 
goto error; + } + + coro_addr = GET_MEMBER(uintptr_t, task_obj, unwinder->async_debug_offsets.asyncio_task_object.task_coro); + coro_addr &= ~Py_TAG_BITS; + + if ((void*)coro_addr != NULL) { + call_stack = PyList_New(0); + if (call_stack == NULL) { + goto error; } - if (tn == NULL) { - goto err; + + if (parse_coro_chain(unwinder, coro_addr, call_stack) < 0) { + Py_DECREF(call_stack); + goto error; } - if (PyList_Append(result, tn)) { - Py_DECREF(tn); - goto err; + + if (PyList_Reverse(call_stack)) { + Py_DECREF(call_stack); + goto error; } - Py_DECREF(tn); - uintptr_t coro_addr; - err = read_py_ptr( - handle, - task_address + async_offsets->asyncio_task_object.task_coro, - &coro_addr); - if (err) { - goto err; + if (PyList_SetItem(result, 0, call_stack) < 0) { + Py_DECREF(call_stack); + goto error; } + } - if ((void*)coro_addr != NULL) { - err = parse_coro_chain( - handle, - offsets, - async_offsets, - coro_addr, - call_stack - ); - if (err) { - goto err; - } + return result; - if (PyList_Reverse(call_stack)) { - goto err; - } +error: + Py_XDECREF(result); + Py_XDECREF(call_stack); + Py_XDECREF(tn); + return NULL; +} + +static int +parse_task( + RemoteUnwinderObject *unwinder, + uintptr_t task_address, + PyObject *render_to, + int recurse_task +) { + char is_task; + PyObject* result = NULL; + PyObject* awaited_by = NULL; + int err; + + err = read_char( + &unwinder->handle, + task_address + unwinder->async_debug_offsets.asyncio_task_object.task_is_task, + &is_task); + if (err) { + goto error; + } + + if (is_task) { + result = create_task_result(unwinder, task_address, recurse_task); + if (!result) { + goto error; + } + } else { + result = PyList_New(0); + if (result == NULL) { + goto error; } } if (PyList_Append(render_to, result)) { - goto err; + goto error; } if (recurse_task) { - PyObject *awaited_by = PyList_New(0); + awaited_by = PyList_New(0); if (awaited_by == NULL) { - goto err; + goto error; } + if (PyList_Append(result, awaited_by)) { - Py_DECREF(awaited_by); - goto err; + goto error; } - /* we can operate on a borrowed one to simplify cleanup */ Py_DECREF(awaited_by); - if (parse_task_awaited_by(handle, offsets, async_offsets, - task_address, awaited_by, 1) - ) { - goto err; + /* awaited_by is borrowed from 'result' to simplify cleanup */ + if (parse_task_awaited_by(unwinder, task_address, awaited_by, 1) < 0) { + // Clear the pointer so the cleanup doesn't try to decref it since + // it's borrowed from 'result' and will be decrefed when result is + // deleted. 
+ awaited_by = NULL; + goto error; } } Py_DECREF(result); return 0; -err: - Py_DECREF(result); +error: + Py_XDECREF(result); + Py_XDECREF(awaited_by); return -1; } static int -parse_tasks_in_set( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, - uintptr_t set_addr, +process_set_entry( + RemoteUnwinderObject *unwinder, + uintptr_t table_ptr, PyObject *awaited_by, int recurse_task ) { - uintptr_t set_obj; - if (read_py_ptr( - handle, - set_addr, - &set_obj) - ) { + uintptr_t key_addr; + if (read_py_ptr(&unwinder->handle, table_ptr, &key_addr)) { return -1; } - Py_ssize_t num_els; - if (read_Py_ssize_t( - handle, - set_obj + offsets->set_object.used, - &num_els) - ) { - return -1; - } + if ((void*)key_addr != NULL) { + Py_ssize_t ref_cnt; + if (read_Py_ssize_t(&unwinder->handle, table_ptr, &ref_cnt)) { + return -1; + } - Py_ssize_t set_len; - if (read_Py_ssize_t( - handle, - set_obj + offsets->set_object.mask, - &set_len) - ) { - return -1; + if (ref_cnt) { + // if 'ref_cnt=0' it's a set dummy marker + if (parse_task(unwinder, key_addr, awaited_by, recurse_task)) { + return -1; + } + return 1; // Successfully processed a valid entry + } } - set_len++; // The set contains the `mask+1` element slots. + return 0; // Entry was NULL or dummy marker +} - uintptr_t table_ptr; - if (read_ptr( - handle, - set_obj + offsets->set_object.table, - &table_ptr) - ) { +static int +parse_tasks_in_set( + RemoteUnwinderObject *unwinder, + uintptr_t set_addr, + PyObject *awaited_by, + int recurse_task +) { + char set_object[SIZEOF_SET_OBJ]; + int err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + set_addr, + SIZEOF_SET_OBJ, + set_object); + if (err < 0) { return -1; } + Py_ssize_t num_els = GET_MEMBER(Py_ssize_t, set_object, unwinder->debug_offsets.set_object.used); + Py_ssize_t set_len = GET_MEMBER(Py_ssize_t, set_object, unwinder->debug_offsets.set_object.mask) + 1; // The set contains the `mask+1` element slots. 
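+    // Layout note: the remote table is assumed to follow CPython's setentry
+    // layout (a PyObject* key followed by a Py_hash_t hash), so every slot
+    // spans two pointer-sized words. That is why each candidate entry below
+    // is read at table_ptr and the cursor then advances by sizeof(void*) * 2.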
+ uintptr_t table_ptr = GET_MEMBER(uintptr_t, set_object, unwinder->debug_offsets.set_object.table); + Py_ssize_t i = 0; Py_ssize_t els = 0; - while (i < set_len) { - uintptr_t key_addr; - if (read_py_ptr(handle, table_ptr, &key_addr)) { + while (i < set_len && els < num_els) { + int result = process_set_entry(unwinder, table_ptr, awaited_by, recurse_task); + + if (result < 0) { return -1; } - - if ((void*)key_addr != NULL) { - Py_ssize_t ref_cnt; - if (read_Py_ssize_t(handle, table_ptr, &ref_cnt)) { - return -1; - } - - if (ref_cnt) { - // if 'ref_cnt=0' it's a set dummy marker - - if (parse_task( - handle, - offsets, - async_offsets, - key_addr, - awaited_by, - recurse_task - ) - ) { - return -1; - } - - if (++els == num_els) { - break; - } - } + if (result > 0) { + els++; } table_ptr += sizeof(void*) * 2; @@ -714,81 +960,224 @@ parse_tasks_in_set( static int -parse_task_awaited_by( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, - uintptr_t task_address, - PyObject *awaited_by, - int recurse_task +setup_async_result_structure(PyObject **result, PyObject **calls) +{ + *result = PyList_New(1); + if (*result == NULL) { + return -1; + } + + *calls = PyList_New(0); + if (*calls == NULL) { + Py_DECREF(*result); + *result = NULL; + return -1; + } + + if (PyList_SetItem(*result, 0, *calls)) { /* steals ref to 'calls' */ + Py_DECREF(*calls); + Py_DECREF(*result); + *result = NULL; + *calls = NULL; + return -1; + } + + return 0; +} + +static int +add_task_info_to_result( + RemoteUnwinderObject *self, + PyObject *result, + uintptr_t running_task_addr ) { - uintptr_t task_ab_addr; - int err = read_py_ptr( - handle, - task_address + async_offsets->asyncio_task_object.task_awaited_by, - &task_ab_addr); - if (err) { + PyObject *tn = parse_task_name(self, running_task_addr); + if (tn == NULL) { return -1; } - if ((void*)task_ab_addr == NULL) { - return 0; + if (PyList_Append(result, tn)) { + Py_DECREF(tn); + return -1; } + Py_DECREF(tn); - char awaited_by_is_a_set; - err = read_char( - handle, - task_address + async_offsets->asyncio_task_object.task_awaited_by_is_set, - &awaited_by_is_a_set); - if (err) { + PyObject* awaited_by = PyList_New(0); + if (awaited_by == NULL) { return -1; } - if (awaited_by_is_a_set) { - if (parse_tasks_in_set( - handle, - offsets, - async_offsets, - task_address + async_offsets->asyncio_task_object.task_awaited_by, - awaited_by, - recurse_task - ) - ) { - return -1; - } - } else { - uintptr_t sub_task; - if (read_py_ptr( - handle, - task_address + async_offsets->asyncio_task_object.task_awaited_by, - &sub_task) - ) { - return -1; - } + if (PyList_Append(result, awaited_by)) { + Py_DECREF(awaited_by); + return -1; + } + Py_DECREF(awaited_by); - if (parse_task( - handle, - offsets, - async_offsets, - sub_task, - awaited_by, - recurse_task - ) - ) { - return -1; - } + if (parse_task_awaited_by( + self, running_task_addr, awaited_by, 1) < 0) { + return -1; } return 0; } -typedef struct +static int +process_single_task_node( + RemoteUnwinderObject *unwinder, + uintptr_t task_addr, + PyObject *result +) { + PyObject *tn = NULL; + PyObject *current_awaited_by = NULL; + PyObject *task_id = NULL; + PyObject *result_item = NULL; + + tn = parse_task_name(unwinder, task_addr); + if (tn == NULL) { + goto error; + } + + current_awaited_by = PyList_New(0); + if (current_awaited_by == NULL) { + goto error; + } + + task_id = PyLong_FromUnsignedLongLong(task_addr); + if (task_id == NULL) { + goto error; + } + + result_item = 
PyTuple_New(3); + if (result_item == NULL) { + goto error; + } + + PyTuple_SET_ITEM(result_item, 0, task_id); // steals ref + PyTuple_SET_ITEM(result_item, 1, tn); // steals ref + PyTuple_SET_ITEM(result_item, 2, current_awaited_by); // steals ref + + // References transferred to tuple + task_id = NULL; + tn = NULL; + current_awaited_by = NULL; + + if (PyList_Append(result, result_item)) { + Py_DECREF(result_item); + return -1; + } + Py_DECREF(result_item); + + // Get back current_awaited_by reference for parse_task_awaited_by + current_awaited_by = PyTuple_GET_ITEM(result_item, 2); + if (parse_task_awaited_by(unwinder, task_addr, current_awaited_by, 0) < 0) { + return -1; + } + + return 0; + +error: + Py_XDECREF(tn); + Py_XDECREF(current_awaited_by); + Py_XDECREF(task_id); + Py_XDECREF(result_item); + return -1; +} + +/* ============================================================================ + * TLBC CACHING FUNCTIONS + * ============================================================================ */ + +#ifdef Py_GIL_DISABLED + +typedef struct { + void *tlbc_array; // Local copy of the TLBC array + Py_ssize_t tlbc_array_size; // Size of the TLBC array + uint32_t generation; // Generation when this was cached +} TLBCCacheEntry; + +static void +tlbc_cache_entry_destroy(void *ptr) { - int lineno; - int end_lineno; - int column; - int end_column; -} LocationInfo; + TLBCCacheEntry *entry = (TLBCCacheEntry *)ptr; + if (entry->tlbc_array) { + PyMem_RawFree(entry->tlbc_array); + } + PyMem_RawFree(entry); +} + +static TLBCCacheEntry * +get_tlbc_cache_entry(RemoteUnwinderObject *self, uintptr_t code_addr, uint32_t current_generation) +{ + void *key = (void *)code_addr; + TLBCCacheEntry *entry = _Py_hashtable_get(self->tlbc_cache, key); + + if (entry && entry->generation != current_generation) { + // Entry is stale, remove it by setting to NULL + _Py_hashtable_set(self->tlbc_cache, key, NULL); + entry = NULL; + } + + return entry; +} + +static int +cache_tlbc_array(RemoteUnwinderObject *self, uintptr_t code_addr, uintptr_t tlbc_array_addr, uint32_t generation) +{ + uintptr_t tlbc_array_ptr; + void *tlbc_array = NULL; + TLBCCacheEntry *entry = NULL; + + // Read the TLBC array pointer + if (read_ptr(&self->handle, tlbc_array_addr, &tlbc_array_ptr) != 0 || tlbc_array_ptr == 0) { + return 0; // No TLBC array + } + + // Read the TLBC array size + Py_ssize_t tlbc_size; + if (_Py_RemoteDebug_PagedReadRemoteMemory(&self->handle, tlbc_array_ptr, sizeof(tlbc_size), &tlbc_size) != 0 || tlbc_size <= 0) { + return 0; // Invalid size + } + + // Allocate and read the entire TLBC array + size_t array_data_size = tlbc_size * sizeof(void*); + tlbc_array = PyMem_RawMalloc(sizeof(Py_ssize_t) + array_data_size); + if (!tlbc_array) { + return -1; // Memory error + } + + if (_Py_RemoteDebug_PagedReadRemoteMemory(&self->handle, tlbc_array_ptr, sizeof(Py_ssize_t) + array_data_size, tlbc_array) != 0) { + PyMem_RawFree(tlbc_array); + return 0; // Read error + } + + // Create cache entry + entry = PyMem_RawMalloc(sizeof(TLBCCacheEntry)); + if (!entry) { + PyMem_RawFree(tlbc_array); + return -1; // Memory error + } + + entry->tlbc_array = tlbc_array; + entry->tlbc_array_size = tlbc_size; + entry->generation = generation; + + // Store in cache + void *key = (void *)code_addr; + if (_Py_hashtable_set(self->tlbc_cache, key, entry) < 0) { + tlbc_cache_entry_destroy(entry); + return -1; // Cache error + } + + return 1; // Success +} + + + +#endif + +/* ============================================================================ 
+ * LINE TABLE PARSING FUNCTIONS + * ============================================================================ */ static int scan_varint(const uint8_t **ptr) @@ -818,7 +1207,6 @@ scan_signed_varint(const uint8_t **ptr) } } - static bool parse_linetable(const uintptr_t addrq, const char* linetable, int firstlineno, LocationInfo* info) { @@ -863,7 +1251,9 @@ parse_linetable(const uintptr_t addrq, const char* linetable, int firstlineno, L } default: { uint8_t second_byte = *(ptr++); - assert((second_byte & 128) == 0); + if ((second_byte & 128) != 0) { + return false; + } info->column = code << 3 | (second_byte >> 4); info->end_column = info->column + (second_byte & 15); break; @@ -877,240 +1267,387 @@ parse_linetable(const uintptr_t addrq, const char* linetable, int firstlineno, L return false; } +/* ============================================================================ + * CODE OBJECT AND FRAME PARSING FUNCTIONS + * ============================================================================ */ + static int -read_remote_pointer(proc_handle_t *handle, uintptr_t address, uintptr_t *out_ptr, const char *error_message) +parse_code_object(RemoteUnwinderObject *unwinder, + PyObject **result, + uintptr_t address, + uintptr_t instruction_pointer, + uintptr_t *previous_frame, + int32_t tlbc_index) { - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(void *), out_ptr); - if (bytes_read < 0) { - return -1; + void *key = (void *)address; + CachedCodeMetadata *meta = NULL; + PyObject *func = NULL; + PyObject *file = NULL; + PyObject *linetable = NULL; + PyObject *lineno = NULL; + PyObject *tuple = NULL; + +#ifdef Py_GIL_DISABLED + // In free threading builds, code object addresses might have the low bit set + // as a flag, so we need to mask it off to get the real address + uintptr_t real_address = address & (~1); +#else + uintptr_t real_address = address; +#endif + + if (unwinder && unwinder->code_object_cache != NULL) { + meta = _Py_hashtable_get(unwinder->code_object_cache, key); } - if ((void *)(*out_ptr) == NULL) { - PyErr_SetString(PyExc_RuntimeError, error_message); - return -1; + if (meta == NULL) { + char code_object[SIZEOF_CODE_OBJ]; + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, real_address, SIZEOF_CODE_OBJ, code_object) < 0) + { + goto error; + } + + func = read_py_str(unwinder, + GET_MEMBER(uintptr_t, code_object, unwinder->debug_offsets.code_object.qualname), 1024); + if (!func) { + goto error; + } + + file = read_py_str(unwinder, + GET_MEMBER(uintptr_t, code_object, unwinder->debug_offsets.code_object.filename), 1024); + if (!file) { + goto error; + } + + linetable = read_py_bytes(unwinder, + GET_MEMBER(uintptr_t, code_object, unwinder->debug_offsets.code_object.linetable), 4096); + if (!linetable) { + goto error; + } + + meta = PyMem_RawMalloc(sizeof(CachedCodeMetadata)); + if (!meta) { + goto error; + } + + meta->func_name = func; + meta->file_name = file; + meta->linetable = linetable; + meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno); + meta->addr_code_adaptive = real_address + unwinder->debug_offsets.code_object.co_code_adaptive; + + if (unwinder && unwinder->code_object_cache && _Py_hashtable_set(unwinder->code_object_cache, key, meta) < 0) { + cached_code_metadata_destroy(meta); + goto error; + } + + // Ownership transferred to meta + func = NULL; + file = NULL; + linetable = NULL; + } + + uintptr_t ip = instruction_pointer; + ptrdiff_t addrq; + +#ifdef Py_GIL_DISABLED + // 
Handle thread-local bytecode (TLBC) in free threading builds + if (tlbc_index == 0 || unwinder->debug_offsets.code_object.co_tlbc == 0 || unwinder == NULL) { + // No TLBC or no unwinder - use main bytecode directly + addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive; + goto done_tlbc; + } + + // Try to get TLBC data from cache (we'll get generation from the caller) + TLBCCacheEntry *tlbc_entry = get_tlbc_cache_entry(unwinder, real_address, unwinder->tlbc_generation); + + if (!tlbc_entry) { + // Cache miss - try to read and cache TLBC array + if (cache_tlbc_array(unwinder, real_address, real_address + unwinder->debug_offsets.code_object.co_tlbc, unwinder->tlbc_generation) > 0) { + tlbc_entry = get_tlbc_cache_entry(unwinder, real_address, unwinder->tlbc_generation); + } + } + + if (tlbc_entry && tlbc_index < tlbc_entry->tlbc_array_size) { + // Use cached TLBC data + uintptr_t *entries = (uintptr_t *)((char *)tlbc_entry->tlbc_array + sizeof(Py_ssize_t)); + uintptr_t tlbc_bytecode_addr = entries[tlbc_index]; + + if (tlbc_bytecode_addr != 0) { + // Calculate offset from TLBC bytecode + addrq = (uint16_t *)ip - (uint16_t *)tlbc_bytecode_addr; + goto done_tlbc; + } + } + + // Fall back to main bytecode + addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive; + +done_tlbc: +#else + // Non-free-threaded build, always use the main bytecode + (void)tlbc_index; // Suppress unused parameter warning + (void)unwinder; // Suppress unused parameter warning + addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive; +#endif + ; // Empty statement to avoid C23 extension warning + LocationInfo info = {0}; + bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable), + meta->first_lineno, &info); + if (!ok) { + info.lineno = -1; + } + + lineno = PyLong_FromLong(info.lineno); + if (!lineno) { + goto error; + } + + tuple = PyTuple_New(3); + if (!tuple) { + goto error; } + Py_INCREF(meta->func_name); + Py_INCREF(meta->file_name); + PyTuple_SET_ITEM(tuple, 0, meta->func_name); + PyTuple_SET_ITEM(tuple, 1, meta->file_name); + PyTuple_SET_ITEM(tuple, 2, lineno); + + *result = tuple; return 0; + +error: + Py_XDECREF(func); + Py_XDECREF(file); + Py_XDECREF(linetable); + Py_XDECREF(lineno); + Py_XDECREF(tuple); + return -1; } -static int -read_instruction_ptr(proc_handle_t *handle, struct _Py_DebugOffsets *offsets, - uintptr_t current_frame, uintptr_t *instruction_ptr) +/* ============================================================================ + * STACK CHUNK MANAGEMENT FUNCTIONS + * ============================================================================ */ + +static void +cleanup_stack_chunks(StackChunkList *chunks) { - return read_remote_pointer( - handle, - current_frame + offsets->interpreter_frame.instr_ptr, - instruction_ptr, - "No instruction ptr found" - ); + for (size_t i = 0; i < chunks->count; ++i) { + PyMem_RawFree(chunks->chunks[i].local_copy); + } + PyMem_RawFree(chunks->chunks); } static int -parse_code_object(proc_handle_t *handle, - PyObject **result, - struct _Py_DebugOffsets *offsets, - uintptr_t address, - uintptr_t current_frame, - uintptr_t *previous_frame) -{ - uintptr_t addr_func_name, addr_file_name, addr_linetable, instruction_ptr; +process_single_stack_chunk( + proc_handle_t *handle, + uintptr_t chunk_addr, + StackChunkInfo *chunk_info +) { + // Start with default size assumption + size_t current_size = _PY_DATA_STACK_CHUNK_SIZE; - if (read_remote_pointer(handle, address + offsets->code_object.qualname, &addr_func_name, "No function name found") < 0 
|| - read_remote_pointer(handle, address + offsets->code_object.filename, &addr_file_name, "No file name found") < 0 || - read_remote_pointer(handle, address + offsets->code_object.linetable, &addr_linetable, "No linetable found") < 0 || - read_instruction_ptr(handle, offsets, current_frame, &instruction_ptr) < 0) { + char *this_chunk = PyMem_RawMalloc(current_size); + if (!this_chunk) { + PyErr_NoMemory(); return -1; } - int firstlineno; - if (_Py_RemoteDebug_ReadRemoteMemory(handle, - address + offsets->code_object.firstlineno, - sizeof(int), - &firstlineno) < 0) { + if (_Py_RemoteDebug_PagedReadRemoteMemory(handle, chunk_addr, current_size, this_chunk) < 0) { + PyMem_RawFree(this_chunk); return -1; } - PyObject *py_linetable = read_py_bytes(handle, offsets, addr_linetable); - if (!py_linetable) { - return -1; + // Check actual size and reread if necessary + size_t actual_size = GET_MEMBER(size_t, this_chunk, offsetof(_PyStackChunk, size)); + if (actual_size != current_size) { + this_chunk = PyMem_RawRealloc(this_chunk, actual_size); + if (!this_chunk) { + PyErr_NoMemory(); + return -1; + } + + if (_Py_RemoteDebug_PagedReadRemoteMemory(handle, chunk_addr, actual_size, this_chunk) < 0) { + PyMem_RawFree(this_chunk); + return -1; + } + current_size = actual_size; } - uintptr_t addr_code_adaptive = address + offsets->code_object.co_code_adaptive; - ptrdiff_t addrq = (uint16_t *)instruction_ptr - (uint16_t *)addr_code_adaptive; + chunk_info->remote_addr = chunk_addr; + chunk_info->size = current_size; + chunk_info->local_copy = this_chunk; + return 0; +} - LocationInfo info; - parse_linetable(addrq, PyBytes_AS_STRING(py_linetable), firstlineno, &info); - Py_DECREF(py_linetable); // Done with linetable +static int +copy_stack_chunks(RemoteUnwinderObject *unwinder, + uintptr_t tstate_addr, + StackChunkList *out_chunks) +{ + uintptr_t chunk_addr; + StackChunkInfo *chunks = NULL; + size_t count = 0; + size_t max_chunks = 16; - PyObject *py_line = PyLong_FromLong(info.lineno); - if (!py_line) { + if (read_ptr(&unwinder->handle, tstate_addr + unwinder->debug_offsets.thread_state.datastack_chunk, &chunk_addr)) { return -1; } - PyObject *py_func_name = read_py_str(handle, offsets, addr_func_name, 256); - if (!py_func_name) { - Py_DECREF(py_line); + chunks = PyMem_RawMalloc(max_chunks * sizeof(StackChunkInfo)); + if (!chunks) { + PyErr_NoMemory(); return -1; } - PyObject *py_file_name = read_py_str(handle, offsets, addr_file_name, 256); - if (!py_file_name) { - Py_DECREF(py_line); - Py_DECREF(py_func_name); - return -1; + while (chunk_addr != 0) { + // Grow array if needed + if (count >= max_chunks) { + max_chunks *= 2; + StackChunkInfo *new_chunks = PyMem_RawRealloc(chunks, max_chunks * sizeof(StackChunkInfo)); + if (!new_chunks) { + PyErr_NoMemory(); + goto error; + } + chunks = new_chunks; + } + + // Process this chunk + if (process_single_stack_chunk(&unwinder->handle, chunk_addr, &chunks[count]) < 0) { + goto error; + } + + // Get next chunk address and increment count + chunk_addr = GET_MEMBER(uintptr_t, chunks[count].local_copy, offsetof(_PyStackChunk, previous)); + count++; } - PyObject *result_tuple = PyTuple_New(3); - if (!result_tuple) { - Py_DECREF(py_line); - Py_DECREF(py_func_name); - Py_DECREF(py_file_name); - return -1; + out_chunks->chunks = chunks; + out_chunks->count = count; + return 0; + +error: + for (size_t i = 0; i < count; ++i) { + PyMem_RawFree(chunks[i].local_copy); } + PyMem_RawFree(chunks); + return -1; +} - PyTuple_SET_ITEM(result_tuple, 0, py_func_name); // steals ref - 
PyTuple_SET_ITEM(result_tuple, 1, py_file_name); // steals ref - PyTuple_SET_ITEM(result_tuple, 2, py_line); // steals ref +static void * +find_frame_in_chunks(StackChunkList *chunks, uintptr_t remote_ptr) +{ + for (size_t i = 0; i < chunks->count; ++i) { + uintptr_t base = chunks->chunks[i].remote_addr + offsetof(_PyStackChunk, data); + size_t payload = chunks->chunks[i].size - offsetof(_PyStackChunk, data); - *result = result_tuple; - return 0; + if (remote_ptr >= base && remote_ptr < base + payload) { + return (char *)chunks->chunks[i].local_copy + (remote_ptr - chunks->chunks[i].remote_addr); + } + } + return NULL; } static int -parse_frame_object( - proc_handle_t *handle, - PyObject** result, - struct _Py_DebugOffsets* offsets, +parse_frame_from_chunks( + RemoteUnwinderObject *unwinder, + PyObject **result, uintptr_t address, - uintptr_t* previous_frame + uintptr_t *previous_frame, + StackChunkList *chunks ) { - int err; - - Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + offsets->interpreter_frame.previous, - sizeof(void*), - previous_frame - ); - if (bytes_read < 0) { + void *frame_ptr = find_frame_in_chunks(chunks, address); + if (!frame_ptr) { return -1; } - char owner; - if (read_char(handle, address + offsets->interpreter_frame.owner, &owner)) { - return -1; - } + char *frame = (char *)frame_ptr; + *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); - if (owner >= FRAME_OWNED_BY_INTERPRETER) { + if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) >= FRAME_OWNED_BY_INTERPRETER || + !GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable)) { return 0; } - uintptr_t address_of_code_object; - err = read_py_ptr( - handle, - address + offsets->interpreter_frame.executable, - &address_of_code_object - ); - if (err) { - return -1; - } + uintptr_t instruction_pointer = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.instr_ptr); - if ((void*)address_of_code_object == NULL) { - return 0; + // Get tlbc_index for free threading builds + int32_t tlbc_index = 0; +#ifdef Py_GIL_DISABLED + if (unwinder->debug_offsets.interpreter_frame.tlbc_index != 0) { + tlbc_index = GET_MEMBER(int32_t, frame, unwinder->debug_offsets.interpreter_frame.tlbc_index); } +#endif return parse_code_object( - handle, result, offsets, address_of_code_object, address, previous_frame); + unwinder, result, GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable), + instruction_pointer, previous_frame, tlbc_index); } +/* ============================================================================ + * INTERPRETER STATE AND THREAD DISCOVERY FUNCTIONS + * ============================================================================ */ + static int -parse_async_frame_object( - proc_handle_t *handle, - PyObject** result, - struct _Py_DebugOffsets* offsets, - uintptr_t address, - uintptr_t* previous_frame, - uintptr_t* code_object +populate_initial_state_data( + int all_threads, + RemoteUnwinderObject *unwinder, + uintptr_t runtime_start_address, + uintptr_t *interpreter_state, + uintptr_t *tstate ) { - int err; - - Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + offsets->interpreter_frame.previous, - sizeof(void*), - previous_frame - ); - if (bytes_read < 0) { - return -1; - } + uint64_t interpreter_state_list_head = + unwinder->debug_offsets.runtime_state.interpreters_head; - char owner; - bytes_read = 
_Py_RemoteDebug_ReadRemoteMemory( - handle, address + offsets->interpreter_frame.owner, sizeof(char), &owner); + uintptr_t address_of_interpreter_state; + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + runtime_start_address + interpreter_state_list_head, + sizeof(void*), + &address_of_interpreter_state); if (bytes_read < 0) { return -1; } - if (owner == FRAME_OWNED_BY_CSTACK || owner == FRAME_OWNED_BY_INTERPRETER) { - return 0; // C frame - } - - if (owner != FRAME_OWNED_BY_GENERATOR - && owner != FRAME_OWNED_BY_THREAD) { - PyErr_Format(PyExc_RuntimeError, "Unhandled frame owner %d.\n", owner); + if (address_of_interpreter_state == 0) { + PyErr_SetString(PyExc_RuntimeError, "No interpreter state found"); return -1; } - err = read_py_ptr( - handle, - address + offsets->interpreter_frame.executable, - code_object - ); - if (err) { - return -1; - } + *interpreter_state = address_of_interpreter_state; - assert(code_object != NULL); - if ((void*)*code_object == NULL) { + if (all_threads) { + *tstate = 0; return 0; } - if (parse_code_object( - handle, result, offsets, *code_object, address, previous_frame)) { - return -1; - } - - return 1; -} + uintptr_t address_of_thread = address_of_interpreter_state + + unwinder->debug_offsets.interpreter_state.threads_main; -static int -read_async_debug( - proc_handle_t *handle, - struct _Py_AsyncioModuleDebugOffsets* async_debug -) { - uintptr_t async_debug_addr = _Py_RemoteDebug_GetAsyncioDebugAddress(handle); - if (!async_debug_addr) { + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address_of_thread, + sizeof(void*), + tstate) < 0) { return -1; } - size_t size = sizeof(struct _Py_AsyncioModuleDebugOffsets); - int result = _Py_RemoteDebug_ReadRemoteMemory(handle, async_debug_addr, size, async_debug); - return result; + return 0; } static int find_running_frame( - proc_handle_t *handle, + RemoteUnwinderObject *unwinder, uintptr_t runtime_start_address, - _Py_DebugOffsets* local_debug_offsets, uintptr_t *frame ) { uint64_t interpreter_state_list_head = - local_debug_offsets->runtime_state.interpreters_head; + unwinder->debug_offsets.runtime_state.interpreters_head; uintptr_t address_of_interpreter_state; - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, runtime_start_address + interpreter_state_list_head, sizeof(void*), &address_of_interpreter_state); @@ -1124,10 +1661,10 @@ find_running_frame( } uintptr_t address_of_thread; - bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, + bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, address_of_interpreter_state + - local_debug_offsets->interpreter_state.threads_main, + unwinder->debug_offsets.interpreter_state.threads_main, sizeof(void*), &address_of_thread); if (bytes_read < 0) { @@ -1137,8 +1674,8 @@ find_running_frame( // No Python frames are available for us (can happen at tear-down). 
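+    // Note: only the interpreter's main thread (threads_main) is inspected
+    // here; the asyncio task discovery below walks threads_head instead.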
if ((void*)address_of_thread != NULL) { int err = read_ptr( - handle, - address_of_thread + local_debug_offsets->thread_state.current_frame, + &unwinder->handle, + address_of_thread + unwinder->debug_offsets.thread_state.current_frame, frame); if (err) { return -1; @@ -1152,21 +1689,18 @@ find_running_frame( static int find_running_task( - proc_handle_t *handle, - uintptr_t runtime_start_address, - _Py_DebugOffsets *local_debug_offsets, - struct _Py_AsyncioModuleDebugOffsets *async_offsets, + RemoteUnwinderObject *unwinder, uintptr_t *running_task_addr ) { *running_task_addr = (uintptr_t)NULL; uint64_t interpreter_state_list_head = - local_debug_offsets->runtime_state.interpreters_head; + unwinder->debug_offsets.runtime_state.interpreters_head; uintptr_t address_of_interpreter_state; - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, - runtime_start_address + interpreter_state_list_head, + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + unwinder->runtime_start_address + interpreter_state_list_head, sizeof(void*), &address_of_interpreter_state); if (bytes_read < 0) { @@ -1179,10 +1713,10 @@ find_running_task( } uintptr_t address_of_thread; - bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, + bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, address_of_interpreter_state + - local_debug_offsets->interpreter_state.threads_head, + unwinder->debug_offsets.interpreter_state.threads_head, sizeof(void*), &address_of_thread); if (bytes_read < 0) { @@ -1196,9 +1730,9 @@ find_running_task( } bytes_read = read_py_ptr( - handle, + &unwinder->handle, address_of_thread - + async_offsets->asyncio_thread_state.asyncio_running_loop, + + unwinder->async_debug_offsets.asyncio_thread_state.asyncio_running_loop, &address_of_running_loop); if (bytes_read == -1) { return -1; @@ -1210,9 +1744,9 @@ find_running_task( } int err = read_ptr( - handle, + &unwinder->handle, address_of_thread - + async_offsets->asyncio_thread_state.asyncio_running_task, + + unwinder->async_debug_offsets.asyncio_thread_state.asyncio_running_task, running_task_addr); if (err) { return -1; @@ -1222,579 +1756,936 @@ find_running_task( } static int -append_awaited_by_for_thread( - proc_handle_t *handle, - uintptr_t head_addr, - struct _Py_DebugOffsets *debug_offsets, - struct _Py_AsyncioModuleDebugOffsets *async_offsets, - PyObject *result +find_running_task_and_coro( + RemoteUnwinderObject *self, + uintptr_t *running_task_addr, + uintptr_t *running_coro_addr, + uintptr_t *running_task_code_obj ) { - struct llist_node task_node; - - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - head_addr, - sizeof(task_node), - &task_node)) - { + *running_task_addr = (uintptr_t)NULL; + if (find_running_task( + self, running_task_addr) < 0) { + chain_exceptions(PyExc_RuntimeError, "Failed to find running task"); return -1; } - size_t iteration_count = 0; - const size_t MAX_ITERATIONS = 2 << 15; // A reasonable upper bound - while ((uintptr_t)task_node.next != head_addr) { - if (++iteration_count > MAX_ITERATIONS) { - PyErr_SetString(PyExc_RuntimeError, "Task list appears corrupted"); - return -1; - } - - if (task_node.next == NULL) { - PyErr_SetString( - PyExc_RuntimeError, - "Invalid linked list structure reading remote memory"); - return -1; - } - - uintptr_t task_addr = (uintptr_t)task_node.next - - async_offsets->asyncio_task_object.task_node; - - PyObject *tn = parse_task_name( - handle, - debug_offsets, - async_offsets, - task_addr); - if (tn == NULL) { - return -1; - } - - 
PyObject *current_awaited_by = PyList_New(0); - if (current_awaited_by == NULL) { - Py_DECREF(tn); - return -1; - } - - PyObject* task_id = PyLong_FromUnsignedLongLong(task_addr); - if (task_id == NULL) { - Py_DECREF(tn); - Py_DECREF(current_awaited_by); - return -1; - } - - PyObject *result_item = PyTuple_New(3); - if (result_item == NULL) { - Py_DECREF(tn); - Py_DECREF(current_awaited_by); - Py_DECREF(task_id); - return -1; - } - - PyTuple_SET_ITEM(result_item, 0, task_id); // steals ref - PyTuple_SET_ITEM(result_item, 1, tn); // steals ref - PyTuple_SET_ITEM(result_item, 2, current_awaited_by); // steals ref - if (PyList_Append(result, result_item)) { - Py_DECREF(result_item); - return -1; - } - Py_DECREF(result_item); - - if (parse_task_awaited_by(handle, debug_offsets, async_offsets, - task_addr, current_awaited_by, 0)) - { - return -1; - } - - // onto the next one... - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - (uintptr_t)task_node.next, - sizeof(task_node), - &task_node)) - { - return -1; - } - } - - return 0; -} - -static int -append_awaited_by( - proc_handle_t *handle, - unsigned long tid, - uintptr_t head_addr, - struct _Py_DebugOffsets *debug_offsets, - struct _Py_AsyncioModuleDebugOffsets *async_offsets, - PyObject *result) -{ - PyObject *tid_py = PyLong_FromUnsignedLong(tid); - if (tid_py == NULL) { + if ((void*)*running_task_addr == NULL) { + PyErr_SetString(PyExc_RuntimeError, "No running task found"); return -1; } - PyObject *result_item = PyTuple_New(2); - if (result_item == NULL) { - Py_DECREF(tid_py); + if (read_py_ptr( + &self->handle, + *running_task_addr + self->async_debug_offsets.asyncio_task_object.task_coro, + running_coro_addr) < 0) { + chain_exceptions(PyExc_RuntimeError, "Failed to read running task coro"); return -1; } - PyObject* awaited_by_for_thread = PyList_New(0); - if (awaited_by_for_thread == NULL) { - Py_DECREF(tid_py); - Py_DECREF(result_item); + if ((void*)*running_coro_addr == NULL) { + PyErr_SetString(PyExc_RuntimeError, "Running task coro is NULL"); return -1; } - PyTuple_SET_ITEM(result_item, 0, tid_py); // steals ref - PyTuple_SET_ITEM(result_item, 1, awaited_by_for_thread); // steals ref - if (PyList_Append(result, result_item)) { - Py_DECREF(result_item); + // note: genobject's gi_iframe is an embedded struct so the address to + // the offset leads directly to its first field: f_executable + if (read_py_ptr( + &self->handle, + *running_coro_addr + self->debug_offsets.gen_object.gi_iframe, + running_task_code_obj) < 0) { return -1; } - Py_DECREF(result_item); - if (append_awaited_by_for_thread( - handle, - head_addr, - debug_offsets, - async_offsets, - awaited_by_for_thread)) - { + if ((void*)*running_task_code_obj == NULL) { + PyErr_SetString(PyExc_RuntimeError, "Running task code object is NULL"); return -1; } return 0; } -static PyObject* -get_all_awaited_by(PyObject* self, PyObject* args) -{ -#if (!defined(__linux__) && !defined(__APPLE__)) && !defined(MS_WINDOWS) || \ - (defined(__linux__) && !HAVE_PROCESS_VM_READV) - PyErr_SetString( - PyExc_RuntimeError, - "get_all_awaited_by is not implemented on this platform"); - return NULL; -#endif - int pid; - if (!PyArg_ParseTuple(args, "i", &pid)) { - return NULL; +/* ============================================================================ + * FRAME PARSING FUNCTIONS + * ============================================================================ */ + +static int +parse_frame_object( + RemoteUnwinderObject *unwinder, + PyObject** result, + uintptr_t address, + uintptr_t* 
previous_frame +) { + char frame[SIZEOF_INTERP_FRAME]; + + Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_INTERP_FRAME, + frame + ); + if (bytes_read < 0) { + return -1; } - proc_handle_t the_handle; - proc_handle_t *handle = &the_handle; - if (_Py_RemoteDebug_InitProcHandle(handle, pid) < 0) { + *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); + + if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) >= FRAME_OWNED_BY_INTERPRETER) { return 0; } - PyObject *result = NULL; - - uintptr_t runtime_start_addr = _Py_RemoteDebug_GetPyRuntimeAddress(handle); - if (runtime_start_addr == 0) { - if (!PyErr_Occurred()) { - PyErr_SetString( - PyExc_RuntimeError, "Failed to get .PyRuntime address"); - } - goto result_err; + if ((void*)GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable) == NULL) { + return 0; } - struct _Py_DebugOffsets local_debug_offsets; - if (_Py_RemoteDebug_ReadDebugOffsets(handle, &runtime_start_addr, &local_debug_offsets)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read debug offsets"); - goto result_err; + uintptr_t instruction_pointer = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.instr_ptr); + + // Get tlbc_index for free threading builds + int32_t tlbc_index = 0; +#ifdef Py_GIL_DISABLED + if (unwinder->debug_offsets.interpreter_frame.tlbc_index != 0) { + tlbc_index = GET_MEMBER(int32_t, frame, unwinder->debug_offsets.interpreter_frame.tlbc_index); } +#endif - struct _Py_AsyncioModuleDebugOffsets local_async_debug; - if (read_async_debug(handle, &local_async_debug)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read asyncio debug offsets"); - goto result_err; + return parse_code_object( + unwinder, result, GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable), + instruction_pointer, previous_frame, tlbc_index); +} + +static int +parse_async_frame_object( + RemoteUnwinderObject *unwinder, + PyObject** result, + uintptr_t address, + uintptr_t* previous_frame, + uintptr_t* code_object +) { + char frame[SIZEOF_INTERP_FRAME]; + + Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_INTERP_FRAME, + frame + ); + if (bytes_read < 0) { + return -1; } - result = PyList_New(0); - if (result == NULL) { - goto result_err; + *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); + + if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) == FRAME_OWNED_BY_CSTACK || + GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) == FRAME_OWNED_BY_INTERPRETER) { + return 0; // C frame } - uint64_t interpreter_state_list_head = - local_debug_offsets.runtime_state.interpreters_head; + if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_GENERATOR + && GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_THREAD) { + PyErr_Format(PyExc_RuntimeError, "Unhandled frame owner %d.\n", + GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner)); + return -1; + } - uintptr_t interpreter_state_addr; - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - runtime_start_addr + interpreter_state_list_head, - sizeof(void*), - &interpreter_state_addr)) - { - goto result_err; + *code_object = GET_MEMBER(uintptr_t, frame, 
unwinder->debug_offsets.interpreter_frame.executable); + // Strip tag bits for consistent comparison + *code_object &= ~Py_TAG_BITS; + + assert(code_object != NULL); + if ((void*)*code_object == NULL) { + return 0; } - uintptr_t thread_state_addr; - unsigned long tid = 0; - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - interpreter_state_addr - + local_debug_offsets.interpreter_state.threads_head, - sizeof(void*), - &thread_state_addr)) - { - goto result_err; + uintptr_t instruction_pointer = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.instr_ptr); + + // Get tlbc_index for free threading builds + int32_t tlbc_index = 0; +#ifdef Py_GIL_DISABLED + if (unwinder->debug_offsets.interpreter_frame.tlbc_index != 0) { + tlbc_index = GET_MEMBER(int32_t, frame, unwinder->debug_offsets.interpreter_frame.tlbc_index); } +#endif - uintptr_t head_addr; - while (thread_state_addr != 0) { - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - thread_state_addr - + local_debug_offsets.thread_state.native_thread_id, - sizeof(tid), - &tid)) - { - goto result_err; + if (parse_code_object( + unwinder, result, *code_object, instruction_pointer, previous_frame, tlbc_index)) { + return -1; + } + + return 1; +} + +static int +parse_async_frame_chain( + RemoteUnwinderObject *self, + PyObject *calls, + uintptr_t running_task_code_obj +) { + uintptr_t address_of_current_frame; + if (find_running_frame(self, self->runtime_start_address, &address_of_current_frame) < 0) { + chain_exceptions(PyExc_RuntimeError, "Failed to find running frame"); + return -1; + } + + uintptr_t address_of_code_object; + while ((void*)address_of_current_frame != NULL) { + PyObject* frame_info = NULL; + int res = parse_async_frame_object( + self, + &frame_info, + address_of_current_frame, + &address_of_current_frame, + &address_of_code_object + ); + + if (res < 0) { + chain_exceptions(PyExc_RuntimeError, "Failed to parse async frame object"); + return -1; } - head_addr = thread_state_addr - + local_async_debug.asyncio_thread_state.asyncio_tasks_head; + if (!frame_info) { + continue; + } - if (append_awaited_by(handle, tid, head_addr, &local_debug_offsets, - &local_async_debug, result)) - { - goto result_err; + if (PyList_Append(calls, frame_info) == -1) { + Py_DECREF(frame_info); + return -1; } - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - thread_state_addr + local_debug_offsets.thread_state.next, - sizeof(void*), - &thread_state_addr)) - { - goto result_err; + Py_DECREF(frame_info); + + if (address_of_code_object == running_task_code_obj) { + break; } } - head_addr = interpreter_state_addr - + local_async_debug.asyncio_interpreter_state.asyncio_tasks_head; + return 0; +} - // On top of a per-thread task lists used by default by asyncio to avoid - // contention, there is also a fallback per-interpreter list of tasks; - // any tasks still pending when a thread is destroyed will be moved to the - // per-interpreter task list. It's unlikely we'll find anything here, but - // interesting for debugging. 
- if (append_awaited_by(handle, 0, head_addr, &local_debug_offsets, - &local_async_debug, result)) - { - goto result_err; +/* ============================================================================ + * AWAITED BY PARSING FUNCTIONS + * ============================================================================ */ + +static int +append_awaited_by_for_thread( + RemoteUnwinderObject *unwinder, + uintptr_t head_addr, + PyObject *result +) { + char task_node[SIZEOF_LLIST_NODE]; + + if (_Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, head_addr, + sizeof(task_node), task_node) < 0) { + return -1; } - _Py_RemoteDebug_CleanupProcHandle(handle); - return result; + size_t iteration_count = 0; + const size_t MAX_ITERATIONS = 2 << 15; // A reasonable upper bound -result_err: - Py_XDECREF(result); - _Py_RemoteDebug_CleanupProcHandle(handle); - return NULL; -} + while (GET_MEMBER(uintptr_t, task_node, unwinder->debug_offsets.llist_node.next) != head_addr) { + if (++iteration_count > MAX_ITERATIONS) { + PyErr_SetString(PyExc_RuntimeError, "Task list appears corrupted"); + return -1; + } -static PyObject* -get_stack_trace(PyObject* self, PyObject* args) -{ -#if (!defined(__linux__) && !defined(__APPLE__)) && !defined(MS_WINDOWS) || \ - (defined(__linux__) && !HAVE_PROCESS_VM_READV) - PyErr_SetString( - PyExc_RuntimeError, - "get_stack_trace is not supported on this platform"); - return NULL; -#endif + if (GET_MEMBER(uintptr_t, task_node, unwinder->debug_offsets.llist_node.next) == 0) { + PyErr_SetString(PyExc_RuntimeError, + "Invalid linked list structure reading remote memory"); + return -1; + } - int pid; - if (!PyArg_ParseTuple(args, "i", &pid)) { - return NULL; - } + uintptr_t task_addr = (uintptr_t)GET_MEMBER(uintptr_t, task_node, unwinder->debug_offsets.llist_node.next) + - unwinder->async_debug_offsets.asyncio_task_object.task_node; - proc_handle_t the_handle; - proc_handle_t *handle = &the_handle; - if (_Py_RemoteDebug_InitProcHandle(handle, pid) < 0) { - return 0; + if (process_single_task_node(unwinder, task_addr, result) < 0) { + return -1; + } + + // Read next node + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + (uintptr_t)GET_MEMBER(uintptr_t, task_node, unwinder->debug_offsets.llist_node.next), + sizeof(task_node), + task_node) < 0) { + return -1; + } } - PyObject* result = NULL; + return 0; +} - uintptr_t runtime_start_address = _Py_RemoteDebug_GetPyRuntimeAddress(handle); - if (runtime_start_address == 0) { - if (!PyErr_Occurred()) { - PyErr_SetString( - PyExc_RuntimeError, "Failed to get .PyRuntime address"); - } - goto result_err; +static int +append_awaited_by( + RemoteUnwinderObject *unwinder, + unsigned long tid, + uintptr_t head_addr, + PyObject *result) +{ + PyObject *tid_py = PyLong_FromUnsignedLong(tid); + if (tid_py == NULL) { + return -1; } - struct _Py_DebugOffsets local_debug_offsets; - if (_Py_RemoteDebug_ReadDebugOffsets(handle, &runtime_start_address, &local_debug_offsets)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read debug offsets"); - goto result_err; + PyObject *result_item = PyTuple_New(2); + if (result_item == NULL) { + Py_DECREF(tid_py); + return -1; } - uintptr_t address_of_current_frame; - if (find_running_frame( - handle, runtime_start_address, &local_debug_offsets, - &address_of_current_frame) - ) { - goto result_err; + PyObject* awaited_by_for_thread = PyList_New(0); + if (awaited_by_for_thread == NULL) { + Py_DECREF(tid_py); + Py_DECREF(result_item); + return -1; } - result = PyList_New(0); - if (result == NULL) { - 
goto result_err; + PyTuple_SET_ITEM(result_item, 0, tid_py); // steals ref + PyTuple_SET_ITEM(result_item, 1, awaited_by_for_thread); // steals ref + if (PyList_Append(result, result_item)) { + Py_DECREF(result_item); + return -1; } + Py_DECREF(result_item); - while ((void*)address_of_current_frame != NULL) { - PyObject* frame_info = NULL; - if (parse_frame_object( - handle, - &frame_info, - &local_debug_offsets, - address_of_current_frame, - &address_of_current_frame) - < 0) - { - Py_CLEAR(result); - goto result_err; + if (append_awaited_by_for_thread(unwinder, head_addr, awaited_by_for_thread)) + { + return -1; + } + + return 0; +} + +/* ============================================================================ + * STACK UNWINDING FUNCTIONS + * ============================================================================ */ + +static int +process_frame_chain( + RemoteUnwinderObject *unwinder, + uintptr_t initial_frame_addr, + StackChunkList *chunks, + PyObject *frame_info +) { + uintptr_t frame_addr = initial_frame_addr; + uintptr_t prev_frame_addr = 0; + const size_t MAX_FRAMES = 1024; + size_t frame_count = 0; + + while ((void*)frame_addr != NULL) { + PyObject *frame = NULL; + uintptr_t next_frame_addr = 0; + + if (++frame_count > MAX_FRAMES) { + PyErr_SetString(PyExc_RuntimeError, "Too many stack frames (possible infinite loop)"); + return -1; } - if (!frame_info) { - continue; + // Try chunks first, fallback to direct memory read + if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, chunks) < 0) { + PyErr_Clear(); + if (parse_frame_object(unwinder, &frame, frame_addr, &next_frame_addr) < 0) { + return -1; + } } - if (PyList_Append(result, frame_info) == -1) { - Py_CLEAR(result); - goto result_err; + if (!frame) { + break; } - Py_DECREF(frame_info); - frame_info = NULL; + if (prev_frame_addr && frame_addr != prev_frame_addr) { + PyErr_Format(PyExc_RuntimeError, + "Broken frame chain: expected frame at 0x%lx, got 0x%lx", + prev_frame_addr, frame_addr); + Py_DECREF(frame); + return -1; + } + if (PyList_Append(frame_info, frame) == -1) { + Py_DECREF(frame); + return -1; + } + Py_DECREF(frame); + + prev_frame_addr = next_frame_addr; + frame_addr = next_frame_addr; } -result_err: - _Py_RemoteDebug_CleanupProcHandle(handle); - return result; + return 0; } static PyObject* -get_async_stack_trace(PyObject* self, PyObject* args) -{ -#if (!defined(__linux__) && !defined(__APPLE__)) && !defined(MS_WINDOWS) || \ - (defined(__linux__) && !HAVE_PROCESS_VM_READV) - PyErr_SetString( - PyExc_RuntimeError, - "get_stack_trace is not supported on this platform"); - return NULL; -#endif - int pid; +unwind_stack_for_thread( + RemoteUnwinderObject *unwinder, + uintptr_t *current_tstate +) { + PyObject *frame_info = NULL; + PyObject *thread_id = NULL; + PyObject *result = NULL; + StackChunkList chunks = {0}; - if (!PyArg_ParseTuple(args, "i", &pid)) { - return NULL; + char ts[SIZEOF_THREAD_STATE]; + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, *current_tstate, unwinder->debug_offsets.thread_state.size, ts); + if (bytes_read < 0) { + goto error; } - proc_handle_t the_handle; - proc_handle_t *handle = &the_handle; - if (_Py_RemoteDebug_InitProcHandle(handle, pid) < 0) { - return 0; - } + uintptr_t frame_addr = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.current_frame); - PyObject *result = NULL; + frame_info = PyList_New(0); + if (!frame_info) { + goto error; + } - uintptr_t runtime_start_address = 
_Py_RemoteDebug_GetPyRuntimeAddress(handle); - if (runtime_start_address == 0) { - if (!PyErr_Occurred()) { - PyErr_SetString( - PyExc_RuntimeError, "Failed to get .PyRuntime address"); - } - goto result_err; + if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { + goto error; } - struct _Py_DebugOffsets local_debug_offsets; - if (_Py_RemoteDebug_ReadDebugOffsets(handle, &runtime_start_address, &local_debug_offsets)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read debug offsets"); - goto result_err; + if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info) < 0) { + goto error; } - struct _Py_AsyncioModuleDebugOffsets local_async_debug; - if (read_async_debug(handle, &local_async_debug)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read asyncio debug offsets"); - goto result_err; + *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next); + + thread_id = PyLong_FromLongLong( + GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id)); + if (thread_id == NULL) { + goto error; } - result = PyList_New(1); + result = PyTuple_New(2); if (result == NULL) { - goto result_err; + goto error; } - PyObject* calls = PyList_New(0); - if (calls == NULL) { - goto result_err; + + PyTuple_SET_ITEM(result, 0, thread_id); // Steals reference + PyTuple_SET_ITEM(result, 1, frame_info); // Steals reference + + cleanup_stack_chunks(&chunks); + return result; + +error: + Py_XDECREF(frame_info); + Py_XDECREF(thread_id); + Py_XDECREF(result); + cleanup_stack_chunks(&chunks); + return NULL; +} + + +/* ============================================================================ + * REMOTEUNWINDER CLASS IMPLEMENTATION + * ============================================================================ */ + +/*[clinic input] +class _remote_debugging.RemoteUnwinder "RemoteUnwinderObject *" "&RemoteUnwinder_Type" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=55f164d8803318be]*/ + +/*[clinic input] +_remote_debugging.RemoteUnwinder.__init__ + pid: int + * + all_threads: bool = False + +Initialize a new RemoteUnwinder object for debugging a remote Python process. + +Args: + pid: Process ID of the target Python process to debug + all_threads: If True, initialize state for all threads in the process. + If False, only initialize for the main thread. + +The RemoteUnwinder provides functionality to inspect and debug a running Python +process, including examining thread states, stack frames and other runtime data. 
+ +Raises: + PermissionError: If access to the target process is denied + OSError: If unable to attach to the target process or access its memory + RuntimeError: If unable to read debug information from the target process +[clinic start generated code]*/ + +static int +_remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, + int pid, int all_threads) +/*[clinic end generated code: output=b8027cb247092081 input=6a2056b04e6f050e]*/ +{ + if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) { + return -1; } - if (PyList_SetItem(result, 0, calls)) { /* steals ref to 'calls' */ - Py_DECREF(calls); - goto result_err; + + self->runtime_start_address = _Py_RemoteDebug_GetPyRuntimeAddress(&self->handle); + if (self->runtime_start_address == 0) { + return -1; } - uintptr_t running_task_addr = (uintptr_t)NULL; - if (find_running_task( - handle, runtime_start_address, &local_debug_offsets, &local_async_debug, - &running_task_addr) - ) { - chain_exceptions(PyExc_RuntimeError, "Failed to find running task"); - goto result_err; + if (_Py_RemoteDebug_ReadDebugOffsets(&self->handle, + &self->runtime_start_address, + &self->debug_offsets) < 0) + { + return -1; } - if ((void*)running_task_addr == NULL) { - PyErr_SetString(PyExc_RuntimeError, "No running task found"); - goto result_err; + // Try to read async debug offsets, but don't fail if they're not available + self->async_debug_offsets_available = 1; + if (read_async_debug(self) < 0) { + PyErr_Clear(); + memset(&self->async_debug_offsets, 0, sizeof(self->async_debug_offsets)); + self->async_debug_offsets_available = 0; } - uintptr_t running_coro_addr; - if (read_py_ptr( - handle, - running_task_addr + local_async_debug.asyncio_task_object.task_coro, - &running_coro_addr - )) { - chain_exceptions(PyExc_RuntimeError, "Failed to read running task coro"); - goto result_err; + if (populate_initial_state_data(all_threads, self, self->runtime_start_address, + &self->interpreter_addr ,&self->tstate_addr) < 0) + { + return -1; } - if ((void*)running_coro_addr == NULL) { - PyErr_SetString(PyExc_RuntimeError, "Running task coro is NULL"); - goto result_err; + self->code_object_cache = _Py_hashtable_new_full( + _Py_hashtable_hash_ptr, + _Py_hashtable_compare_direct, + NULL, // keys are stable pointers, don't destroy + cached_code_metadata_destroy, + NULL + ); + if (self->code_object_cache == NULL) { + PyErr_NoMemory(); + return -1; } - // note: genobject's gi_iframe is an embedded struct so the address to - // the offset leads directly to its first field: f_executable - uintptr_t address_of_running_task_code_obj; - if (read_py_ptr( - handle, - running_coro_addr + local_debug_offsets.gen_object.gi_iframe, - &address_of_running_task_code_obj - )) { - goto result_err; +#ifdef Py_GIL_DISABLED + // Initialize TLBC cache + self->tlbc_generation = 0; + self->tlbc_cache = _Py_hashtable_new_full( + _Py_hashtable_hash_ptr, + _Py_hashtable_compare_direct, + NULL, // keys are stable pointers, don't destroy + tlbc_cache_entry_destroy, + NULL + ); + if (self->tlbc_cache == NULL) { + _Py_hashtable_destroy(self->code_object_cache); + PyErr_NoMemory(); + return -1; } +#endif - if ((void*)address_of_running_task_code_obj == NULL) { - PyErr_SetString(PyExc_RuntimeError, "Running task code object is NULL"); - goto result_err; + return 0; +} + +/*[clinic input] +@critical_section +_remote_debugging.RemoteUnwinder.get_stack_trace + +Returns a list of stack traces for all threads in the target process. 
+ +Each element in the returned list is a tuple of (thread_id, frame_list), where: +- thread_id is the OS thread identifier +- frame_list is a list of tuples (function_name, filename, line_number) representing + the Python stack frames for that thread, ordered from most recent to oldest + +Example: + [ + (1234, [ + ('process_data', 'worker.py', 127), + ('run_worker', 'worker.py', 45), + ('main', 'app.py', 23) + ]), + (1235, [ + ('handle_request', 'server.py', 89), + ('serve_forever', 'server.py', 52) + ]) + ] + +Raises: + RuntimeError: If there is an error copying memory from the target process + OSError: If there is an error accessing the target process + PermissionError: If access to the target process is denied + UnicodeDecodeError: If there is an error decoding strings from the target process + +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self) +/*[clinic end generated code: output=666192b90c69d567 input=331dbe370578badf]*/ +{ + PyObject* result = NULL; + // Read interpreter state into opaque buffer + char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &self->handle, + self->interpreter_addr, + INTERP_STATE_BUFFER_SIZE, + interp_state_buffer) < 0) { + goto exit; } - uintptr_t address_of_current_frame; - if (find_running_frame( - handle, runtime_start_address, &local_debug_offsets, - &address_of_current_frame) - ) { - chain_exceptions(PyExc_RuntimeError, "Failed to find running frame"); - goto result_err; + // Get code object generation from buffer + uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer, + self->debug_offsets.interpreter_state.code_object_generation); + + if (code_object_generation != self->code_object_generation) { + self->code_object_generation = code_object_generation; + _Py_hashtable_clear(self->code_object_cache); } - uintptr_t address_of_code_object; - while ((void*)address_of_current_frame != NULL) { - PyObject* frame_info = NULL; - int res = parse_async_frame_object( - handle, - &frame_info, - &local_debug_offsets, - address_of_current_frame, - &address_of_current_frame, - &address_of_code_object - ); +#ifdef Py_GIL_DISABLED + // Check TLBC generation and invalidate cache if needed + uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer, + self->debug_offsets.interpreter_state.tlbc_generation); + if (current_tlbc_generation != self->tlbc_generation) { + self->tlbc_generation = current_tlbc_generation; + _Py_hashtable_clear(self->tlbc_cache); + } +#endif - if (res < 0) { - chain_exceptions(PyExc_RuntimeError, "Failed to parse async frame object"); - goto result_err; - } + uintptr_t current_tstate; + if (self->tstate_addr == 0) { + // Get threads head from buffer + current_tstate = GET_MEMBER(uintptr_t, interp_state_buffer, + self->debug_offsets.interpreter_state.threads_head); + } else { + current_tstate = self->tstate_addr; + } + result = PyList_New(0); + if (!result) { + goto exit; + } + + while (current_tstate != 0) { + PyObject* frame_info = unwind_stack_for_thread(self, ¤t_tstate); if (!frame_info) { - continue; + Py_CLEAR(result); + goto exit; } - if (PyList_Append(calls, frame_info) == -1) { - Py_DECREF(calls); - goto result_err; + if (PyList_Append(result, frame_info) == -1) { + Py_DECREF(frame_info); + Py_CLEAR(result); + goto exit; } - Py_DECREF(frame_info); - frame_info = NULL; - if (address_of_code_object == address_of_running_task_code_obj) { + // We are targeting a single 
tstate, break here + if (self->tstate_addr) { break; } } - PyObject *tn = parse_task_name( - handle, &local_debug_offsets, &local_async_debug, running_task_addr); - if (tn == NULL) { - goto result_err; +exit: + _Py_RemoteDebug_ClearCache(&self->handle); + return result; +} + +/*[clinic input] +@critical_section +_remote_debugging.RemoteUnwinder.get_all_awaited_by + +Get all tasks and their awaited_by relationships from the remote process. + +This provides a tree structure showing which tasks are waiting for other tasks. + +For each task, returns: +1. The call stack frames leading to where the task is currently executing +2. The name of the task +3. A list of tasks that this task is waiting for, with their own frames/names/etc + +Returns a list of [frames, task_name, subtasks] where: +- frames: List of (func_name, filename, lineno) showing the call stack +- task_name: String identifier for the task +- subtasks: List of tasks being awaited by this task, in same format + +Raises: + RuntimeError: If AsyncioDebug section is not available in the remote process + MemoryError: If memory allocation fails + OSError: If reading from the remote process fails + +Example output: +[ + # Task c2_root waiting for two subtasks + [ + # Call stack of c2_root + [("c5", "script.py", 10), ("c4", "script.py", 14)], + "c2_root", + [ + # First subtask (sub_main_2) and what it's waiting for + [ + [("c1", "script.py", 23)], + "sub_main_2", + [...] + ], + # Second subtask and its waiters + [...] + ] + ] +] +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *self) +/*[clinic end generated code: output=6a49cd345e8aec53 input=a452c652bb00701a]*/ +{ + if (!self->async_debug_offsets_available) { + PyErr_SetString(PyExc_RuntimeError, "AsyncioDebug section not available"); + return NULL; } - if (PyList_Append(result, tn)) { - Py_DECREF(tn); + + PyObject *result = PyList_New(0); + if (result == NULL) { goto result_err; } - Py_DECREF(tn); - PyObject* awaited_by = PyList_New(0); - if (awaited_by == NULL) { + uintptr_t thread_state_addr; + unsigned long tid = 0; + if (0 > _Py_RemoteDebug_PagedReadRemoteMemory( + &self->handle, + self->interpreter_addr + + self->debug_offsets.interpreter_state.threads_main, + sizeof(void*), + &thread_state_addr)) + { goto result_err; } - if (PyList_Append(result, awaited_by)) { - Py_DECREF(awaited_by); - goto result_err; + + uintptr_t head_addr; + while (thread_state_addr != 0) { + if (0 > _Py_RemoteDebug_PagedReadRemoteMemory( + &self->handle, + thread_state_addr + + self->debug_offsets.thread_state.native_thread_id, + sizeof(tid), + &tid)) + { + goto result_err; + } + + head_addr = thread_state_addr + + self->async_debug_offsets.asyncio_thread_state.asyncio_tasks_head; + + if (append_awaited_by(self, tid, head_addr, result)) + { + goto result_err; + } + + if (0 > _Py_RemoteDebug_PagedReadRemoteMemory( + &self->handle, + thread_state_addr + self->debug_offsets.thread_state.next, + sizeof(void*), + &thread_state_addr)) + { + goto result_err; + } } - Py_DECREF(awaited_by); - if (parse_task_awaited_by( - handle, &local_debug_offsets, &local_async_debug, - running_task_addr, awaited_by, 1) - ) { + head_addr = self->interpreter_addr + + self->async_debug_offsets.asyncio_interpreter_state.asyncio_tasks_head; + + // On top of a per-thread task lists used by default by asyncio to avoid + // contention, there is also a fallback per-interpreter list of tasks; + // any tasks still pending when a thread is destroyed will be 
moved to the + // per-interpreter task list. It's unlikely we'll find anything here, but + // interesting for debugging. + if (append_awaited_by(self, 0, head_addr, result)) + { goto result_err; } - _Py_RemoteDebug_CleanupProcHandle(handle); + _Py_RemoteDebug_ClearCache(&self->handle); return result; result_err: - _Py_RemoteDebug_CleanupProcHandle(handle); + _Py_RemoteDebug_ClearCache(&self->handle); Py_XDECREF(result); return NULL; } +/*[clinic input] +@critical_section +_remote_debugging.RemoteUnwinder.get_async_stack_trace -static PyMethodDef methods[] = { - {"get_stack_trace", get_stack_trace, METH_VARARGS, - "Get the Python stack from a given pid"}, - {"get_async_stack_trace", get_async_stack_trace, METH_VARARGS, - "Get the asyncio stack from a given pid"}, - {"get_all_awaited_by", get_all_awaited_by, METH_VARARGS, - "Get all tasks and their awaited_by from a given pid"}, - {NULL, NULL, 0, NULL}, +Returns information about the currently running async task and its stack trace. + +Returns a tuple of (task_info, stack_frames) where: +- task_info is a tuple of (task_id, task_name) identifying the task +- stack_frames is a list of tuples (function_name, filename, line_number) representing + the Python stack frames for the task, ordered from most recent to oldest + +Example: + ((4345585712, 'Task-1'), [ + ('run_echo_server', 'server.py', 127), + ('serve_forever', 'server.py', 45), + ('main', 'app.py', 23) + ]) + +Raises: + RuntimeError: If AsyncioDebug section is not available in the target process + RuntimeError: If there is an error copying memory from the target process + OSError: If there is an error accessing the target process + PermissionError: If access to the target process is denied + UnicodeDecodeError: If there is an error decoding strings from the target process + +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject *self) +/*[clinic end generated code: output=6433d52b55e87bbe input=11b7150c59d4c60f]*/ +{ + if (!self->async_debug_offsets_available) { + PyErr_SetString(PyExc_RuntimeError, "AsyncioDebug section not available"); + return NULL; + } + + PyObject *result = NULL; + PyObject *calls = NULL; + + if (setup_async_result_structure(&result, &calls) < 0) { + goto cleanup; + } + + uintptr_t running_task_addr, running_coro_addr, running_task_code_obj; + if (find_running_task_and_coro(self, &running_task_addr, + &running_coro_addr, &running_task_code_obj) < 0) { + goto cleanup; + } + + if (parse_async_frame_chain(self, calls, running_task_code_obj) < 0) { + goto cleanup; + } + + if (add_task_info_to_result(self, result, running_task_addr) < 0) { + goto cleanup; + } + + _Py_RemoteDebug_ClearCache(&self->handle); + return result; + +cleanup: + _Py_RemoteDebug_ClearCache(&self->handle); + Py_XDECREF(result); + return NULL; +} + +static PyMethodDef RemoteUnwinder_methods[] = { + _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF + _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF + _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF + {NULL, NULL} }; -static struct PyModuleDef module = { - .m_base = PyModuleDef_HEAD_INIT, - .m_name = "_remote_debugging", - .m_size = -1, - .m_methods = methods, +static void +RemoteUnwinder_dealloc(RemoteUnwinderObject *self) +{ + PyTypeObject *tp = Py_TYPE(self); + if (self->code_object_cache) { + _Py_hashtable_destroy(self->code_object_cache); + } +#ifdef Py_GIL_DISABLED + if (self->tlbc_cache) { + 
_Py_hashtable_destroy(self->tlbc_cache); + } +#endif + if (self->handle.pid != 0) { + _Py_RemoteDebug_ClearCache(&self->handle); + _Py_RemoteDebug_CleanupProcHandle(&self->handle); + } + PyObject_Del(self); + Py_DECREF(tp); +} + +static PyType_Slot RemoteUnwinder_slots[] = { + {Py_tp_doc, (void *)"RemoteUnwinder(pid): Inspect stack of a remote Python process."}, + {Py_tp_methods, RemoteUnwinder_methods}, + {Py_tp_init, _remote_debugging_RemoteUnwinder___init__}, + {Py_tp_dealloc, RemoteUnwinder_dealloc}, + {0, NULL} }; -PyMODINIT_FUNC -PyInit__remote_debugging(void) +static PyType_Spec RemoteUnwinder_spec = { + .name = "_remote_debugging.RemoteUnwinder", + .basicsize = sizeof(RemoteUnwinderObject), + .flags = Py_TPFLAGS_DEFAULT, + .slots = RemoteUnwinder_slots, +}; + +/* ============================================================================ + * MODULE INITIALIZATION + * ============================================================================ */ + +static int +_remote_debugging_exec(PyObject *m) { - PyObject* mod = PyModule_Create(&module); - if (mod == NULL) { - return NULL; + RemoteDebuggingState *st = RemoteDebugging_GetState(m); +#define CREATE_TYPE(mod, type, spec) \ + do { \ + type = (PyTypeObject *)PyType_FromMetaclass(NULL, mod, spec, NULL); \ + if (type == NULL) { \ + return -1; \ + } \ + } while (0) + + CREATE_TYPE(m, st->RemoteDebugging_Type, &RemoteUnwinder_spec); + + if (PyModule_AddType(m, st->RemoteDebugging_Type) < 0) { + return -1; } #ifdef Py_GIL_DISABLED - PyUnstable_Module_SetGIL(mod, Py_MOD_GIL_NOT_USED); + PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); #endif - int rc = PyModule_AddIntConstant( - mod, "PROCESS_VM_READV_SUPPORTED", HAVE_PROCESS_VM_READV); + int rc = PyModule_AddIntConstant(m, "PROCESS_VM_READV_SUPPORTED", HAVE_PROCESS_VM_READV); if (rc < 0) { - Py_DECREF(mod); - return NULL; + return -1; } - return mod; + if (RemoteDebugging_InitState(st) < 0) { + return -1; + } + return 0; } + +static int +remote_debugging_traverse(PyObject *mod, visitproc visit, void *arg) +{ + RemoteDebuggingState *state = RemoteDebugging_GetState(mod); + Py_VISIT(state->RemoteDebugging_Type); + return 0; +} + +static int +remote_debugging_clear(PyObject *mod) +{ + RemoteDebuggingState *state = RemoteDebugging_GetState(mod); + Py_CLEAR(state->RemoteDebugging_Type); + return 0; +} + +static void +remote_debugging_free(void *mod) +{ + (void)remote_debugging_clear((PyObject *)mod); +} + +static PyModuleDef_Slot remote_debugging_slots[] = { + {Py_mod_exec, _remote_debugging_exec}, + {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, + {0, NULL}, +}; + +static PyMethodDef remote_debugging_methods[] = { + {NULL, NULL, 0, NULL}, +}; + +static struct PyModuleDef remote_debugging_module = { + PyModuleDef_HEAD_INIT, + .m_name = "_remote_debugging", + .m_size = sizeof(RemoteDebuggingState), + .m_methods = remote_debugging_methods, + .m_slots = remote_debugging_slots, + .m_traverse = remote_debugging_traverse, + .m_clear = remote_debugging_clear, + .m_free = remote_debugging_free, +}; + +PyMODINIT_FUNC +PyInit__remote_debugging(void) +{ + return PyModuleDef_Init(&remote_debugging_module); +} + diff --git a/Modules/clinic/_remote_debugging_module.c.h b/Modules/clinic/_remote_debugging_module.c.h new file mode 100644 index 00000000000000..e83e2fd7fd2b5b --- /dev/null +++ b/Modules/clinic/_remote_debugging_module.c.h @@ -0,0 +1,243 @@ +/*[clinic input] +preserve +[clinic start generated code]*/ + +#if defined(Py_BUILD_CORE) && 
!defined(Py_BUILD_CORE_MODULE) +# include "pycore_gc.h" // PyGC_Head +# include "pycore_runtime.h" // _Py_ID() +#endif +#include "pycore_critical_section.h"// Py_BEGIN_CRITICAL_SECTION() +#include "pycore_modsupport.h" // _PyArg_UnpackKeywords() + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, +"RemoteUnwinder(pid, *, all_threads=False)\n" +"--\n" +"\n" +"Initialize a new RemoteUnwinder object for debugging a remote Python process.\n" +"\n" +"Args:\n" +" pid: Process ID of the target Python process to debug\n" +" all_threads: If True, initialize state for all threads in the process.\n" +" If False, only initialize for the main thread.\n" +"\n" +"The RemoteUnwinder provides functionality to inspect and debug a running Python\n" +"process, including examining thread states, stack frames and other runtime data.\n" +"\n" +"Raises:\n" +" PermissionError: If access to the target process is denied\n" +" OSError: If unable to attach to the target process or access its memory\n" +" RuntimeError: If unable to read debug information from the target process"); + +static int +_remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, + int pid, int all_threads); + +static int +_remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObject *kwargs) +{ + int return_value = -1; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"pid", "all_threads", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "RemoteUnwinder", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject * const *fastargs; + Py_ssize_t nargs = PyTuple_GET_SIZE(args); + Py_ssize_t noptargs = nargs + (kwargs ? 
PyDict_GET_SIZE(kwargs) : 0) - 1; + int pid; + int all_threads = 0; + + fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!fastargs) { + goto exit; + } + pid = PyLong_AsInt(fastargs[0]); + if (pid == -1 && PyErr_Occurred()) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + all_threads = PyObject_IsTrue(fastargs[1]); + if (all_threads < 0) { + goto exit; + } +skip_optional_kwonly: + return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stack_trace__doc__, +"get_stack_trace($self, /)\n" +"--\n" +"\n" +"Returns a list of stack traces for all threads in the target process.\n" +"\n" +"Each element in the returned list is a tuple of (thread_id, frame_list), where:\n" +"- thread_id is the OS thread identifier\n" +"- frame_list is a list of tuples (function_name, filename, line_number) representing\n" +" the Python stack frames for that thread, ordered from most recent to oldest\n" +"\n" +"Example:\n" +" [\n" +" (1234, [\n" +" (\'process_data\', \'worker.py\', 127),\n" +" (\'run_worker\', \'worker.py\', 45),\n" +" (\'main\', \'app.py\', 23)\n" +" ]),\n" +" (1235, [\n" +" (\'handle_request\', \'server.py\', 89),\n" +" (\'serve_forever\', \'server.py\', 52)\n" +" ])\n" +" ]\n" +"\n" +"Raises:\n" +" RuntimeError: If there is an error copying memory from the target process\n" +" OSError: If there is an error accessing the target process\n" +" PermissionError: If access to the target process is denied\n" +" UnicodeDecodeError: If there is an error decoding strings from the target process"); + +#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF \ + {"get_stack_trace", (PyCFunction)_remote_debugging_RemoteUnwinder_get_stack_trace, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_stack_trace__doc__}, + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self); + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stack_trace(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_RemoteUnwinder_get_stack_trace_impl((RemoteUnwinderObject *)self); + Py_END_CRITICAL_SECTION(); + + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_all_awaited_by__doc__, +"get_all_awaited_by($self, /)\n" +"--\n" +"\n" +"Get all tasks and their awaited_by relationships from the remote process.\n" +"\n" +"This provides a tree structure showing which tasks are waiting for other tasks.\n" +"\n" +"For each task, returns:\n" +"1. The call stack frames leading to where the task is currently executing\n" +"2. The name of the task\n" +"3. 
A list of tasks that this task is waiting for, with their own frames/names/etc\n" +"\n" +"Returns a list of [frames, task_name, subtasks] where:\n" +"- frames: List of (func_name, filename, lineno) showing the call stack\n" +"- task_name: String identifier for the task\n" +"- subtasks: List of tasks being awaited by this task, in same format\n" +"\n" +"Raises:\n" +" RuntimeError: If AsyncioDebug section is not available in the remote process\n" +" MemoryError: If memory allocation fails\n" +" OSError: If reading from the remote process fails\n" +"\n" +"Example output:\n" +"[\n" +" [\n" +" [(\"c5\", \"script.py\", 10), (\"c4\", \"script.py\", 14)],\n" +" \"c2_root\",\n" +" [\n" +" [\n" +" [(\"c1\", \"script.py\", 23)],\n" +" \"sub_main_2\",\n" +" [...]\n" +" ],\n" +" [...]\n" +" ]\n" +" ]\n" +"]"); + +#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF \ + {"get_all_awaited_by", (PyCFunction)_remote_debugging_RemoteUnwinder_get_all_awaited_by, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_all_awaited_by__doc__}, + +static PyObject * +_remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *self); + +static PyObject * +_remote_debugging_RemoteUnwinder_get_all_awaited_by(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl((RemoteUnwinderObject *)self); + Py_END_CRITICAL_SECTION(); + + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_async_stack_trace__doc__, +"get_async_stack_trace($self, /)\n" +"--\n" +"\n" +"Returns information about the currently running async task and its stack trace.\n" +"\n" +"Returns a tuple of (task_info, stack_frames) where:\n" +"- task_info is a tuple of (task_id, task_name) identifying the task\n" +"- stack_frames is a list of tuples (function_name, filename, line_number) representing\n" +" the Python stack frames for the task, ordered from most recent to oldest\n" +"\n" +"Example:\n" +" ((4345585712, \'Task-1\'), [\n" +" (\'run_echo_server\', \'server.py\', 127),\n" +" (\'serve_forever\', \'server.py\', 45),\n" +" (\'main\', \'app.py\', 23)\n" +" ])\n" +"\n" +"Raises:\n" +" RuntimeError: If AsyncioDebug section is not available in the target process\n" +" RuntimeError: If there is an error copying memory from the target process\n" +" OSError: If there is an error accessing the target process\n" +" PermissionError: If access to the target process is denied\n" +" UnicodeDecodeError: If there is an error decoding strings from the target process"); + +#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF \ + {"get_async_stack_trace", (PyCFunction)_remote_debugging_RemoteUnwinder_get_async_stack_trace, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_async_stack_trace__doc__}, + +static PyObject * +_remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject *self); + +static PyObject * +_remote_debugging_RemoteUnwinder_get_async_stack_trace(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl((RemoteUnwinderObject *)self); + Py_END_CRITICAL_SECTION(); + + return return_value; +} +/*[clinic end generated code: output=654772085f1f4bf6 input=a9049054013a1b77]*/ diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 4f06a36a130207..ee869d991d93cd 100644 --- 
a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -2364,6 +2364,8 @@ free_monitoring_data(_PyCoMonitoringData *data)
 static void
 code_dealloc(PyObject *self)
 {
+    PyThreadState *tstate = PyThreadState_GET();
+    _Py_atomic_add_uint64(&tstate->interp->_code_object_generation, 1);
     PyCodeObject *co = _PyCodeObject_CAST(self);
     _PyObject_ResurrectStart(self);
     notify_code_watchers(PY_CODE_EVENT_DESTROY, co);
diff --git a/Python/index_pool.c b/Python/index_pool.c
index 007c81a0fc16ec..520a65938ec6c7 100644
--- a/Python/index_pool.c
+++ b/Python/index_pool.c
@@ -172,6 +172,9 @@ _PyIndexPool_AllocIndex(_PyIndexPool *pool)
     else {
         index = heap_pop(free_indices);
     }
+
+    pool->tlbc_generation++;
+
     UNLOCK_POOL(pool);
     return index;
 }
@@ -180,6 +183,7 @@ void
 _PyIndexPool_FreeIndex(_PyIndexPool *pool, int32_t index)
 {
     LOCK_POOL(pool);
+    pool->tlbc_generation++;
     heap_add(&pool->free_indices, index);
     UNLOCK_POOL(pool);
 }
diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c
index 1211e0e9f112b7..5c7cb5b0a9913c 100644
--- a/Python/perf_jit_trampoline.c
+++ b/Python/perf_jit_trampoline.c
@@ -1,241 +1,354 @@
+/*
+ * Python Perf Trampoline Support - JIT Dump Implementation
+ *
+ * This file implements the perf jitdump API for Python's performance profiling
+ * integration. It allows perf (Linux performance analysis tool) to understand
+ * and profile dynamically generated Python bytecode by creating JIT dump files
+ * that perf can inject into its analysis.
+ *
+ *
+ * IMPORTANT: This file exports specific callback functions that are part of
+ * Python's internal API. Do not modify the function signatures or behavior
+ * of exported functions without coordinating with the Python core team.
+ *
+ * Usually the binary and libraries are mapped in separate regions like below:
+ *
+ *   address ->
+ *    --+---------------------+--//--+---------------------+--
+ *      | .text | .data | ... |      | .text | .data | ... |
+ *    --+---------------------+--//--+---------------------+--
+ *      myprog                       libc.so
+ *
+ * So it'd be easy and straightforward to find a mapped binary or library from an
+ * address.
+ *
+ * But for JIT code, the code arena only cares about the code section. But the
+ * resulting DSOs (which are generated by perf inject -j) contain ELF headers and
+ * unwind info too. Then it'd generate the following address space with synthesized
+ * MMAP events. Let's say it has a sample between address B and C.
+ *
+ *                                          sample
+ *                                            |
+ *   address ->       A       B               v               C
+ *   ---------------------------------------------------------------------------------------------------
+ *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
+ *   /tmp/jitted-PID-1.so          | (headers) | .text | unwind info |
+ *   /tmp/jitted-PID-2.so                 | (headers) | .text | unwind info |
+ *   ...
+ *   ---------------------------------------------------------------------------------------------------
+ *
+ * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
+ * the unwind info. If it maps both the .text section and unwind sections, the sample
+ * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
+ * which one is right. So to make perf happy we have non-overlapping ranges for each
+ * DSO:
+ *
+ *   address ->
+ *   -------------------------------------------------------------------------------------------------------
+ *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
+ *   /tmp/jitted-PID-1.so                                | (headers) | .text | unwind info |
+ *   /tmp/jitted-PID-2.so                                                             | (headers) | .text | unwind info |
+ *   ...
+ *   -------------------------------------------------------------------------------------------------------
+ *
+ * As the trampolines are constant, we add constant padding, but in general the padding needs to have the
+ * size of the unwind info rounded to 16 bytes. For our trampolines this is 0x50
+ */
+
+
+
 #include "Python.h"
 #include "pycore_ceval.h"         // _PyPerf_Callbacks
 #include "pycore_frame.h"
 #include "pycore_interp.h"
 #include "pycore_runtime.h"       // _PyRuntime
-
 #ifdef PY_HAVE_PERF_TRAMPOLINE
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>       // mmap()
-#include <sys/types.h>
-#include <unistd.h>         // sysconf()
-#include <sys/time.h>       // gettimeofday()
-#include <sys/syscall.h>
-
-// ----------------------------------
-// Perf jitdump API
-// ----------------------------------
-
-typedef struct {
-    FILE* perf_map;
-    PyThread_type_lock map_lock;
-    void* mapped_buffer;
-    size_t mapped_size;
-    int code_id;
-} PerfMapJitState;
-
-static PerfMapJitState perf_jit_map_state;
+/* Standard library includes for perf jitdump implementation */
+#include <elf.h>            // ELF architecture constants
+#include <fcntl.h>          // File control operations
+#include <stdio.h>          // Standard I/O operations
+#include <stdlib.h>         // Standard library functions
+#include <sys/mman.h>       // Memory mapping functions (mmap)
+#include <sys/types.h>      // System data types
+#include <unistd.h>         // System calls (sysconf, getpid)
+#include <sys/time.h>       // Time functions (gettimeofday)
+#include <sys/syscall.h>    // System call interface
+
+// =============================================================================
+// CONSTANTS AND CONFIGURATION
+// =============================================================================
 /*
-Usually the binary and libraries are mapped in separate region like below:
-
-  address ->
-   --+---------------------+--//--+---------------------+--
-     | .text | .data | ... |      | .text | .data | ... |
-   --+---------------------+--//--+---------------------+--
-     myprog                       libc.so
-
-So it'd be easy and straight-forward to find a mapped binary or library from an
-address.
-
-But for JIT code, the code arena only cares about the code section. But the
-resulting DSOs (which is generated by perf inject -j) contain ELF headers and
-unwind info too. Then it'd generate following address space with synthesized
-MMAP events. Let's say it has a sample between address B and C.
-
-                                         sample
-                                           |
-  address ->       A       B              v               C
-  ---------------------------------------------------------------------------------------------------
-  /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
-  /tmp/jitted-PID-1.so          | (headers) | .text | unwind info |
-  /tmp/jitted-PID-2.so                 | (headers) | .text | unwind info |
-  ...
-  ---------------------------------------------------------------------------------------------------
-
-If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
-the unwind info. If it maps both .text section and unwind sections, the sample
-could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
-which one is right. So to make perf happy we have non-overlapping ranges for each
-DSO:
-
-  address ->
-  -------------------------------------------------------------------------------------------------------
-  /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
-  /tmp/jitted-PID-1.so                               | (headers) | .text | unwind info |
-  /tmp/jitted-PID-2.so                                                            | (headers) | .text | unwind info |
-  ...
-  -------------------------------------------------------------------------------------------------------
-
-As the trampolines are constant, we add a constant padding but in general the padding needs to have the
-size of the unwind info rounded to 16 bytes.
In general, for our trampolines this is 0x50 + * Memory layout considerations for perf jitdump: + * + * Perf expects non-overlapping memory regions for each JIT-compiled function. + * When perf processes the jitdump file, it creates synthetic DSO (Dynamic + * Shared Object) files that contain: + * - ELF headers + * - .text section (actual machine code) + * - Unwind information (for stack traces) + * + * To ensure proper address space layout, we add padding between code regions. + * This prevents address conflicts when perf maps the synthesized DSOs. + * + * Memory layout example: + * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding] + * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding] + * + * The padding size (0x100) is chosen to accommodate typical unwind info sizes + * while maintaining 16-byte alignment requirements. */ - #define PERF_JIT_CODE_PADDING 0x100 -#define trampoline_api _PyRuntime.ceval.perf.trampoline_api - -typedef uint64_t uword; -typedef const char* CodeComments; -#define Pd "d" -#define MB (1024 * 1024) - -#define EM_386 3 -#define EM_X86_64 62 -#define EM_ARM 40 -#define EM_AARCH64 183 -#define EM_RISCV 243 +/* Convenient access to the global trampoline API state */ +#define trampoline_api _PyRuntime.ceval.perf.trampoline_api -#define TARGET_ARCH_IA32 0 -#define TARGET_ARCH_X64 0 -#define TARGET_ARCH_ARM 0 -#define TARGET_ARCH_ARM64 0 -#define TARGET_ARCH_RISCV32 0 -#define TARGET_ARCH_RISCV64 0 +/* Type aliases for clarity and portability */ +typedef uint64_t uword; // Word-sized unsigned integer +typedef const char* CodeComments; // Code comment strings -#define FLAG_generate_perf_jitdump 0 -#define FLAG_write_protect_code 0 -#define FLAG_write_protect_vm_isolate 0 -#define FLAG_code_comments 0 +/* Memory size constants */ +#define MB (1024 * 1024) // 1 Megabyte for buffer sizing -#define UNREACHABLE() +// ============================================================================= +// ARCHITECTURE-SPECIFIC DEFINITIONS +// ============================================================================= -static uword GetElfMachineArchitecture(void) { -#if TARGET_ARCH_IA32 - return EM_386; -#elif TARGET_ARCH_X64 +/* + * Returns the ELF machine architecture constant for the current platform. + * This is required for the jitdump header to correctly identify the target + * architecture for perf processing. + * + */ +static uint64_t GetElfMachineArchitecture(void) { +#if defined(__x86_64__) || defined(_M_X64) return EM_X86_64; -#elif TARGET_ARCH_ARM - return EM_ARM; -#elif TARGET_ARCH_ARM64 +#elif defined(__i386__) || defined(_M_IX86) + return EM_386; +#elif defined(__aarch64__) return EM_AARCH64; -#elif TARGET_ARCH_RISCV32 || TARGET_ARCH_RISCV64 +#elif defined(__arm__) || defined(_M_ARM) + return EM_ARM; +#elif defined(__riscv) return EM_RISCV; #else - UNREACHABLE(); + Py_UNREACHABLE(); // Unsupported architecture - should never reach here return 0; #endif } +// ============================================================================= +// PERF JITDUMP DATA STRUCTURES +// ============================================================================= + +/* + * Perf jitdump file format structures + * + * These structures define the binary format that perf expects for JIT dump files. + * The format is documented in the Linux perf tools source code and must match + * exactly for proper perf integration. 
+ */ + +/* + * Jitdump file header - written once at the beginning of each jitdump file + * Contains metadata about the process and jitdump format version + */ typedef struct { - uint32_t magic; - uint32_t version; - uint32_t size; - uint32_t elf_mach_target; - uint32_t reserved; - uint32_t process_id; - uint64_t time_stamp; - uint64_t flags; + uint32_t magic; // Magic number (0x4A695444 = "JiTD") + uint32_t version; // Jitdump format version (currently 1) + uint32_t size; // Size of this header structure + uint32_t elf_mach_target; // Target architecture (from GetElfMachineArchitecture) + uint32_t reserved; // Reserved field (must be 0) + uint32_t process_id; // Process ID of the JIT compiler + uint64_t time_stamp; // Timestamp when jitdump was created + uint64_t flags; // Feature flags (currently unused) } Header; - enum PerfEvent { - PerfLoad = 0, - PerfMove = 1, - PerfDebugInfo = 2, - PerfClose = 3, - PerfUnwindingInfo = 4 +/* + * Perf event types supported by the jitdump format + * Each event type has a corresponding structure format + */ +enum PerfEvent { + PerfLoad = 0, // Code load event (new JIT function) + PerfMove = 1, // Code move event (function relocated) + PerfDebugInfo = 2, // Debug information event + PerfClose = 3, // JIT session close event + PerfUnwindingInfo = 4 // Stack unwinding information event }; +/* + * Base event structure - common header for all perf events + * Every event in the jitdump file starts with this structure + */ struct BaseEvent { - uint32_t event; - uint32_t size; - uint64_t time_stamp; - }; + uint32_t event; // Event type (from PerfEvent enum) + uint32_t size; // Total size of this event including payload + uint64_t time_stamp; // Timestamp when event occurred +}; +/* + * Code load event - indicates a new JIT-compiled function is available + * This is the most important event type for Python profiling + */ typedef struct { - struct BaseEvent base; - uint32_t process_id; - uint32_t thread_id; - uint64_t vma; - uint64_t code_address; - uint64_t code_size; - uint64_t code_id; + struct BaseEvent base; // Common event header + uint32_t process_id; // Process ID where code was generated + uint32_t thread_id; // Thread ID where code was generated + uint64_t vma; // Virtual memory address where code is loaded + uint64_t code_address; // Address of the actual machine code + uint64_t code_size; // Size of the machine code in bytes + uint64_t code_id; // Unique identifier for this code region + /* Followed by: + * - null-terminated function name string + * - raw machine code bytes + */ } CodeLoadEvent; +/* + * Code unwinding information event - provides DWARF data for stack traces + * Essential for proper stack unwinding during profiling + */ typedef struct { - struct BaseEvent base; - uint64_t unwind_data_size; - uint64_t eh_frame_hdr_size; - uint64_t mapped_size; + struct BaseEvent base; // Common event header + uint64_t unwind_data_size; // Size of the unwinding data + uint64_t eh_frame_hdr_size; // Size of the EH frame header + uint64_t mapped_size; // Total mapped size (with padding) + /* Followed by: + * - EH frame header + * - DWARF unwinding information + * - Padding to alignment boundary + */ } CodeUnwindingInfoEvent; -static const intptr_t nanoseconds_per_second = 1000000000; - -// Dwarf encoding constants +// ============================================================================= +// GLOBAL STATE MANAGEMENT +// ============================================================================= -static const uint8_t DwarfUData4 = 0x03; -static 
const uint8_t DwarfSData4 = 0x0b; -static const uint8_t DwarfPcRel = 0x10; -static const uint8_t DwarfDataRel = 0x30; -// static uint8_t DwarfOmit = 0xff; +/* + * Global state for the perf jitdump implementation + * + * This structure maintains all the state needed for generating jitdump files. + * It's designed as a singleton since there's typically only one jitdump file + * per Python process. + */ typedef struct { - unsigned char version; - unsigned char eh_frame_ptr_enc; - unsigned char fde_count_enc; - unsigned char table_enc; - int32_t eh_frame_ptr; - int32_t eh_fde_count; - int32_t from; - int32_t to; -} EhFrameHeader; + FILE* perf_map; // File handle for the jitdump file + PyThread_type_lock map_lock; // Thread synchronization lock + void* mapped_buffer; // Memory-mapped region (signals perf we're active) + size_t mapped_size; // Size of the mapped region + int code_id; // Counter for unique code region identifiers +} PerfMapJitState; + +/* Global singleton instance */ +static PerfMapJitState perf_jit_map_state; + +// ============================================================================= +// TIME UTILITIES +// ============================================================================= +/* Time conversion constant */ +static const intptr_t nanoseconds_per_second = 1000000000; + +/* + * Get current monotonic time in nanoseconds + * + * Monotonic time is preferred for event timestamps because it's not affected + * by system clock adjustments. This ensures consistent timing relationships + * between events even if the system clock is changed. + * + * Returns: Current monotonic time in nanoseconds since an arbitrary epoch + */ static int64_t get_current_monotonic_ticks(void) { struct timespec ts; if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { - UNREACHABLE(); + Py_UNREACHABLE(); // Should never fail on supported systems return 0; } - // Convert to nanoseconds. + + /* Convert to nanoseconds for maximum precision */ int64_t result = ts.tv_sec; result *= nanoseconds_per_second; result += ts.tv_nsec; return result; } +/* + * Get current wall clock time in microseconds + * + * Used for the jitdump file header timestamp. Unlike monotonic time, + * this represents actual wall clock time that can be correlated with + * other system events. + * + * Returns: Current time in microseconds since Unix epoch + */ static int64_t get_current_time_microseconds(void) { - // gettimeofday has microsecond resolution. - struct timeval tv; - if (gettimeofday(&tv, NULL) < 0) { - UNREACHABLE(); - return 0; - } - return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; + struct timeval tv; + if (gettimeofday(&tv, NULL) < 0) { + Py_UNREACHABLE(); // Should never fail on supported systems + return 0; + } + return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; } +// ============================================================================= +// UTILITY FUNCTIONS +// ============================================================================= +/* + * Round up a value to the next multiple of a given number + * + * This is essential for maintaining proper alignment requirements in the + * jitdump format. Many structures need to be aligned to specific boundaries + * (typically 8 or 16 bytes) for efficient processing by perf. 
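+ *
+ * For example, round_up(17, 8) returns 24, while round_up(16, 8) returns 16
+ * because the value is already a multiple of 8.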
+ * + * Args: + * value: The value to round up + * multiple: The multiple to round up to + * + * Returns: The smallest value >= input that is a multiple of 'multiple' + */ static size_t round_up(int64_t value, int64_t multiple) { if (multiple == 0) { - // Avoid division by zero - return value; + return value; // Avoid division by zero } int64_t remainder = value % multiple; if (remainder == 0) { - // Value is already a multiple of 'multiple' - return value; + return value; // Already aligned } - // Calculate the difference to the next multiple + /* Calculate how much to add to reach the next multiple */ int64_t difference = multiple - remainder; - - // Add the difference to the value int64_t rounded_up_value = value + difference; return rounded_up_value; } +// ============================================================================= +// FILE I/O UTILITIES +// ============================================================================= +/* + * Write data to the jitdump file with error handling + * + * This function ensures that all data is written to the file, handling + * partial writes that can occur with large buffers or when the system + * is under load. + * + * Args: + * buffer: Pointer to data to write + * size: Number of bytes to write + */ static void perf_map_jit_write_fully(const void* buffer, size_t size) { FILE* out_file = perf_jit_map_state.perf_map; const char* ptr = (const char*)(buffer); + while (size > 0) { const size_t written = fwrite(ptr, 1, size, out_file); if (written == 0) { - UNREACHABLE(); + Py_UNREACHABLE(); // Write failure - should be very rare break; } size -= written; @@ -243,284 +356,720 @@ static void perf_map_jit_write_fully(const void* buffer, size_t size) { } } +/* + * Write the jitdump file header + * + * The header must be written exactly once at the beginning of each jitdump + * file. It provides metadata that perf uses to parse the rest of the file. + * + * Args: + * pid: Process ID to include in the header + * out_file: File handle to write to (currently unused, uses global state) + */ static void perf_map_jit_write_header(int pid, FILE* out_file) { Header header; - header.magic = 0x4A695444; - header.version = 1; - header.size = sizeof(Header); - header.elf_mach_target = GetElfMachineArchitecture(); - header.process_id = pid; - header.time_stamp = get_current_time_microseconds(); - header.flags = 0; - perf_map_jit_write_fully(&header, sizeof(header)); -} -static void* perf_map_jit_init(void) { - char filename[100]; - int pid = getpid(); - snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); - const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); - if (fd == -1) { - return NULL; - } + /* Initialize header with required values */ + header.magic = 0x4A695444; // "JiTD" magic number + header.version = 1; // Current jitdump version + header.size = sizeof(Header); // Header size for validation + header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture + header.process_id = pid; // Process identifier + header.time_stamp = get_current_time_microseconds(); // Creation time + header.flags = 0; // No special flags currently used - const long page_size = sysconf(_SC_PAGESIZE); // NOLINT(runtime/int) - if (page_size == -1) { - close(fd); - return NULL; - } - - // The perf jit interface forces us to map the first page of the file - // to signal that we are using the interface. 
- perf_jit_map_state.mapped_buffer = mmap(NULL, page_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0); - if (perf_jit_map_state.mapped_buffer == NULL) { - close(fd); - return NULL; - } - perf_jit_map_state.mapped_size = page_size; - perf_jit_map_state.perf_map = fdopen(fd, "w+"); - if (perf_jit_map_state.perf_map == NULL) { - close(fd); - return NULL; - } - setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); - perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); - - perf_jit_map_state.map_lock = PyThread_allocate_lock(); - if (perf_jit_map_state.map_lock == NULL) { - fclose(perf_jit_map_state.perf_map); - return NULL; - } - perf_jit_map_state.code_id = 0; - - trampoline_api.code_padding = PERF_JIT_CODE_PADDING; - return &perf_jit_map_state; + perf_map_jit_write_fully(&header, sizeof(header)); } -/* DWARF definitions. */ +// ============================================================================= +// DWARF CONSTANTS AND UTILITIES +// ============================================================================= + +/* + * DWARF (Debug With Arbitrary Record Formats) constants + * + * DWARF is a debugging data format used to provide stack unwinding information. + * These constants define the various encoding types and opcodes used in + * DWARF Call Frame Information (CFI) records. + */ +/* DWARF Call Frame Information version */ #define DWRF_CIE_VERSION 1 +/* DWARF CFA (Call Frame Address) opcodes */ enum { - DWRF_CFA_nop = 0x0, - DWRF_CFA_offset_extended = 0x5, - DWRF_CFA_def_cfa = 0xc, - DWRF_CFA_def_cfa_offset = 0xe, - DWRF_CFA_offset_extended_sf = 0x11, - DWRF_CFA_advance_loc = 0x40, - DWRF_CFA_offset = 0x80 + DWRF_CFA_nop = 0x0, // No operation + DWRF_CFA_offset_extended = 0x5, // Extended offset instruction + DWRF_CFA_def_cfa = 0xc, // Define CFA rule + DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset + DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset + DWRF_CFA_advance_loc = 0x40, // Advance location counter + DWRF_CFA_offset = 0x80 // Simple offset instruction }; -enum - { - DWRF_EH_PE_absptr = 0x00, - DWRF_EH_PE_omit = 0xff, - - /* FDE data encoding. */ - DWRF_EH_PE_uleb128 = 0x01, - DWRF_EH_PE_udata2 = 0x02, - DWRF_EH_PE_udata4 = 0x03, - DWRF_EH_PE_udata8 = 0x04, - DWRF_EH_PE_sleb128 = 0x09, - DWRF_EH_PE_sdata2 = 0x0a, - DWRF_EH_PE_sdata4 = 0x0b, - DWRF_EH_PE_sdata8 = 0x0c, - DWRF_EH_PE_signed = 0x08, - - /* FDE flags. 
*/ - DWRF_EH_PE_pcrel = 0x10, - DWRF_EH_PE_textrel = 0x20, - DWRF_EH_PE_datarel = 0x30, - DWRF_EH_PE_funcrel = 0x40, - DWRF_EH_PE_aligned = 0x50, - - DWRF_EH_PE_indirect = 0x80 - }; +/* DWARF Exception Handling pointer encodings */ +enum { + DWRF_EH_PE_absptr = 0x00, // Absolute pointer + DWRF_EH_PE_omit = 0xff, // Omitted value + + /* Data type encodings */ + DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 + DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte + DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte + DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte + DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 + DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte + DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte + DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte + DWRF_EH_PE_signed = 0x08, // Signed flag + + /* Reference type encodings */ + DWRF_EH_PE_pcrel = 0x10, // PC-relative + DWRF_EH_PE_textrel = 0x20, // Text-relative + DWRF_EH_PE_datarel = 0x30, // Data-relative + DWRF_EH_PE_funcrel = 0x40, // Function-relative + DWRF_EH_PE_aligned = 0x50, // Aligned + DWRF_EH_PE_indirect = 0x80 // Indirect +}; +/* Additional DWARF constants for debug information */ enum { DWRF_TAG_compile_unit = 0x11 }; - enum { DWRF_children_no = 0, DWRF_children_yes = 1 }; +enum { + DWRF_AT_name = 0x03, // Name attribute + DWRF_AT_stmt_list = 0x10, // Statement list + DWRF_AT_low_pc = 0x11, // Low PC address + DWRF_AT_high_pc = 0x12 // High PC address +}; +enum { + DWRF_FORM_addr = 0x01, // Address form + DWRF_FORM_data4 = 0x06, // 4-byte data + DWRF_FORM_string = 0x08 // String form +}; -enum { DWRF_AT_name = 0x03, DWRF_AT_stmt_list = 0x10, DWRF_AT_low_pc = 0x11, DWRF_AT_high_pc = 0x12 }; - -enum { DWRF_FORM_addr = 0x01, DWRF_FORM_data4 = 0x06, DWRF_FORM_string = 0x08 }; - -enum { DWRF_LNS_extended_op = 0, DWRF_LNS_copy = 1, DWRF_LNS_advance_pc = 2, DWRF_LNS_advance_line = 3 }; +/* Line number program opcodes */ +enum { + DWRF_LNS_extended_op = 0, // Extended opcode + DWRF_LNS_copy = 1, // Copy operation + DWRF_LNS_advance_pc = 2, // Advance program counter + DWRF_LNS_advance_line = 3 // Advance line number +}; -enum { DWRF_LNE_end_sequence = 1, DWRF_LNE_set_address = 2 }; +/* Line number extended opcodes */ +enum { + DWRF_LNE_end_sequence = 1, // End of sequence + DWRF_LNE_set_address = 2 // Set address +}; +/* + * Architecture-specific DWARF register numbers + * + * These constants define the register numbering scheme used by DWARF + * for each supported architecture. The numbers must match the ABI + * specification for proper stack unwinding. + */ enum { #ifdef __x86_64__ - /* Yes, the order is strange, but correct. 
*/ - DWRF_REG_AX, - DWRF_REG_DX, - DWRF_REG_CX, - DWRF_REG_BX, - DWRF_REG_SI, - DWRF_REG_DI, - DWRF_REG_BP, - DWRF_REG_SP, - DWRF_REG_8, - DWRF_REG_9, - DWRF_REG_10, - DWRF_REG_11, - DWRF_REG_12, - DWRF_REG_13, - DWRF_REG_14, - DWRF_REG_15, - DWRF_REG_RA, + /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ + DWRF_REG_AX, // RAX + DWRF_REG_DX, // RDX + DWRF_REG_CX, // RCX + DWRF_REG_BX, // RBX + DWRF_REG_SI, // RSI + DWRF_REG_DI, // RDI + DWRF_REG_BP, // RBP + DWRF_REG_SP, // RSP + DWRF_REG_8, // R8 + DWRF_REG_9, // R9 + DWRF_REG_10, // R10 + DWRF_REG_11, // R11 + DWRF_REG_12, // R12 + DWRF_REG_13, // R13 + DWRF_REG_14, // R14 + DWRF_REG_15, // R15 + DWRF_REG_RA, // Return address (RIP) #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - DWRF_REG_SP = 31, - DWRF_REG_RA = 30, + /* AArch64 register numbering */ + DWRF_REG_FP = 29, // Frame Pointer + DWRF_REG_RA = 30, // Link register (return address) + DWRF_REG_SP = 31, // Stack pointer #else # error "Unsupported target architecture" #endif }; -typedef struct ELFObjectContext -{ - uint8_t* p; /* Pointer to next address in obj.space. */ - uint8_t* startp; /* Pointer to start address in obj.space. */ - uint8_t* eh_frame_p; /* Pointer to start address in obj.space. */ - uint32_t code_size; /* Size of machine code. */ +/* DWARF encoding constants used in EH frame headers */ +static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data +static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data +static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding +static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding + +// ============================================================================= +// ELF OBJECT CONTEXT +// ============================================================================= + +/* + * Context for building ELF/DWARF structures + * + * This structure maintains state while constructing DWARF unwind information. + * It acts as a simple buffer manager with pointers to track current position + * and important landmarks within the buffer. + */ +typedef struct ELFObjectContext { + uint8_t* p; // Current write position in buffer + uint8_t* startp; // Start of buffer (for offset calculations) + uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) + uint32_t code_size; // Size of the code being described } ELFObjectContext; -/* Append a null-terminated string. */ -static uint32_t -elfctx_append_string(ELFObjectContext* ctx, const char* str) -{ +/* + * EH Frame Header structure for DWARF unwinding + * + * This structure provides metadata about the DWARF unwinding information + * that follows. It's required by the perf jitdump format to enable proper + * stack unwinding during profiling. 
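+ *
+ * The *_enc members record how the 32-bit fields below are encoded (see the
+ * Dwarf* encoding constants above); the remaining members locate the EH frame
+ * data and the code range it covers.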
+ */
+typedef struct {
+    unsigned char version;           // EH frame version (always 1)
+    unsigned char eh_frame_ptr_enc;  // Encoding of EH frame pointer
+    unsigned char fde_count_enc;     // Encoding of FDE count
+    unsigned char table_enc;         // Encoding of table entries
+    int32_t eh_frame_ptr;            // Pointer to EH frame data
+    int32_t eh_fde_count;            // Number of FDEs (Frame Description Entries)
+    int32_t from;                    // Start address of code range
+    int32_t to;                      // End address of code range
+} EhFrameHeader;
+
+// =============================================================================
+// DWARF GENERATION UTILITIES
+// =============================================================================
+
+/*
+ * Append a null-terminated string to the ELF context buffer
+ *
+ * Args:
+ *   ctx: ELF object context
+ *   str: String to append (must be null-terminated)
+ *
+ * Returns: Offset from start of buffer where string was written
+ */
+static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) {
     uint8_t* p = ctx->p;
     uint32_t ofs = (uint32_t)(p - ctx->startp);
+
+    /* Copy string including null terminator */
     do {
         *p++ = (uint8_t)*str;
     } while (*str++);
+
     ctx->p = p;
     return ofs;
 }
 
-/* Append a SLEB128 value. */
-static void
-elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v)
-{
+/*
+ * Append a SLEB128 (Signed Little Endian Base 128) value
+ *
+ * SLEB128 is a variable-length encoding used extensively in DWARF.
+ * It efficiently encodes small numbers in fewer bytes.
+ *
+ * Args:
+ *   ctx: ELF object context
+ *   v: Signed value to encode
+ */
+static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) {
     uint8_t* p = ctx->p;
+
+    /* Encode 7 bits at a time, with continuation bit in MSB */
     for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) {
-        *p++ = (uint8_t)((v & 0x7f) | 0x80);
+        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
     }
-    *p++ = (uint8_t)(v & 0x7f);
+    *p++ = (uint8_t)(v & 0x7f);  // Final byte without continuation bit
+
     ctx->p = p;
 }
 
-/* Append a ULEB128 to buffer. */
-static void
-elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v)
-{
+/*
+ * Append a ULEB128 (Unsigned Little Endian Base 128) value
+ *
+ * Similar to SLEB128 but for unsigned values.
+ *
+ * Args:
+ *   ctx: ELF object context
+ *   v: Unsigned value to encode
+ */
+static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
     uint8_t* p = ctx->p;
+
+    /* Encode 7 bits at a time, with continuation bit in MSB */
     for (; v >= 0x80; v >>= 7) {
-        *p++ = (char)((v & 0x7f) | 0x80);
+        *p++ = (char)((v & 0x7f) | 0x80);  // Set continuation bit
     }
-    *p++ = (char)v;
+    *p++ = (char)v;  // Final byte without continuation bit
+
     ctx->p = p;
 }
 
-/* Shortcuts to generate DWARF structures. */
-#define DWRF_U8(x) (*p++ = (x))
-#define DWRF_I8(x) (*(int8_t*)p = (x), p++)
-#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)
-#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)
-#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t))
-#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p)
-#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p)
-#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p)
-#define DWRF_ALIGNNOP(s) \
-    while ((uintptr_t)p & ((s)-1)) { \
-        *p++ = DWRF_CFA_nop; \
+/*
+ * Macros for generating DWARF structures
+ *
+ * These macros provide a convenient way to write various data types
+ * to the DWARF buffer while automatically advancing the pointer.
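+ *
+ * Worked example (illustrative only): DWRF_UV and DWRF_SV delegate to the
+ * LEB128 helpers above, so DWRF_UV(624485) appends the bytes 0xe5 0x8e 0x26:
+ * seven payload bits per byte, least-significant group first, with the high
+ * bit set on every byte except the last.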
+ */ +#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit +#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit +#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit +#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit +#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address +#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 +#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 +#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string + +/* Align to specified boundary with NOP instructions */ +#define DWRF_ALIGNNOP(s) \ + while ((uintptr_t)p & ((s)-1)) { \ + *p++ = DWRF_CFA_nop; \ } -#define DWRF_SECTION(name, stmt) \ - { \ - uint32_t* szp_##name = (uint32_t*)p; \ - p += 4; \ - stmt; \ - *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ + +/* Write a DWARF section with automatic size calculation */ +#define DWRF_SECTION(name, stmt) \ + { \ + uint32_t* szp_##name = (uint32_t*)p; \ + p += 4; \ + stmt; \ + *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ } -/* Initialize .eh_frame section. */ -static void -elf_init_ehframe(ELFObjectContext* ctx) -{ +// ============================================================================= +// DWARF EH FRAME GENERATION +// ============================================================================= + +/* + * Initialize DWARF .eh_frame section for a code region + * + * The .eh_frame section contains Call Frame Information (CFI) that describes + * how to unwind the stack at any point in the code. This is essential for + * proper profiling as it allows perf to generate accurate call graphs. + * + * The function generates two main components: + * 1. CIE (Common Information Entry) - describes calling conventions + * 2. FDE (Frame Description Entry) - describes specific function unwinding + * + * Args: + * ctx: ELF object context containing code size and buffer pointers + */ +static void elf_init_ehframe(ELFObjectContext* ctx) { uint8_t* p = ctx->p; - uint8_t* framep = p; - - /* Emit DWARF EH CIE. */ - DWRF_SECTION(CIE, DWRF_U32(0); /* Offset to CIE itself. */ - DWRF_U8(DWRF_CIE_VERSION); - DWRF_STR("zR"); /* Augmentation. */ - DWRF_UV(1); /* Code alignment factor. */ - DWRF_SV(-(int64_t)sizeof(uintptr_t)); /* Data alignment factor. */ - DWRF_U8(DWRF_REG_RA); /* Return address register. */ - DWRF_UV(1); - DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); /* Augmentation data. */ - DWRF_U8(DWRF_CFA_def_cfa); DWRF_UV(DWRF_REG_SP); DWRF_UV(sizeof(uintptr_t)); - DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); DWRF_UV(1); - DWRF_ALIGNNOP(sizeof(uintptr_t)); + uint8_t* framep = p; // Remember start of frame data + + /* + * DWARF Unwind Table for Trampoline Function + * + * This section defines DWARF Call Frame Information (CFI) using encoded macros + * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function + * preserves and restores registers. This is used by profiling tools (e.g., `perf`) + * and debuggers for stack unwinding in JIT-compiled code. + * + * ------------------------------------------------- + * TO REGENERATE THIS TABLE FROM GCC OBJECTS: + * ------------------------------------------------- + * + * 1. 
Create a trampoline source file (e.g., `trampoline.c`): + * + * #include + * typedef PyObject* (*py_evaluator)(void*, void*, int); + * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { + * return evaluator(ts, f, throwflag); + * } + * + * 2. Compile to an object file with frame pointer preservation: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * 3. Extract DWARF unwind info from the object file: + * + * readelf -w trampoline.o + * + * Example output from `.eh_frame`: + * + * 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * 00000014 FDE cie=00000000 pc=0..14 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. + * + * ---------------------------------- + * HOW TO TRANSLATE TO DWRF_* MACROS: + * ---------------------------------- + * + * After compiling your trampoline with: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * run: + * + * readelf -w trampoline.o + * + * to inspect the generated `.eh_frame` data. You will see two main components: + * + * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. + * 2. An FDE (Frame Description Entry): function-specific unwind instructions. + * + * --------------------- + * Translating the CIE: + * --------------------- + * From `readelf -w`, you might see: + * + * 00000000 0000000000000010 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * Augmentation data: 1b + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * Map this to: + * + * DWRF_SECTION(CIE, + * DWRF_U32(0); // CIE ID (always 0 for CIEs) + * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 + * DWRF_STR("zR"); // Augmentation string "zR" + * DWRF_UV(4); // Code alignment factor = 4 + * DWRF_SV(-8); // Data alignment factor = -8 + * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) + * DWRF_UV(1); // Augmentation data length = 1 + * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers + * + * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa + * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) + * DWRF_UV(0); // Offset = 0 + * + * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary + * ) + * + * Notes: + * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. + * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. 
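+ *   - (Worked example) DW_CFA_advance_loc operands are expressed in units of
+ *     the code alignment factor, so with a factor of 4 the readelf line
+ *     "DW_CFA_advance_loc: 4" becomes DWRF_U8(DWRF_CFA_advance_loc | 1),
+ *     because 4 / 4 = 1.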
+ * + * --------------------- + * Translating the FDE: + * --------------------- + * From `readelf -w`: + * + * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * Map the FDE header and instructions to: + * + * DWRF_SECTION(FDE, + * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) + * DWRF_U32(-0x30); // Initial PC-relative location of the code + * DWRF_U32(ctx->code_size); // Code range covered by this FDE + * DWRF_U8(0); // Augmentation data length (none) + * + * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + * DWRF_UV(16); + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) + * DWRF_UV(2); // At offset 2 * 8 = 16 bytes + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) + * DWRF_UV(1); // At offset 1 * 8 = 8 bytes + * + * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 + * + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + * DWRF_UV(0); + * ) + * + * To regenerate: + * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. + * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as + * the code is in a different address space every time. + * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: + * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) + * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) + * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset + * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) + * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. + * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. + */ + + /* + * Emit DWARF EH CIE (Common Information Entry) + * + * The CIE describes the calling conventions and basic unwinding rules + * that apply to all functions in this compilation unit. + */ + DWRF_SECTION(CIE, + DWRF_U32(0); // CIE ID (0 indicates this is a CIE) + DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) + DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) + DWRF_UV(1); // Code alignment factor + DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) + DWRF_U8(DWRF_REG_RA); // Return address register number + DWRF_UV(1); // Augmentation data length + DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding + + /* Initial CFI instructions - describe default calling convention */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) + DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size + DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved + DWRF_UV(1); // At offset 1 from CFA + + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary ) - ctx->eh_frame_p = p; - - /* Emit DWARF EH FDE. */ - DWRF_SECTION(FDE, DWRF_U32((uint32_t)(p - framep)); /* Offset to CIE. 
 */
-                  DWRF_U32(-0x30);           /* Machine code offset relative to .text. */
-                  DWRF_U32(ctx->code_size);  /* Machine code length. */
-                  DWRF_U8(0);                /* Augmentation data. */
-                  /* Registers saved in CFRAME. */
+    ctx->eh_frame_p = p;  // Remember start of FDE data
+
+    /*
+     * Emit DWARF EH FDE (Frame Description Entry)
+     *
+     * The FDE describes unwinding information specific to this function.
+     * It references the CIE and provides function-specific CFI instructions.
+     */
+    DWRF_SECTION(FDE,
+        DWRF_U32((uint32_t)(p - framep));  // Offset to CIE (backwards reference)
+        DWRF_U32(-0x30);                   // Machine code offset relative to .text
+        DWRF_U32(ctx->code_size);          // Address range covered by this FDE (code length)
+        DWRF_U8(0);                        // Augmentation data length (none)
+
+        /*
+         * Architecture-specific CFI instructions
+         *
+         * These instructions describe how registers are saved and restored
+         * during function calls. Each architecture has different calling
+         * conventions and register usage patterns.
+         */
 #ifdef __x86_64__
-        DWRF_U8(DWRF_CFA_advance_loc | 4);
-        DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
-        DWRF_U8(DWRF_CFA_advance_loc | 6);
-        DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(8);
-        /* Extra registers saved for JIT-compiled code. */
+        /* x86_64 calling convention unwinding rules */
+        DWRF_U8(DWRF_CFA_advance_loc | 4);  // Advance location by 4 bytes
+        DWRF_U8(DWRF_CFA_def_cfa_offset);   // Redefine CFA offset
+        DWRF_UV(16);                        // New offset: SP + 16
+        DWRF_U8(DWRF_CFA_advance_loc | 6);  // Advance location by 6 bytes
+        DWRF_U8(DWRF_CFA_def_cfa_offset);   // Redefine CFA offset
+        DWRF_UV(8);                         // New offset: SP + 8
 #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
-        DWRF_U8(DWRF_CFA_advance_loc | 1);
-        DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
-        DWRF_U8(DWRF_CFA_offset | 29); DWRF_UV(2);
-        DWRF_U8(DWRF_CFA_offset | 30); DWRF_UV(1);
-        DWRF_U8(DWRF_CFA_advance_loc | 3);
-        DWRF_U8(DWRF_CFA_offset | -(64 - 29));
-        DWRF_U8(DWRF_CFA_offset | -(64 - 30));
-        DWRF_U8(DWRF_CFA_def_cfa_offset);
-        DWRF_UV(0);
+        /* AArch64 calling convention unwinding rules */
+        DWRF_U8(DWRF_CFA_advance_loc | 1);       // Advance location by 1 instruction (stp x29, x30)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);        // Redefine CFA offset
+        DWRF_UV(16);                             // CFA = SP + 16 (stack pointer after push)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);  // Frame pointer (x29) saved
+        DWRF_UV(2);                              // At offset 2 from CFA (2 * 8 = 16 bytes)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);  // Link register (x30) saved
+        DWRF_UV(1);                              // At offset 1 from CFA (1 * 8 = 8 bytes)
+        DWRF_U8(DWRF_CFA_advance_loc | 3);       // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);  // Restore frame pointer (x29)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);  // Restore link register (x30)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);        // Final CFA adjustment
+        DWRF_UV(0);                              // CFA = SP + 0 (stack restored)
+
 #else
 #    error "Unsupported target architecture"
 #endif
-        DWRF_ALIGNNOP(sizeof(uintptr_t));)
-    ctx->p = p;
+        DWRF_ALIGNNOP(sizeof(uintptr_t));  // Align to pointer boundary
+    )
+
+    ctx->p = p;  // Update context pointer to end of generated data
+}
+
+// =============================================================================
+// JITDUMP INITIALIZATION
+// =============================================================================
+
+/*
+ * Initialize the perf jitdump interface
+ *
+ * This function sets up everything needed to generate jitdump files:
+ * 1. Creates the jitdump file with a unique name
+ * 2.
Maps the first page to signal perf that we're using the interface + * 3. Writes the jitdump header + * 4. Initializes synchronization primitives + * + * The memory mapping is crucial - perf detects jitdump files by scanning + * for processes that have mapped files matching the pattern /tmp/jit-*.dump + * + * Returns: Pointer to initialized state, or NULL on failure + */ +static void* perf_map_jit_init(void) { + char filename[100]; + int pid = getpid(); + + /* Create unique filename based on process ID */ + snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); + + /* Create/open the jitdump file with appropriate permissions */ + const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); + if (fd == -1) { + return NULL; // Failed to create file + } + + /* Get system page size for memory mapping */ + const long page_size = sysconf(_SC_PAGESIZE); + if (page_size == -1) { + close(fd); + return NULL; // Failed to get page size + } + + /* + * Map the first page of the jitdump file + * + * This memory mapping serves as a signal to perf that this process + * is generating JIT code. Perf scans /proc/.../maps looking for mapped + * files that match the jitdump naming pattern. + * + * The mapping must be PROT_READ | PROT_EXEC to be detected by perf. + */ + perf_jit_map_state.mapped_buffer = mmap( + NULL, // Let kernel choose address + page_size, // Map one page + PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf) + MAP_PRIVATE, // Private mapping + fd, // File descriptor + 0 // Offset 0 (first page) + ); + + if (perf_jit_map_state.mapped_buffer == NULL) { + close(fd); + return NULL; // Memory mapping failed + } + + perf_jit_map_state.mapped_size = page_size; + + /* Convert file descriptor to FILE* for easier I/O operations */ + perf_jit_map_state.perf_map = fdopen(fd, "w+"); + if (perf_jit_map_state.perf_map == NULL) { + close(fd); + return NULL; // Failed to create FILE* + } + + /* + * Set up file buffering for better performance + * + * We use a large buffer (2MB) because jitdump files can be written + * frequently during program execution. Buffering reduces system call + * overhead and improves overall performance. + */ + setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); + + /* Write the jitdump file header */ + perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); + + /* + * Initialize thread synchronization lock + * + * Multiple threads may attempt to write to the jitdump file + * simultaneously. This lock ensures thread-safe access to the + * global jitdump state. + */ + perf_jit_map_state.map_lock = PyThread_allocate_lock(); + if (perf_jit_map_state.map_lock == NULL) { + fclose(perf_jit_map_state.perf_map); + return NULL; // Failed to create lock + } + + /* Initialize code ID counter */ + perf_jit_map_state.code_id = 0; + + /* Configure trampoline API with padding information */ + trampoline_api.code_padding = PERF_JIT_CODE_PADDING; + + return &perf_jit_map_state; } +// ============================================================================= +// MAIN JITDUMP ENTRY WRITING +// ============================================================================= + +/* + * Write a complete jitdump entry for a Python function + * + * This is the main function called by Python's trampoline system whenever + * a new piece of JIT-compiled code needs to be recorded. It writes both + * the unwinding information and the code load event to the jitdump file. + * + * The function performs these steps: + * 1. 
Initialize jitdump system if not already done + * 2. Extract function name and filename from Python code object + * 3. Generate DWARF unwinding information + * 4. Write unwinding info event to jitdump file + * 5. Write code load event to jitdump file + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * code_addr: Address where the compiled code resides + * code_size: Size of the compiled code in bytes + * co: Python code object containing metadata + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. + */ static void perf_map_jit_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) + unsigned int code_size, PyCodeObject *co) { - + /* Initialize jitdump system on first use */ if (perf_jit_map_state.perf_map == NULL) { void* ret = perf_map_jit_init(); if(ret == NULL){ - return; + return; // Initialization failed, silently abort } } + /* + * Extract function information from Python code object + * + * We create a human-readable function name by combining the qualified + * name (includes class/module context) with the filename. This helps + * developers identify functions in perf reports. + */ const char *entry = ""; if (co->co_qualname != NULL) { entry = PyUnicode_AsUTF8(co->co_qualname); } + const char *filename = ""; if (co->co_filename != NULL) { filename = PyUnicode_AsUTF8(co->co_filename); } - + /* + * Create formatted function name for perf display + * + * Format: "py:::" + * The "py::" prefix helps identify Python functions in mixed-language + * profiles (e.g., when profiling C extensions alongside Python code). + */ size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); if (perf_map_entry == NULL) { - return; + return; // Memory allocation failed } snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); @@ -528,90 +1077,185 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, uword base = (uword)code_addr; uword size = code_size; - // Write the code unwinding info event. - - // Create unwinding information (eh frame) + /* + * Generate DWARF unwinding information + * + * DWARF data is essential for proper stack unwinding during profiling. + * Without it, perf cannot generate accurate call graphs, especially + * in optimized code where frame pointers may be omitted. + */ ELFObjectContext ctx; - char buffer[1024]; + char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) ctx.code_size = code_size; ctx.startp = ctx.p = (uint8_t*)buffer; + + /* Generate EH frame (Exception Handling frame) data */ elf_init_ehframe(&ctx); int eh_frame_size = ctx.p - ctx.startp; - // Populate the unwind info event for perf + /* + * Write Code Unwinding Information Event + * + * This event must be written before the code load event to ensure + * perf has the unwinding information available when it processes + * the code region. 
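+ *
+ * For orientation (as laid out by the writes below): the on-disk record is
+ * the CodeUnwindingInfoEvent header, then the raw eh_frame bytes, then the
+ * EhFrameHeader, then zero padding up to an 8-byte boundary.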
+ */ CodeUnwindingInfoEvent ev2; ev2.base.event = PerfUnwindingInfo; ev2.base.time_stamp = get_current_monotonic_ticks(); ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; - // Ensure we have enough space between DSOs when perf maps them + + /* Verify we don't exceed our padding budget */ assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING); + ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); - ev2.mapped_size = round_up(ev2.unwind_data_size, 16); + ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment + + /* Calculate total event size with padding */ int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; - int padding_size = round_up(content_size, 8) - content_size; + int padding_size = round_up(content_size, 8) - content_size; // 8-byte align ev2.base.size = content_size + padding_size; - perf_map_jit_write_fully(&ev2, sizeof(ev2)); + /* Write the unwinding info event header */ + perf_map_jit_write_fully(&ev2, sizeof(ev2)); - // Populate the eh Frame header + /* + * Write EH Frame Header + * + * The EH frame header provides metadata about the DWARF unwinding + * information that follows. It includes pointers and counts that + * help perf navigate the unwinding data efficiently. + */ EhFrameHeader f; f.version = 1; - f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; - f.fde_count_enc = DwarfUData4; - f.table_enc = DwarfSData4 | DwarfDataRel; + f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte + f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count + f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte + + /* Calculate relative offsets for EH frame navigation */ f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); - f.eh_fde_count = 1; + f.eh_fde_count = 1; // We generate exactly one FDE per function f.from = -(round_up(code_size, 8) + eh_frame_size); + int cie_size = ctx.eh_frame_p - ctx.startp; f.to = -(eh_frame_size - cie_size); + /* Write EH frame data and header */ perf_map_jit_write_fully(ctx.startp, eh_frame_size); perf_map_jit_write_fully(&f, sizeof(f)); + /* Write padding to maintain alignment */ char padding_bytes[] = "\0\0\0\0\0\0\0\0"; perf_map_jit_write_fully(&padding_bytes, padding_size); - // Write the code load event. + /* + * Write Code Load Event + * + * This event tells perf about the new code region. 
It includes: + * - Memory addresses and sizes + * - Process and thread identification + * - Function name for symbol resolution + * - The actual machine code bytes + */ CodeLoadEvent ev; ev.base.event = PerfLoad; ev.base.size = sizeof(ev) + (name_length+1) + size; ev.base.time_stamp = get_current_monotonic_ticks(); ev.process_id = getpid(); - ev.thread_id = syscall(SYS_gettid); - ev.vma = base; - ev.code_address = base; + ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call + ev.vma = base; // Virtual memory address + ev.code_address = base; // Same as VMA for our use case ev.code_size = size; + + /* Assign unique code ID and increment counter */ perf_jit_map_state.code_id += 1; ev.code_id = perf_jit_map_state.code_id; + /* Write code load event and associated data */ perf_map_jit_write_fully(&ev, sizeof(ev)); - perf_map_jit_write_fully(perf_map_entry, name_length+1); - perf_map_jit_write_fully((void*)(base), size); - return; + perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator + perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code + + /* Clean up allocated memory */ + PyMem_RawFree(perf_map_entry); } +// ============================================================================= +// CLEANUP AND FINALIZATION +// ============================================================================= + +/* + * Finalize and cleanup the perf jitdump system + * + * This function is called when Python is shutting down or when the + * perf trampoline system is being disabled. It ensures all resources + * are properly released and all buffered data is flushed to disk. + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * + * Returns: 0 on success + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. + */ static int perf_map_jit_fini(void* state) { + /* + * Close jitdump file with proper synchronization + * + * We need to acquire the lock to ensure no other threads are + * writing to the file when we close it. This prevents corruption + * and ensures all data is properly flushed. + */ if (perf_jit_map_state.perf_map != NULL) { - // close the file PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); - fclose(perf_jit_map_state.perf_map); + fclose(perf_jit_map_state.perf_map); // This also flushes buffers PyThread_release_lock(perf_jit_map_state.map_lock); - // clean up the lock and state + /* Clean up synchronization primitive */ PyThread_free_lock(perf_jit_map_state.map_lock); perf_jit_map_state.perf_map = NULL; } + + /* + * Unmap the memory region + * + * This removes the signal to perf that we were generating JIT code. + * After this point, perf will no longer detect this process as + * having JIT capabilities. + */ if (perf_jit_map_state.mapped_buffer != NULL) { munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); + perf_jit_map_state.mapped_buffer = NULL; } + + /* Clear global state reference */ trampoline_api.state = NULL; - return 0; + + return 0; // Success } +// ============================================================================= +// PUBLIC API EXPORT +// ============================================================================= + +/* + * Python Perf Callbacks Structure + * + * This structure defines the callback interface that Python's trampoline + * system uses to integrate with perf profiling. 
It contains function + * pointers for initialization, event writing, and cleanup. + * + * CRITICAL: This structure and its contents are part of Python's internal + * API. The function signatures and behavior must remain stable to maintain + * compatibility with the Python interpreter's perf integration system. + * + * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h + */ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { - &perf_map_jit_init, - &perf_map_jit_write_entry, - &perf_map_jit_fini, + &perf_map_jit_init, // Initialization function + &perf_map_jit_write_entry, // Event writing function + &perf_map_jit_fini, // Cleanup function }; -#endif +#endif /* PY_HAVE_PERF_TRAMPOLINE */ \ No newline at end of file diff --git a/Python/pystate.c b/Python/pystate.c index 4144e6edefc073..0544b15aad1cc8 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -567,6 +567,7 @@ init_interpreter(PyInterpreterState *interp, } interp->sys_profile_initialized = false; interp->sys_trace_initialized = false; + interp->_code_object_generation = 0; interp->jit = false; interp->executor_list_head = NULL; interp->executor_deletion_list_head = NULL; @@ -777,6 +778,10 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) for (int t = 0; t < PY_MONITORING_TOOL_IDS; t++) { Py_CLEAR(interp->monitoring_tool_names[t]); } + interp->_code_object_generation = 0; +#ifdef Py_GIL_DISABLED + interp->tlbc_indices.tlbc_generation = 0; +#endif PyConfig_Clear(&interp->config); _PyCodec_Fini(interp); @@ -1346,9 +1351,6 @@ tstate_is_alive(PyThreadState *tstate) // lifecycle //---------- -/* Minimum size of data stack chunk */ -#define DATA_STACK_CHUNK_SIZE (16*1024) - static _PyStackChunk* allocate_chunk(int size_in_bytes, _PyStackChunk* previous) { @@ -2897,7 +2899,7 @@ _PyInterpreterState_HasFeature(PyInterpreterState *interp, unsigned long feature static PyObject ** push_chunk(PyThreadState *tstate, int size) { - int allocate_size = DATA_STACK_CHUNK_SIZE; + int allocate_size = _PY_DATA_STACK_CHUNK_SIZE; while (allocate_size < (int)sizeof(PyObject*)*(size + MINIMUM_OVERHEAD)) { allocate_size *= 2; } diff --git a/Python/remote_debug.h b/Python/remote_debug.h index edc77c302916ca..dbc6bdd09a693f 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -73,19 +73,71 @@ extern "C" { # define HAVE_PROCESS_VM_READV 0 #endif +static inline size_t +get_page_size(void) { + size_t page_size = 0; + if (page_size == 0) { +#ifdef MS_WINDOWS + SYSTEM_INFO si; + GetSystemInfo(&si); + page_size = si.dwPageSize; +#else + page_size = (size_t)getpagesize(); +#endif + } + return page_size; +} + +typedef struct page_cache_entry { + uintptr_t page_addr; // page-aligned base address + char *data; + int valid; + struct page_cache_entry *next; +} page_cache_entry_t; + +#define MAX_PAGES 1024 + // Define a platform-independent process handle structure typedef struct { pid_t pid; -#ifdef MS_WINDOWS +#if defined(__APPLE__) + mach_port_t task; +#elif defined(MS_WINDOWS) HANDLE hProcess; #endif + page_cache_entry_t pages[MAX_PAGES]; + Py_ssize_t page_size; } proc_handle_t; +static void +_Py_RemoteDebug_FreePageCache(proc_handle_t *handle) +{ + for (int i = 0; i < MAX_PAGES; i++) { + PyMem_RawFree(handle->pages[i].data); + handle->pages[i].data = NULL; + handle->pages[i].valid = 0; + } +} + +void +_Py_RemoteDebug_ClearCache(proc_handle_t *handle) +{ + for (int i = 0; i < MAX_PAGES; i++) { + handle->pages[i].valid = 0; + } +} + +#if defined(__APPLE__) && TARGET_OS_OSX +static mach_port_t pid_to_task(pid_t pid); +#endif + // 
Initialize the process handle static int _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) { handle->pid = pid; -#ifdef MS_WINDOWS +#if defined(__APPLE__) + handle->task = pid_to_task(handle->pid); +#elif defined(MS_WINDOWS) handle->hProcess = OpenProcess( PROCESS_VM_READ | PROCESS_VM_WRITE | PROCESS_VM_OPERATION | PROCESS_QUERY_INFORMATION, FALSE, pid); @@ -94,6 +146,11 @@ _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) { return -1; } #endif + handle->page_size = get_page_size(); + for (int i = 0; i < MAX_PAGES; i++) { + handle->pages[i].data = NULL; + handle->pages[i].valid = 0; + } return 0; } @@ -107,6 +164,7 @@ _Py_RemoteDebug_CleanupProcHandle(proc_handle_t *handle) { } #endif handle->pid = 0; + _Py_RemoteDebug_FreePageCache(handle); } #if defined(__APPLE__) && TARGET_OS_OSX @@ -755,7 +813,7 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address #elif defined(__APPLE__) && TARGET_OS_OSX Py_ssize_t result = -1; kern_return_t kr = mach_vm_read_overwrite( - pid_to_task(handle->pid), + handle->task, (mach_vm_address_t)remote_address, len, (mach_vm_address_t)dst, @@ -780,6 +838,59 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address #endif } +int +_Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, + uintptr_t addr, + size_t size, + void *out) +{ + size_t page_size = handle->page_size; + uintptr_t page_base = addr & ~(page_size - 1); + size_t offset_in_page = addr - page_base; + + if (offset_in_page + size > page_size) { + return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); + } + + // Search for valid cached page + for (int i = 0; i < MAX_PAGES; i++) { + page_cache_entry_t *entry = &handle->pages[i]; + if (entry->valid && entry->page_addr == page_base) { + memcpy(out, entry->data + offset_in_page, size); + return 0; + } + } + + // Find reusable slot + for (int i = 0; i < MAX_PAGES; i++) { + page_cache_entry_t *entry = &handle->pages[i]; + if (!entry->valid) { + if (entry->data == NULL) { + entry->data = PyMem_RawMalloc(page_size); + if (entry->data == NULL) { + PyErr_NoMemory(); + return -1; + } + } + + if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) { + // Try to just copy the exact ammount as a fallback + PyErr_Clear(); + goto fallback; + } + + entry->page_addr = page_base; + entry->valid = 1; + memcpy(out, entry->data + offset_in_page, size); + return 0; + } + } + +fallback: + // Cache full — fallback to uncached read + return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); +} + static int _Py_RemoteDebug_ReadDebugOffsets( proc_handle_t *handle,