From 52509cc94b1a18cb325dbfa7e5f830b32759a903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Simon?= Date: Sun, 25 May 2025 18:23:38 +0200 Subject: [PATCH 1/8] gh-134582: Fix t-strings untokenize() roundtrip removing space between braces (#134603) --- Lib/test/test_tokenize.py | 4 ++++ Lib/tokenize.py | 14 +++++++------- .../2025-05-23-23-43-39.gh-issue-134582.9POq3l.rst | 1 + 3 files changed, 12 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-05-23-23-43-39.gh-issue-134582.9POq3l.rst diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 2d41a5e5ac0697..e6b19fe1812d44 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1975,6 +1975,10 @@ def test_roundtrip(self): for case in cases: self.check_roundtrip(case) + self.check_roundtrip(r"t'{ {}}'") + self.check_roundtrip(r"t'{f'{ {}}'}{ {}}'") + self.check_roundtrip(r"f'{t'{ {}}'}{ {}}'") + def test_continuation(self): # Balancing continuation diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 8d01fd7bce41b0..559a7aecbde2d1 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -274,7 +274,7 @@ def compat(self, token, iterable): toks_append = self.tokens.append startline = token[0] in (NEWLINE, NL) prevstring = False - in_fstring = 0 + in_fstring_or_tstring = 0 for tok in _itertools.chain([token], iterable): toknum, tokval = tok[:2] @@ -293,10 +293,10 @@ def compat(self, token, iterable): else: prevstring = False - if toknum == FSTRING_START: - in_fstring += 1 - elif toknum == FSTRING_END: - in_fstring -= 1 + if toknum in {FSTRING_START, TSTRING_START}: + in_fstring_or_tstring += 1 + elif toknum in {FSTRING_END, TSTRING_END}: + in_fstring_or_tstring -= 1 if toknum == INDENT: indents.append(tokval) continue @@ -311,8 +311,8 @@ def compat(self, token, iterable): elif toknum in {FSTRING_MIDDLE, TSTRING_MIDDLE}: tokval = self.escape_brackets(tokval) - # Insert a space between two consecutive brackets if we are in an f-string - if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring: + # Insert a space between two consecutive brackets if we are in an f-string or t-string + if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring_or_tstring: tokval = ' ' + tokval # Insert a space between two consecutive f-strings diff --git a/Misc/NEWS.d/next/Library/2025-05-23-23-43-39.gh-issue-134582.9POq3l.rst b/Misc/NEWS.d/next/Library/2025-05-23-23-43-39.gh-issue-134582.9POq3l.rst new file mode 100644 index 00000000000000..23e1d5891b685f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-23-23-43-39.gh-issue-134582.9POq3l.rst @@ -0,0 +1 @@ +Fix tokenize.untokenize() round-trip errors related to t-strings braces escaping From b51b08a0a5fedde4f74e4cc338b8b5ad9656ad50 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Sun, 25 May 2025 10:23:28 -0700 Subject: [PATCH 2/8] annotationlib docs: note that ForwardRef.evaluate eventually defaults to empty globals (#134661) --- Doc/library/annotationlib.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Doc/library/annotationlib.rst b/Doc/library/annotationlib.rst index 41c9ce479ff0f8..7dfc11449a6cbc 100644 --- a/Doc/library/annotationlib.rst +++ b/Doc/library/annotationlib.rst @@ -211,6 +211,10 @@ Classes means may not have any information about their scope, so passing arguments to this method may be necessary to evaluate them successfully. 
+ If no *owner*, *globals*, *locals*, or *type_params* are provided and the + :class:`~ForwardRef` does not contain information about its origin, + empty globals and locals dictionaries are used. + .. versionadded:: 3.14 From 57fef27cfc2bdfc1e3a65ef8c8a760198d15b14d Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Sun, 25 May 2025 10:26:39 -0700 Subject: [PATCH 3/8] gh-133960: Improve typing.evaluate_forward_ref (#133961) As explained in #133960, this removes most of the behavior differences with ForwardRef.evaluate. The remaining difference is about recursive evaluation of forwardrefs; this is practically useful in cases where an annotation refers to a type alias that itself is string-valued. This also improves several edge cases that were previously not handled optimally. For example, the function now takes advantage of the partial evaluation behavior of ForwardRef.evaluate() to evaluate more ForwardRefs in the FORWARDREF format. This also fixes #133959 as a side effect, because the buggy behavior in #133959 derives from evaluate_forward_ref(). --- Doc/library/typing.rst | 15 +-- Lib/test/test_typing.py | 116 +++++++++++++++--- Lib/typing.py | 52 ++++---- ...-05-12-20-38-57.gh-issue-133960.Aee79f.rst | 3 + 4 files changed, 131 insertions(+), 55 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-05-12-20-38-57.gh-issue-133960.Aee79f.rst diff --git a/Doc/library/typing.rst b/Doc/library/typing.rst index 54cc3ea3311adf..dd8ea3c364f49a 100644 --- a/Doc/library/typing.rst +++ b/Doc/library/typing.rst @@ -3500,20 +3500,11 @@ Introspection helpers Evaluate an :class:`annotationlib.ForwardRef` as a :term:`type hint`. This is similar to calling :meth:`annotationlib.ForwardRef.evaluate`, - but unlike that method, :func:`!evaluate_forward_ref` also: - - * Recursively evaluates forward references nested within the type hint. - * Raises :exc:`TypeError` when it encounters certain objects that are - not valid type hints. - * Replaces type hints that evaluate to :const:`!None` with - :class:`types.NoneType`. - * Supports the :attr:`~annotationlib.Format.FORWARDREF` and - :attr:`~annotationlib.Format.STRING` formats. + but unlike that method, :func:`!evaluate_forward_ref` also + recursively evaluates forward references nested within the type hint. See the documentation for :meth:`annotationlib.ForwardRef.evaluate` for - the meaning of the *owner*, *globals*, *locals*, and *type_params* parameters. - *format* specifies the format of the annotation and is a member of - the :class:`annotationlib.Format` enum. + the meaning of the *owner*, *globals*, *locals*, *type_params*, and *format* parameters. .. versionadded:: 3.14 diff --git a/Lib/test/test_typing.py b/Lib/test/test_typing.py index 246be22a0d8ec4..e2b6f459aa24fc 100644 --- a/Lib/test/test_typing.py +++ b/Lib/test/test_typing.py @@ -6859,12 +6859,10 @@ def test_forward_ref_and_final(self): self.assertEqual(hints, {'value': Final}) def test_top_level_class_var(self): - # https://bugs.python.org/issue45166 - with self.assertRaisesRegex( - TypeError, - r'typing.ClassVar\[int\] is not valid as type argument', - ): - get_type_hints(ann_module6) + # This is not meaningful but we don't raise for it. 
+ # https://github.com/python/cpython/issues/133959 + hints = get_type_hints(ann_module6) + self.assertEqual(hints, {'wrong': ClassVar[int]}) def test_get_type_hints_typeddict(self): self.assertEqual(get_type_hints(TotalMovie), {'title': str, 'year': int}) @@ -6967,6 +6965,11 @@ def foo(a: 'Callable[..., T]'): self.assertEqual(get_type_hints(foo, globals(), locals()), {'a': Callable[..., T]}) + def test_special_forms_no_forward(self): + def f(x: ClassVar[int]): + pass + self.assertEqual(get_type_hints(f), {'x': ClassVar[int]}) + def test_special_forms_forward(self): class C: @@ -6982,8 +6985,9 @@ class CF: self.assertEqual(get_type_hints(C, globals())['b'], Final[int]) self.assertEqual(get_type_hints(C, globals())['x'], ClassVar) self.assertEqual(get_type_hints(C, globals())['y'], Final) - with self.assertRaises(TypeError): - get_type_hints(CF, globals()), + lfi = get_type_hints(CF, globals())['b'] + self.assertIs(get_origin(lfi), list) + self.assertEqual(get_args(lfi), (Final[int],)) def test_union_forward_recursion(self): ValueList = List['Value'] @@ -7216,33 +7220,113 @@ class C(Generic[T]): pass class EvaluateForwardRefTests(BaseTestCase): def test_evaluate_forward_ref(self): int_ref = ForwardRef('int') - missing = ForwardRef('missing') + self.assertIs(typing.evaluate_forward_ref(int_ref), int) self.assertIs( typing.evaluate_forward_ref(int_ref, type_params=()), int, ) + self.assertIs( + typing.evaluate_forward_ref(int_ref, format=annotationlib.Format.VALUE), + int, + ) self.assertIs( typing.evaluate_forward_ref( - int_ref, type_params=(), format=annotationlib.Format.FORWARDREF, + int_ref, format=annotationlib.Format.FORWARDREF, ), int, ) + self.assertEqual( + typing.evaluate_forward_ref( + int_ref, format=annotationlib.Format.STRING, + ), + 'int', + ) + + def test_evaluate_forward_ref_undefined(self): + missing = ForwardRef('missing') + with self.assertRaises(NameError): + typing.evaluate_forward_ref(missing) self.assertIs( typing.evaluate_forward_ref( - missing, type_params=(), format=annotationlib.Format.FORWARDREF, + missing, format=annotationlib.Format.FORWARDREF, ), missing, ) self.assertEqual( typing.evaluate_forward_ref( - int_ref, type_params=(), format=annotationlib.Format.STRING, + missing, format=annotationlib.Format.STRING, ), - 'int', + "missing", ) - def test_evaluate_forward_ref_no_type_params(self): - ref = ForwardRef('int') - self.assertIs(typing.evaluate_forward_ref(ref), int) + def test_evaluate_forward_ref_nested(self): + ref = ForwardRef("int | list['str']") + self.assertEqual( + typing.evaluate_forward_ref(ref), + int | list[str], + ) + self.assertEqual( + typing.evaluate_forward_ref(ref, format=annotationlib.Format.FORWARDREF), + int | list[str], + ) + self.assertEqual( + typing.evaluate_forward_ref(ref, format=annotationlib.Format.STRING), + "int | list['str']", + ) + + why = ForwardRef('"\'str\'"') + self.assertIs(typing.evaluate_forward_ref(why), str) + + def test_evaluate_forward_ref_none(self): + none_ref = ForwardRef('None') + self.assertIs(typing.evaluate_forward_ref(none_ref), None) + + def test_globals(self): + A = "str" + ref = ForwardRef('list[A]') + with self.assertRaises(NameError): + typing.evaluate_forward_ref(ref) + self.assertEqual( + typing.evaluate_forward_ref(ref, globals={'A': A}), + list[str], + ) + + def test_owner(self): + ref = ForwardRef("A") + + with self.assertRaises(NameError): + typing.evaluate_forward_ref(ref) + + # We default to the globals of `owner`, + # so it no longer raises `NameError` + self.assertIs( + 
typing.evaluate_forward_ref(ref, owner=Loop), A + ) + + def test_inherited_owner(self): + # owner passed to evaluate_forward_ref + ref = ForwardRef("list['A']") + self.assertEqual( + typing.evaluate_forward_ref(ref, owner=Loop), + list[A], + ) + + # owner set on the ForwardRef + ref = ForwardRef("list['A']", owner=Loop) + self.assertEqual( + typing.evaluate_forward_ref(ref), + list[A], + ) + + def test_partial_evaluation(self): + ref = ForwardRef("list[A]") + with self.assertRaises(NameError): + typing.evaluate_forward_ref(ref) + + self.assertEqual( + typing.evaluate_forward_ref(ref, format=annotationlib.Format.FORWARDREF), + list[EqualToForwardRef('A')], + ) class CollectionsAbcTests(BaseTestCase): diff --git a/Lib/typing.py b/Lib/typing.py index 98af61be8b0716..ed1dd4fc6413a5 100644 --- a/Lib/typing.py +++ b/Lib/typing.py @@ -956,12 +956,8 @@ def evaluate_forward_ref( """Evaluate a forward reference as a type hint. This is similar to calling the ForwardRef.evaluate() method, - but unlike that method, evaluate_forward_ref() also: - - * Recursively evaluates forward references nested within the type hint. - * Rejects certain objects that are not valid type hints. - * Replaces type hints that evaluate to None with types.NoneType. - * Supports the *FORWARDREF* and *STRING* formats. + but unlike that method, evaluate_forward_ref() also + recursively evaluates forward references nested within the type hint. *forward_ref* must be an instance of ForwardRef. *owner*, if given, should be the object that holds the annotations that the forward reference @@ -981,23 +977,24 @@ def evaluate_forward_ref( if forward_ref.__forward_arg__ in _recursive_guard: return forward_ref - try: - value = forward_ref.evaluate(globals=globals, locals=locals, - type_params=type_params, owner=owner) - except NameError: - if format == _lazy_annotationlib.Format.FORWARDREF: - return forward_ref - else: - raise - - type_ = _type_check( - value, - "Forward references must evaluate to types.", - is_argument=forward_ref.__forward_is_argument__, - allow_special_forms=forward_ref.__forward_is_class__, - ) + if format is None: + format = _lazy_annotationlib.Format.VALUE + value = forward_ref.evaluate(globals=globals, locals=locals, + type_params=type_params, owner=owner, format=format) + + if (isinstance(value, _lazy_annotationlib.ForwardRef) + and format == _lazy_annotationlib.Format.FORWARDREF): + return value + + if isinstance(value, str): + value = _make_forward_ref(value, module=forward_ref.__forward_module__, + owner=owner or forward_ref.__owner__, + is_argument=forward_ref.__forward_is_argument__, + is_class=forward_ref.__forward_is_class__) + if owner is None: + owner = forward_ref.__owner__ return _eval_type( - type_, + value, globals, locals, type_params, @@ -2338,12 +2335,12 @@ def get_type_hints(obj, globalns=None, localns=None, include_extras=False, # This only affects ForwardRefs. 
base_globals, base_locals = base_locals, base_globals for name, value in ann.items(): - if value is None: - value = type(None) if isinstance(value, str): value = _make_forward_ref(value, is_argument=False, is_class=True) value = _eval_type(value, base_globals, base_locals, base.__type_params__, format=format, owner=obj) + if value is None: + value = type(None) hints[name] = value if include_extras or format == Format.STRING: return hints @@ -2377,8 +2374,6 @@ def get_type_hints(obj, globalns=None, localns=None, include_extras=False, localns = globalns type_params = getattr(obj, "__type_params__", ()) for name, value in hints.items(): - if value is None: - value = type(None) if isinstance(value, str): # class-level forward refs were handled above, this must be either # a module-level annotation or a function argument annotation @@ -2387,7 +2382,10 @@ def get_type_hints(obj, globalns=None, localns=None, include_extras=False, is_argument=not isinstance(obj, types.ModuleType), is_class=False, ) - hints[name] = _eval_type(value, globalns, localns, type_params, format=format, owner=obj) + value = _eval_type(value, globalns, localns, type_params, format=format, owner=obj) + if value is None: + value = type(None) + hints[name] = value return hints if include_extras else {k: _strip_annotations(t) for k, t in hints.items()} diff --git a/Misc/NEWS.d/next/Library/2025-05-12-20-38-57.gh-issue-133960.Aee79f.rst b/Misc/NEWS.d/next/Library/2025-05-12-20-38-57.gh-issue-133960.Aee79f.rst new file mode 100644 index 00000000000000..66e8483b25bc37 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-12-20-38-57.gh-issue-133960.Aee79f.rst @@ -0,0 +1,3 @@ +Simplify and improve :func:`typing.evaluate_forward_ref`. It now no longer +raises errors on certain invalid types. In several situations, it is now +able to evaluate forward references that were previously unsupported. From 1000283694136ee0538baa6c6b2eee662ee618d4 Mon Sep 17 00:00:00 2001 From: Chris Eibl <138194463+chris-eibl@users.noreply.github.com> Date: Sun, 25 May 2025 20:17:13 +0200 Subject: [PATCH 4/8] GH-130328: Fix WindowsConsoleGetEventTests after gh-133728 (gh-134660) --- Lib/test/test_pyrepl/test_windows_console.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_pyrepl/test_windows_console.py b/Lib/test/test_pyrepl/test_windows_console.py index a52ae96a83ddde..f9607e02c604ff 100644 --- a/Lib/test/test_pyrepl/test_windows_console.py +++ b/Lib/test/test_pyrepl/test_windows_console.py @@ -386,6 +386,7 @@ def get_event(self, input_records, **kwargs) -> Console: self.console._read_input = self.mock self.console._WindowsConsole__vt_support = kwargs.get("vt_support", False) + self.console.wait = MagicMock(return_value=True) event = self.console.get_event(block=False) return event From 24a47155d2172966fab7d56f2bf9181056fba8d0 Mon Sep 17 00:00:00 2001 From: Julien Palard Date: Sun, 25 May 2025 21:22:52 +0200 Subject: [PATCH 5/8] Fix sphinx-lint warnings (default-role used). (GH-134647) --- Doc/c-api/code.rst | 2 +- Doc/c-api/function.rst | 2 +- Doc/c-api/typeobj.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/c-api/code.rst b/Doc/c-api/code.rst index 6eae24b38fae48..42594f063b0709 100644 --- a/Doc/c-api/code.rst +++ b/Doc/c-api/code.rst @@ -182,7 +182,7 @@ bound into a function. Type of a code object watcher callback function. If *event* is ``PY_CODE_EVENT_CREATE``, then the callback is invoked - after `co` has been fully initialized. Otherwise, the callback is invoked + after *co* has been fully initialized. 
Otherwise, the callback is invoked before the destruction of *co* takes place, so the prior state of *co* can be inspected. diff --git a/Doc/c-api/function.rst b/Doc/c-api/function.rst index 58792edeed25e3..63b78f677674e9 100644 --- a/Doc/c-api/function.rst +++ b/Doc/c-api/function.rst @@ -169,7 +169,7 @@ There are a few functions specific to Python functions. unpredictable effects, including infinite recursion. If *event* is ``PyFunction_EVENT_CREATE``, then the callback is invoked - after `func` has been fully initialized. Otherwise, the callback is invoked + after *func* has been fully initialized. Otherwise, the callback is invoked before the modification to *func* takes place, so the prior state of *func* can be inspected. The runtime is permitted to optimize away the creation of function objects when possible. In such cases no event will be emitted. diff --git a/Doc/c-api/typeobj.rst b/Doc/c-api/typeobj.rst index 5df0c0fe608e53..91046c0e6f18ae 100644 --- a/Doc/c-api/typeobj.rst +++ b/Doc/c-api/typeobj.rst @@ -1238,7 +1238,7 @@ and :c:data:`PyType_Type` effectively act as defaults.) .. c:macro:: Py_TPFLAGS_MANAGED_DICT - This bit indicates that instances of the class have a `~object.__dict__` + This bit indicates that instances of the class have a :attr:`~object.__dict__` attribute, and that the space for the dictionary is managed by the VM. If this flag is set, :c:macro:`Py_TPFLAGS_HAVE_GC` should also be set. From 328a778db8cc6ecadf0964a8e6a1834078b2d0d3 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com> Date: Sun, 25 May 2025 23:09:02 +0300 Subject: [PATCH 6/8] gh-134357: Remove unused imports in tests (#134340) --- Lib/test/.ruff.toml | 7 +++++++ Lib/test/support/__init__.py | 3 +-- Lib/test/support/interpreters/channels.py | 4 ++-- Lib/test/support/interpreters/queues.py | 1 - Lib/test/test_capi/test_config.py | 1 - Lib/test/test_codeccallbacks.py | 1 - Lib/test/test_crossinterp.py | 2 -- Lib/test/test_ctypes/_support.py | 1 - Lib/test/test_ctypes/test_byteswap.py | 1 - Lib/test/test_ctypes/test_generated_structs.py | 2 +- Lib/test/test_decimal.py | 1 - Lib/test/test_generated_cases.py | 5 +---- Lib/test/test_genericpath.py | 2 +- Lib/test/test_gzip.py | 1 - Lib/test/test_hashlib.py | 1 - Lib/test/test_hmac.py | 1 - Lib/test/test_idle.py | 2 +- Lib/test/test_interpreters/test_queues.py | 1 - Lib/test/test_interpreters/utils.py | 1 - Lib/test/test_ntpath.py | 3 +-- Lib/test/test_peepholer.py | 2 +- Lib/test/test_pty.py | 1 - Lib/test/test_pydoc/test_pydoc.py | 2 +- Lib/test/test_remote_pdb.py | 9 ++------- Lib/test/test_shutil.py | 2 +- Lib/test/test_string/_support.py | 1 - Lib/test/test_sysconfig.py | 1 - Lib/test/test_threading.py | 2 +- Lib/test/test_tools/i18n_data/docstrings.py | 2 +- Lib/test/test_types.py | 2 +- Lib/test/test_typing.py | 3 +-- Lib/test/test_venv.py | 2 +- Lib/test/test_webbrowser.py | 1 - Lib/test/test_zipfile/__main__.py | 2 +- Lib/test/test_zstd.py | 1 - 35 files changed, 26 insertions(+), 48 deletions(-) diff --git a/Lib/test/.ruff.toml b/Lib/test/.ruff.toml index 7aa8a4785d6844..f1a967203ce4ba 100644 --- a/Lib/test/.ruff.toml +++ b/Lib/test/.ruff.toml @@ -19,5 +19,12 @@ extend-exclude = [ [lint] select = [ + "F401", # Unused import "F811", # Redefinition of unused variable (useful for finding test methods with the same name) ] + +[lint.per-file-ignores] +"*/**/__main__.py" = ["F401"] # Unused import +"test_import/*.py" = ["F401"] # Unused import +"test_importlib/*.py" = ["F401"] # Unused import 
+"typinganndata/partialexecution/*.py" = ["F401"] # Unused import diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index b7cd7940eb15b3..351d832a26d1df 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -1101,7 +1101,6 @@ def __init__(self): self.started = False def start(self): - import warnings try: f = open(self.procfile, 'r') except OSError as e: @@ -2728,7 +2727,7 @@ def iter_builtin_types(): # Fall back to making a best-effort guess. if hasattr(object, '__flags__'): # Look for any type object with the Py_TPFLAGS_STATIC_BUILTIN flag set. - import datetime + import datetime # noqa: F401 seen = set() for cls, subs in walk_class_hierarchy(object): if cls in seen: diff --git a/Lib/test/support/interpreters/channels.py b/Lib/test/support/interpreters/channels.py index 7a2bd7d63f808f..b25a17b1aabb93 100644 --- a/Lib/test/support/interpreters/channels.py +++ b/Lib/test/support/interpreters/channels.py @@ -6,8 +6,8 @@ # aliases: from _interpchannels import ( - ChannelError, ChannelNotFoundError, ChannelClosedError, - ChannelEmptyError, ChannelNotEmptyError, + ChannelError, ChannelNotFoundError, ChannelClosedError, # noqa: F401 + ChannelEmptyError, ChannelNotEmptyError, # noqa: F401 ) from ._crossinterp import ( UNBOUND_ERROR, UNBOUND_REMOVE, diff --git a/Lib/test/support/interpreters/queues.py b/Lib/test/support/interpreters/queues.py index d6a3197d9e0e26..99987f2f6926b0 100644 --- a/Lib/test/support/interpreters/queues.py +++ b/Lib/test/support/interpreters/queues.py @@ -1,6 +1,5 @@ """Cross-interpreter Queues High Level Module.""" -import pickle import queue import time import weakref diff --git a/Lib/test/test_capi/test_config.py b/Lib/test/test_capi/test_config.py index a2d70dd3af482d..04a27de8d84994 100644 --- a/Lib/test/test_capi/test_config.py +++ b/Lib/test/test_capi/test_config.py @@ -3,7 +3,6 @@ """ import os import sys -import sysconfig import types import unittest from test import support diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index a767f67a02cf56..65d54d1004d647 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -2,7 +2,6 @@ import codecs import html.entities import itertools -import re import sys import unicodedata import unittest diff --git a/Lib/test/test_crossinterp.py b/Lib/test/test_crossinterp.py index c54635eaeab3f9..2fa0077a09bbbb 100644 --- a/Lib/test/test_crossinterp.py +++ b/Lib/test/test_crossinterp.py @@ -1,6 +1,4 @@ import contextlib -import importlib -import importlib.util import itertools import sys import types diff --git a/Lib/test/test_ctypes/_support.py b/Lib/test/test_ctypes/_support.py index 946d654a19aff8..700657a4e41f74 100644 --- a/Lib/test/test_ctypes/_support.py +++ b/Lib/test/test_ctypes/_support.py @@ -3,7 +3,6 @@ import ctypes from _ctypes import Structure, Union, _Pointer, Array, _SimpleCData, CFuncPtr import sys -from test import support _CData = Structure.__base__ diff --git a/Lib/test/test_ctypes/test_byteswap.py b/Lib/test/test_ctypes/test_byteswap.py index ea5951603f9324..f14e1aa32e17ab 100644 --- a/Lib/test/test_ctypes/test_byteswap.py +++ b/Lib/test/test_ctypes/test_byteswap.py @@ -1,5 +1,4 @@ import binascii -import ctypes import math import struct import sys diff --git a/Lib/test/test_ctypes/test_generated_structs.py b/Lib/test/test_ctypes/test_generated_structs.py index aa448fad5bbae6..1cb46a82701553 100644 --- a/Lib/test/test_ctypes/test_generated_structs.py +++ b/Lib/test/test_ctypes/test_generated_structs.py 
@@ -10,7 +10,7 @@ """ import unittest -from test.support import import_helper, verbose +from test.support import import_helper import re from dataclasses import dataclass from functools import cached_property diff --git a/Lib/test/test_decimal.py b/Lib/test/test_decimal.py index 9e298401dc3dcc..c0a1e378583ba8 100644 --- a/Lib/test/test_decimal.py +++ b/Lib/test/test_decimal.py @@ -28,7 +28,6 @@ import math import os, sys import operator -import warnings import pickle, copy import unittest import numbers diff --git a/Lib/test/test_generated_cases.py b/Lib/test/test_generated_cases.py index a71ddc01d1c045..37046d8e1c02b7 100644 --- a/Lib/test/test_generated_cases.py +++ b/Lib/test/test_generated_cases.py @@ -1,11 +1,9 @@ import contextlib import os -import re import sys import tempfile import unittest -from io import StringIO from test import support from test import test_tools @@ -31,12 +29,11 @@ def skip_if_different_mount_drives(): test_tools.skip_if_missing("cases_generator") with test_tools.imports_under_tool("cases_generator"): - from analyzer import analyze_forest, StackItem + from analyzer import StackItem from cwriter import CWriter import parser from stack import Local, Stack import tier1_generator - import opcode_metadata_generator import optimizer_generator diff --git a/Lib/test/test_genericpath.py b/Lib/test/test_genericpath.py index df07af01fc7540..16c3268fefb034 100644 --- a/Lib/test/test_genericpath.py +++ b/Lib/test/test_genericpath.py @@ -8,7 +8,7 @@ import unittest import warnings from test.support import ( - is_apple, is_emscripten, os_helper, warnings_helper + is_apple, os_helper, warnings_helper ) from test.support.script_helper import assert_python_ok from test.support.os_helper import FakePath diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py index ccbacc7c19b6e6..a12ff5662a73db 100644 --- a/Lib/test/test_gzip.py +++ b/Lib/test/test_gzip.py @@ -9,7 +9,6 @@ import struct import sys import unittest -import warnings from subprocess import PIPE, Popen from test.support import catch_unraisable_exception from test.support import import_helper diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py index de4c8a1670f591..161c7652d7ab11 100644 --- a/Lib/test/test_hashlib.py +++ b/Lib/test/test_hashlib.py @@ -17,7 +17,6 @@ import tempfile import threading import unittest -import warnings from test import support from test.support import _4G, bigmemtest from test.support import hashlib_helper diff --git a/Lib/test/test_hmac.py b/Lib/test/test_hmac.py index e898644dd8a552..ff6e1bce0ef801 100644 --- a/Lib/test/test_hmac.py +++ b/Lib/test/test_hmac.py @@ -21,7 +21,6 @@ import hmac import hashlib import random -import test.support import test.support.hashlib_helper as hashlib_helper import types import unittest diff --git a/Lib/test/test_idle.py b/Lib/test/test_idle.py index 3d8b7ecc0ecb6d..ebf572ac5caac1 100644 --- a/Lib/test/test_idle.py +++ b/Lib/test/test_idle.py @@ -16,7 +16,7 @@ # Unittest.main and test.libregrtest.runtest.runtest_inner # call load_tests, when present here, to discover tests to run. 
-from idlelib.idle_test import load_tests +from idlelib.idle_test import load_tests # noqa: F401 if __name__ == '__main__': tk.NoDefaultRoot() diff --git a/Lib/test/test_interpreters/test_queues.py b/Lib/test/test_interpreters/test_queues.py index 64a2db1230d023..757373904d7a43 100644 --- a/Lib/test/test_interpreters/test_queues.py +++ b/Lib/test/test_interpreters/test_queues.py @@ -9,7 +9,6 @@ _queues = import_helper.import_module('_interpqueues') from test.support import interpreters from test.support.interpreters import queues, _crossinterp -import test._crossinterp_definitions as defs from .utils import _run_output, TestBase as _TestBase diff --git a/Lib/test/test_interpreters/utils.py b/Lib/test/test_interpreters/utils.py index fc4ad662e03b66..c25e0fb7475e7e 100644 --- a/Lib/test/test_interpreters/utils.py +++ b/Lib/test/test_interpreters/utils.py @@ -12,7 +12,6 @@ import threading import types import unittest -import warnings from test import support diff --git a/Lib/test/test_ntpath.py b/Lib/test/test_ntpath.py index f83ef225a6e48e..c3b0bdaebc2329 100644 --- a/Lib/test/test_ntpath.py +++ b/Lib/test/test_ntpath.py @@ -6,8 +6,7 @@ import sys import unittest import warnings -from test.support import cpython_only, os_helper -from test.support import TestFailed, is_emscripten +from test.support import TestFailed, cpython_only, os_helper from test.support.os_helper import FakePath from test import test_genericpath from tempfile import TemporaryFile diff --git a/Lib/test/test_peepholer.py b/Lib/test/test_peepholer.py index 0a9ba578673b39..f33de3d420ca34 100644 --- a/Lib/test/test_peepholer.py +++ b/Lib/test/test_peepholer.py @@ -12,7 +12,7 @@ from test import support from test.support.bytecode_helper import ( - BytecodeTestCase, CfgOptimizationTestCase, CompilationStepTestCase) + BytecodeTestCase, CfgOptimizationTestCase) def compile_pattern_with_fast_locals(pattern): diff --git a/Lib/test/test_pty.py b/Lib/test/test_pty.py index c1728f5019d042..4836f38c388c05 100644 --- a/Lib/test/test_pty.py +++ b/Lib/test/test_pty.py @@ -20,7 +20,6 @@ import signal import socket import io # readline -import warnings TEST_STRING_1 = b"I wish to buy a fish license.\n" TEST_STRING_2 = b"For my pet fish, Eric.\n" diff --git a/Lib/test/test_pydoc/test_pydoc.py b/Lib/test/test_pydoc/test_pydoc.py index 281b24eaa36b80..d1d6f4987def0c 100644 --- a/Lib/test/test_pydoc/test_pydoc.py +++ b/Lib/test/test_pydoc/test_pydoc.py @@ -553,7 +553,7 @@ class object # of the known subclasses of object. (doc.docclass() used to # fail if HeapType was imported before running this test, like # when running tests sequentially.) 
- from _testcapi import HeapType + from _testcapi import HeapType # noqa: F401 except ImportError: pass text = doc.docclass(object) diff --git a/Lib/test/test_remote_pdb.py b/Lib/test/test_remote_pdb.py index aef8a6b0129092..a1c50af15f3dd2 100644 --- a/Lib/test/test_remote_pdb.py +++ b/Lib/test/test_remote_pdb.py @@ -1,5 +1,4 @@ import io -import time import itertools import json import os @@ -8,16 +7,13 @@ import socket import subprocess import sys -import tempfile import textwrap -import threading import unittest import unittest.mock from contextlib import closing, contextmanager, redirect_stdout, redirect_stderr, ExitStack -from pathlib import Path from test.support import is_wasi, cpython_only, force_color, requires_subprocess, SHORT_TIMEOUT -from test.support.os_helper import temp_dir, TESTFN, unlink -from typing import Dict, List, Optional, Tuple, Union, Any +from test.support.os_helper import TESTFN, unlink +from typing import List import pdb from pdb import _PdbServer, _PdbClient @@ -1434,7 +1430,6 @@ def test_multi_line_commands(self): def _supports_remote_attaching(): - from contextlib import suppress PROCESS_VM_READV_SUPPORTED = False try: diff --git a/Lib/test/test_shutil.py b/Lib/test/test_shutil.py index 62c80aab4b3305..ebb6cf88336249 100644 --- a/Lib/test/test_shutil.py +++ b/Lib/test/test_shutil.py @@ -3492,7 +3492,7 @@ def test_module_all_attribute(self): target_api.append('disk_usage') self.assertEqual(set(shutil.__all__), set(target_api)) with self.assertWarns(DeprecationWarning): - from shutil import ExecError + from shutil import ExecError # noqa: F401 if __name__ == '__main__': diff --git a/Lib/test/test_string/_support.py b/Lib/test/test_string/_support.py index eaa3354a559246..abdddaf187b4fe 100644 --- a/Lib/test/test_string/_support.py +++ b/Lib/test/test_string/_support.py @@ -1,4 +1,3 @@ -import unittest from string.templatelib import Interpolation diff --git a/Lib/test/test_sysconfig.py b/Lib/test/test_sysconfig.py index d30f69ded6643a..2c0df9376abfc6 100644 --- a/Lib/test/test_sysconfig.py +++ b/Lib/test/test_sysconfig.py @@ -32,7 +32,6 @@ from sysconfig.__main__ import _main, _parse_makefile, _get_pybuilddir, _get_json_data_name import _imp import _osx_support -import _sysconfig HAS_USER_BASE = sysconfig._HAS_USER_BASE diff --git a/Lib/test/test_threading.py b/Lib/test/test_threading.py index 0e51e7fc8c5a76..59b3a749d2fffa 100644 --- a/Lib/test/test_threading.py +++ b/Lib/test/test_threading.py @@ -1253,7 +1253,7 @@ def test_start_new_thread_failed(self): # its state should be removed from interpreter' thread states list # to avoid its double cleanup try: - from resource import setrlimit, RLIMIT_NPROC + from resource import setrlimit, RLIMIT_NPROC # noqa: F401 except ImportError as err: self.skipTest(err) # RLIMIT_NPROC is specific to Linux and BSD code = """if 1: diff --git a/Lib/test/test_tools/i18n_data/docstrings.py b/Lib/test/test_tools/i18n_data/docstrings.py index 151a55a4b56ba6..14559a632da158 100644 --- a/Lib/test/test_tools/i18n_data/docstrings.py +++ b/Lib/test/test_tools/i18n_data/docstrings.py @@ -1,7 +1,7 @@ """Module docstring""" # Test docstring extraction -from gettext import gettext as _ +from gettext import gettext as _ # noqa: F401 # Empty docstring diff --git a/Lib/test/test_types.py b/Lib/test/test_types.py index 3097c7ddf05901..9011e0e1962820 100644 --- a/Lib/test/test_types.py +++ b/Lib/test/test_types.py @@ -2516,7 +2516,7 @@ def setUpClass(cls): from test.support import interpreters except ModuleNotFoundError: raise 
unittest.SkipTest('subinterpreters required') - import test.support.interpreters.channels + import test.support.interpreters.channels # noqa: F401 @cpython_only @no_rerun('channels (and queues) might have a refleak; see gh-122199') diff --git a/Lib/test/test_typing.py b/Lib/test/test_typing.py index e2b6f459aa24fc..ef02e8202fc829 100644 --- a/Lib/test/test_typing.py +++ b/Lib/test/test_typing.py @@ -46,11 +46,10 @@ import textwrap import typing import weakref -import warnings import types from test.support import ( - captured_stderr, cpython_only, infinite_recursion, requires_docstrings, import_helper, run_code, + captured_stderr, cpython_only, requires_docstrings, import_helper, run_code, EqualToForwardRef, ) from test.typinganndata import ( diff --git a/Lib/test/test_venv.py b/Lib/test/test_venv.py index 12c30e178aeb51..d62f3fba2d1a94 100644 --- a/Lib/test/test_venv.py +++ b/Lib/test/test_venv.py @@ -1008,7 +1008,7 @@ def do_test_with_pip(self, system_site_packages): err, flags=re.MULTILINE) # Ignore warning about missing optional module: try: - import ssl + import ssl # noqa: F401 except ImportError: err = re.sub( "^WARNING: Disabling truststore since ssl support is missing$", diff --git a/Lib/test/test_webbrowser.py b/Lib/test/test_webbrowser.py index 4c3ea1cd8df13e..6b577ae100e419 100644 --- a/Lib/test/test_webbrowser.py +++ b/Lib/test/test_webbrowser.py @@ -6,7 +6,6 @@ import sys import unittest import webbrowser -from functools import partial from test import support from test.support import import_helper from test.support import is_apple_mobile diff --git a/Lib/test/test_zipfile/__main__.py b/Lib/test/test_zipfile/__main__.py index e25ac946edffe4..90da74ade38c69 100644 --- a/Lib/test/test_zipfile/__main__.py +++ b/Lib/test/test_zipfile/__main__.py @@ -1,6 +1,6 @@ import unittest -from . import load_tests # noqa: F401 +from . import load_tests if __name__ == "__main__": diff --git a/Lib/test/test_zstd.py b/Lib/test/test_zstd.py index 34c7c721b1ad32..bc809603cbc629 100644 --- a/Lib/test/test_zstd.py +++ b/Lib/test/test_zstd.py @@ -12,7 +12,6 @@ from test.support.import_helper import import_module from test.support import threading_helper from test.support import _1M -from test.support import Py_GIL_DISABLED _zstd = import_module("_zstd") zstd = import_module("compression.zstd") From 42b25ad4d3d6bcdc28ddfe07d2bf8831378bb0d1 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 25 May 2025 21:19:29 +0100 Subject: [PATCH 7/8] gh-91048: Refactor and optimize remote debugging module (#134652) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completely refactor Modules/_remote_debugging_module.c with improved code organization, replacing scattered reference counting and error handling with centralized goto error paths. This cleanup improves maintainability and reduces code duplication throughout the module while preserving the same external API. Implement memory page caching optimization in Python/remote_debug.h to avoid repeated reads of the same memory regions during debugging operations. The cache stores previously read memory pages and reuses them for subsequent reads, significantly reducing system calls and improving performance. Add code object caching mechanism with a new code_object_generation field in the interpreter state that tracks when code object caches need invalidation. This allows efficient reuse of parsed code object metadata and eliminates redundant processing of the same code objects across debugging sessions. 
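A minimal sketch of the generation-based invalidation pattern described above, added here for illustration only and not part of this change. The class and parameter names below are invented; the actual implementation in Modules/_remote_debugging_module.c keeps a _Py_hashtable_t of CachedCodeMetadata entries keyed by the remote code object address, together with the uint64_t generation counter read from the remote interpreter state.

    # Illustrative sketch (invented names), not the module's real code.
    class CodeObjectCache:
        """Cache of parsed code-object metadata, flushed when the target's
        code_object_generation counter changes."""

        def __init__(self):
            self.generation = None   # generation last observed in the target
            self.entries = {}        # remote address -> parsed metadata

        def lookup(self, remote_generation, address, parse):
            # If the target created or destroyed code objects since the last
            # read, every cached entry may be stale: drop them all at once.
            if remote_generation != self.generation:
                self.entries.clear()
                self.generation = remote_generation
            meta = self.entries.get(address)
            if meta is None:
                meta = parse(address)          # the expensive remote reads
                self.entries[address] = meta
            return meta

    # Usage: a repeated lookup with an unchanged generation is served from
    # the cache; a generation bump forces re-parsing.
    cache = CodeObjectCache()
    first = cache.lookup(1, 0x1000, parse=lambda addr: {"addr": addr})
    again = cache.lookup(1, 0x1000, parse=lambda addr: {"addr": addr})
    assert first is again            # cache hit
    fresh = cache.lookup(2, 0x1000, parse=lambda addr: {"addr": addr})
    assert fresh is not first        # generation change flushed the cache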
Optimize memory operations by replacing multiple individual structure copies with single bulk reads for the same data structures. This reduces the number of memory operations and system calls required to gather debugging information from the target process. Update Makefile.pre.in to include Python/remote_debug.h in the headers list, ensuring that changes to the remote debugging header force proper recompilation of dependent modules and maintain build consistency across the codebase. Also, make the module compatible with the free threading build as an extra :) Co-authored-by: Łukasz Langa --- Include/cpython/pystate.h | 2 + Include/internal/pycore_debug_offsets.h | 15 + .../pycore_global_objects_fini_generated.h | 1 + Include/internal/pycore_global_strings.h | 1 + Include/internal/pycore_interp_structs.h | 6 + .../internal/pycore_runtime_init_generated.h | 1 + .../internal/pycore_unicodeobject_generated.h | 4 + Lib/asyncio/tools.py | 8 +- Lib/test/test_external_inspection.py | 82 +- Makefile.pre.in | 1 + Modules/_remote_debugging_module.c | 2951 +++++++++++------ Modules/clinic/_remote_debugging_module.c.h | 243 ++ Objects/codeobject.c | 2 + Python/index_pool.c | 4 + Python/pystate.c | 10 +- Python/remote_debug.h | 117 +- 16 files changed, 2390 insertions(+), 1058 deletions(-) create mode 100644 Modules/clinic/_remote_debugging_module.c.h diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index 7f1bc363861ddf..54d7e62292966e 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -61,6 +61,8 @@ typedef struct _stack_chunk { PyObject * data[1]; /* Variable sized */ } _PyStackChunk; +/* Minimum size of data stack chunk */ +#define _PY_DATA_STACK_CHUNK_SIZE (16*1024) struct _ts { /* See Python/ceval.c for comments explaining most fields */ diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index 1a265c59ff8c08..ce3fcb109f49f7 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -54,11 +54,13 @@ extern "C" { # define _Py_Debug_Free_Threaded 1 # define _Py_Debug_code_object_co_tlbc offsetof(PyCodeObject, co_tlbc) # define _Py_Debug_interpreter_frame_tlbc_index offsetof(_PyInterpreterFrame, tlbc_index) +# define _Py_Debug_interpreter_state_tlbc_generation offsetof(PyInterpreterState, tlbc_indices.tlbc_generation) #else # define _Py_Debug_gilruntimestate_enabled 0 # define _Py_Debug_Free_Threaded 0 # define _Py_Debug_code_object_co_tlbc 0 # define _Py_Debug_interpreter_frame_tlbc_index 0 +# define _Py_Debug_interpreter_state_tlbc_generation 0 #endif @@ -89,6 +91,8 @@ typedef struct _Py_DebugOffsets { uint64_t gil_runtime_state_enabled; uint64_t gil_runtime_state_locked; uint64_t gil_runtime_state_holder; + uint64_t code_object_generation; + uint64_t tlbc_generation; } interpreter_state; // Thread state offset; @@ -216,6 +220,11 @@ typedef struct _Py_DebugOffsets { uint64_t gi_frame_state; } gen_object; + struct _llist_node { + uint64_t next; + uint64_t prev; + } llist_node; + struct _debugger_support { uint64_t eval_breaker; uint64_t remote_debugger_support; @@ -251,6 +260,8 @@ typedef struct _Py_DebugOffsets { .gil_runtime_state_enabled = _Py_Debug_gilruntimestate_enabled, \ .gil_runtime_state_locked = offsetof(PyInterpreterState, _gil.locked), \ .gil_runtime_state_holder = offsetof(PyInterpreterState, _gil.last_holder), \ + .code_object_generation = offsetof(PyInterpreterState, _code_object_generation), \ + .tlbc_generation = _Py_Debug_interpreter_state_tlbc_generation, 
\ }, \ .thread_state = { \ .size = sizeof(PyThreadState), \ @@ -347,6 +358,10 @@ typedef struct _Py_DebugOffsets { .gi_iframe = offsetof(PyGenObject, gi_iframe), \ .gi_frame_state = offsetof(PyGenObject, gi_frame_state), \ }, \ + .llist_node = { \ + .next = offsetof(struct llist_node, next), \ + .prev = offsetof(struct llist_node, prev), \ + }, \ .debugger_support = { \ .eval_breaker = offsetof(PyThreadState, eval_breaker), \ .remote_debugger_support = offsetof(PyThreadState, remote_debugger_support), \ diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index d896e870630418..356bcaa7c350a1 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -795,6 +795,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(alias)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(align)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all_threads)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(allow_code)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(any)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(append)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index a06d7495bab8e7..aebe798031ce4f 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -286,6 +286,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(alias) STRUCT_FOR_ID(align) STRUCT_FOR_ID(all) + STRUCT_FOR_ID(all_threads) STRUCT_FOR_ID(allow_code) STRUCT_FOR_ID(any) STRUCT_FOR_ID(append) diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index c3e6c77405bfe7..8a29c533b99058 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -726,6 +726,10 @@ typedef struct _PyIndexPool { // Next index to allocate if no free indices are available int32_t next_index; + + // Generation counter incremented on thread creation/destruction + // Used for TLBC cache invalidation in remote debugging + uint32_t tlbc_generation; } _PyIndexPool; typedef union _Py_unique_id_entry { @@ -843,6 +847,8 @@ struct _is { /* The per-interpreter GIL, which might not be used. 
*/ struct _gil_runtime_state _gil; + uint64_t _code_object_generation; + /* ---------- IMPORTANT --------------------------- The fields above this line are declared as early as possible to facilitate out-of-process observability diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index 83301d8aef7697..0fa1fa5af99a92 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -793,6 +793,7 @@ extern "C" { INIT_ID(alias), \ INIT_ID(align), \ INIT_ID(all), \ + INIT_ID(all_threads), \ INIT_ID(allow_code), \ INIT_ID(any), \ INIT_ID(append), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index c0f5f2b17f6609..4982c4532afd89 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -932,6 +932,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(all_threads); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(allow_code); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/Lib/asyncio/tools.py b/Lib/asyncio/tools.py index b2da7d2f6ba10c..3fc4524c008db6 100644 --- a/Lib/asyncio/tools.py +++ b/Lib/asyncio/tools.py @@ -1,11 +1,10 @@ """Tools to analyze tasks running in asyncio programs.""" -from dataclasses import dataclass from collections import defaultdict from itertools import count from enum import Enum import sys -from _remote_debugging import get_all_awaited_by +from _remote_debugging import RemoteUnwinder class NodeType(Enum): @@ -118,6 +117,11 @@ def dfs(v): # ─── PRINT TREE FUNCTION ─────────────────────────────────────── +def get_all_awaited_by(pid): + unwinder = RemoteUnwinder(pid) + return unwinder.get_all_awaited_by() + + def build_async_tree(result, task_emoji="(T)", cor_emoji=""): """ Build a list of strings for pretty-print an async call tree. 
diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index ad3f669a03043e..291c419066ac5b 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -4,6 +4,7 @@ import importlib import sys import socket +import threading from asyncio import staggered, taskgroups from unittest.mock import ANY from test.support import os_helper, SHORT_TIMEOUT, busy_retry @@ -16,9 +17,7 @@ try: from _remote_debugging import PROCESS_VM_READV_SUPPORTED - from _remote_debugging import get_stack_trace - from _remote_debugging import get_async_stack_trace - from _remote_debugging import get_all_awaited_by + from _remote_debugging import RemoteUnwinder except ImportError: raise unittest.SkipTest("Test only runs when _remote_debugging is available") @@ -34,7 +33,23 @@ def _make_test_script(script_dir, script_basename, source): ) +def get_stack_trace(pid): + unwinder = RemoteUnwinder(pid, all_threads=True) + return unwinder.get_stack_trace() + + +def get_async_stack_trace(pid): + unwinder = RemoteUnwinder(pid) + return unwinder.get_async_stack_trace() + + +def get_all_awaited_by(pid): + unwinder = RemoteUnwinder(pid) + return unwinder.get_all_awaited_by() + + class TestGetStackTrace(unittest.TestCase): + maxDiff = None @skip_if_not_supported @unittest.skipIf( @@ -46,7 +61,7 @@ def test_remote_stack_trace(self): port = find_unused_port() script = textwrap.dedent( f"""\ - import time, sys, socket + import time, sys, socket, threading # Connect to the test process sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) @@ -55,13 +70,16 @@ def bar(): for x in range(100): if x == 50: baz() + def baz(): foo() def foo(): - sock.sendall(b"ready"); time.sleep(10_000) # same line number + sock.sendall(b"ready:thread\\n"); time.sleep(10_000) # same line number - bar() + t = threading.Thread(target=bar) + t.start() + sock.sendall(b"ready:main\\n"); t.join() # same line number """ ) stack_trace = None @@ -82,8 +100,9 @@ def foo(): p = subprocess.Popen([sys.executable, script_name]) client_socket, _ = server_socket.accept() server_socket.close() - response = client_socket.recv(1024) - self.assertEqual(response, b"ready") + response = b"" + while b"ready:main" not in response or b"ready:thread" not in response: + response += client_socket.recv(1024) stack_trace = get_stack_trace(p.pid) except PermissionError: self.skipTest("Insufficient permissions to read the stack trace") @@ -94,13 +113,23 @@ def foo(): p.terminate() p.wait(timeout=SHORT_TIMEOUT) - expected_stack_trace = [ - ("foo", script_name, 14), - ("baz", script_name, 11), + thread_expected_stack_trace = [ + ("foo", script_name, 15), + ("baz", script_name, 12), ("bar", script_name, 9), - ("", script_name, 16), + ('Thread.run', threading.__file__, ANY) ] - self.assertEqual(stack_trace, expected_stack_trace) + # Is possible that there are more threads, so we check that the + # expected stack traces are in the result (looking at you Windows!) 
+ self.assertIn((ANY, thread_expected_stack_trace), stack_trace) + + # Check that the main thread stack trace is in the result + frame = ("", script_name, 19) + for _, stack in stack_trace: + if frame in stack: + break + else: + self.fail("Main thread stack trace not found in result") @skip_if_not_supported @unittest.skipIf( @@ -700,13 +729,28 @@ async def main(): ) def test_self_trace(self): stack_trace = get_stack_trace(os.getpid()) + # Is possible that there are more threads, so we check that the + # expected stack traces are in the result (looking at you Windows!) + this_tread_stack = None + for thread_id, stack in stack_trace: + if thread_id == threading.get_native_id(): + this_tread_stack = stack + break + self.assertIsNotNone(this_tread_stack) self.assertEqual( - stack_trace[0], - ( - "TestGetStackTrace.test_self_trace", - __file__, - self.test_self_trace.__code__.co_firstlineno + 6, - ), + stack[:2], + [ + ( + "get_stack_trace", + __file__, + get_stack_trace.__code__.co_firstlineno + 2, + ), + ( + "TestGetStackTrace.test_self_trace", + __file__, + self.test_self_trace.__code__.co_firstlineno + 6, + ), + ] ) diff --git a/Makefile.pre.in b/Makefile.pre.in index 3ab7c3d6c48ad9..b5703fbe6ae974 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1206,6 +1206,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/unicodeobject.h \ $(srcdir)/Include/warnings.h \ $(srcdir)/Include/weakrefobject.h \ + $(srcdir)/Python/remote_debug.h \ \ pyconfig.h \ $(PARSER_HEADERS) \ diff --git a/Modules/_remote_debugging_module.c b/Modules/_remote_debugging_module.c index 8c0f40f835c36e..a13cbd63ad3bd8 100644 --- a/Modules/_remote_debugging_module.c +++ b/Modules/_remote_debugging_module.c @@ -1,5 +1,16 @@ +/****************************************************************************** + * Python Remote Debugging Module + * + * This module provides functionality to debug Python processes remotely by + * reading their memory and reconstructing stack traces and asyncio task states. + ******************************************************************************/ + #define _GNU_SOURCE +/* ============================================================================ + * HEADERS AND INCLUDES + * ============================================================================ */ + #include #include #include @@ -23,6 +34,47 @@ # define HAVE_PROCESS_VM_READV 0 #endif +/* ============================================================================ + * TYPE DEFINITIONS AND STRUCTURES + * ============================================================================ */ + +#define GET_MEMBER(type, obj, offset) (*(type*)((char*)(obj) + (offset))) + +/* Size macros for opaque buffers */ +#define SIZEOF_BYTES_OBJ sizeof(PyBytesObject) +#define SIZEOF_CODE_OBJ sizeof(PyCodeObject) +#define SIZEOF_GEN_OBJ sizeof(PyGenObject) +#define SIZEOF_INTERP_FRAME sizeof(_PyInterpreterFrame) +#define SIZEOF_LLIST_NODE sizeof(struct llist_node) +#define SIZEOF_PAGE_CACHE_ENTRY sizeof(page_cache_entry_t) +#define SIZEOF_PYOBJECT sizeof(PyObject) +#define SIZEOF_SET_OBJ sizeof(PySetObject) +#define SIZEOF_TASK_OBJ 4096 +#define SIZEOF_THREAD_STATE sizeof(PyThreadState) +#define SIZEOF_TYPE_OBJ sizeof(PyTypeObject) +#define SIZEOF_UNICODE_OBJ sizeof(PyUnicodeObject) +#define SIZEOF_LONG_OBJ sizeof(PyLongObject) + +// Calculate the minimum buffer size needed to read interpreter state fields +// We need to read code_object_generation and potentially tlbc_generation +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#endif + +#ifdef Py_GIL_DISABLED +#define INTERP_STATE_MIN_SIZE MAX(MAX(offsetof(PyInterpreterState, _code_object_generation) + sizeof(uint64_t), \ + offsetof(PyInterpreterState, tlbc_indices.tlbc_generation) + sizeof(uint32_t)), \ + offsetof(PyInterpreterState, threads.head) + sizeof(void*)) +#else +#define INTERP_STATE_MIN_SIZE MAX(offsetof(PyInterpreterState, _code_object_generation) + sizeof(uint64_t), \ + offsetof(PyInterpreterState, threads.head) + sizeof(void*)) +#endif +#define INTERP_STATE_BUFFER_SIZE MAX(INTERP_STATE_MIN_SIZE, 256) + + + +// Copied from Modules/_asynciomodule.c because it's not exported + struct _Py_AsyncioModuleDebugOffsets { struct _asyncio_task_object { uint64_t size; @@ -45,6 +97,127 @@ struct _Py_AsyncioModuleDebugOffsets { } asyncio_thread_state; }; +typedef struct { + PyObject_HEAD + proc_handle_t handle; + uintptr_t runtime_start_address; + struct _Py_DebugOffsets debug_offsets; + int async_debug_offsets_available; + struct _Py_AsyncioModuleDebugOffsets async_debug_offsets; + uintptr_t interpreter_addr; + uintptr_t tstate_addr; + uint64_t code_object_generation; + _Py_hashtable_t *code_object_cache; +#ifdef Py_GIL_DISABLED + // TLBC cache invalidation tracking + uint32_t tlbc_generation; // Track TLBC index pool changes + _Py_hashtable_t *tlbc_cache; // Cache of TLBC arrays by code object address +#endif +} RemoteUnwinderObject; + +typedef struct { + PyObject *func_name; + PyObject *file_name; + int first_lineno; + PyObject *linetable; // bytes + uintptr_t addr_code_adaptive; +} CachedCodeMetadata; + +typedef struct { + /* Types */ + PyTypeObject *RemoteDebugging_Type; +} RemoteDebuggingState; + +typedef struct +{ + int lineno; + int end_lineno; + int column; + int end_column; +} LocationInfo; + +typedef struct { + uintptr_t remote_addr; + size_t size; + void *local_copy; +} StackChunkInfo; + +typedef struct { + StackChunkInfo *chunks; + size_t count; +} StackChunkList; + +#include "clinic/_remote_debugging_module.c.h" + +/*[clinic input] +module _remote_debugging +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=5f507d5b2e76a7f7]*/ + + +/* ============================================================================ + * FORWARD DECLARATIONS + * ============================================================================ */ + +static int +parse_tasks_in_set( + RemoteUnwinderObject *unwinder, + uintptr_t set_addr, + PyObject *awaited_by, + int recurse_task +); + +static int +parse_task( + RemoteUnwinderObject *unwinder, + uintptr_t task_address, + PyObject *render_to, + int recurse_task +); + +static int +parse_coro_chain( + RemoteUnwinderObject *unwinder, + uintptr_t coro_address, + PyObject *render_to +); + +/* Forward declarations for task parsing functions */ +static int parse_frame_object( + RemoteUnwinderObject *unwinder, + PyObject** result, + uintptr_t address, + uintptr_t* previous_frame +); + +/* ============================================================================ + * UTILITY FUNCTIONS AND HELPERS + * ============================================================================ */ + +static void +cached_code_metadata_destroy(void *ptr) +{ + CachedCodeMetadata *meta = (CachedCodeMetadata *)ptr; + Py_DECREF(meta->func_name); + Py_DECREF(meta->file_name); + Py_DECREF(meta->linetable); + PyMem_RawFree(meta); +} + +static inline RemoteDebuggingState * +RemoteDebugging_GetState(PyObject *module) +{ + void *state = _PyModule_GetState(module); + assert(state != NULL); + return 
(RemoteDebuggingState *)state; +} + +static inline int +RemoteDebugging_InitState(RemoteDebuggingState *st) +{ + return 0; +} + // Helper to chain exceptions and avoid repetitions static void chain_exceptions(PyObject *exception, const char *string) @@ -54,36 +227,14 @@ chain_exceptions(PyObject *exception, const char *string) _PyErr_ChainExceptions1(exc); } -// Get the PyAsyncioDebug section address for any platform -static uintptr_t -_Py_RemoteDebug_GetAsyncioDebugAddress(proc_handle_t* handle) -{ - uintptr_t address; - -#ifdef MS_WINDOWS - // On Windows, search for asyncio debug in executable or DLL - address = search_windows_map_for_section(handle, "AsyncioD", L"_asyncio"); -#elif defined(__linux__) - // On Linux, search for asyncio debug in executable or DLL - address = search_linux_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); -#elif defined(__APPLE__) && TARGET_OS_OSX - // On macOS, try libpython first, then fall back to python - address = search_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); - if (address == 0) { - PyErr_Clear(); - address = search_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); - } -#else - Py_UNREACHABLE(); -#endif - - return address; -} +/* ============================================================================ + * MEMORY READING FUNCTIONS + * ============================================================================ */ static inline int read_ptr(proc_handle_t *handle, uintptr_t address, uintptr_t *ptr_addr) { - int result = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(void*), ptr_addr); + int result = _Py_RemoteDebug_PagedReadRemoteMemory(handle, address, sizeof(void*), ptr_addr); if (result < 0) { return -1; } @@ -93,7 +244,7 @@ read_ptr(proc_handle_t *handle, uintptr_t address, uintptr_t *ptr_addr) static inline int read_Py_ssize_t(proc_handle_t *handle, uintptr_t address, Py_ssize_t *size) { - int result = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(Py_ssize_t), size); + int result = _Py_RemoteDebug_PagedReadRemoteMemory(handle, address, sizeof(Py_ssize_t), size); if (result < 0) { return -1; } @@ -113,72 +264,53 @@ read_py_ptr(proc_handle_t *handle, uintptr_t address, uintptr_t *ptr_addr) static int read_char(proc_handle_t *handle, uintptr_t address, char *result) { - int res = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(char), result); - if (res < 0) { - return -1; - } - return 0; -} - -static int -read_sized_int(proc_handle_t *handle, uintptr_t address, void *result, size_t size) -{ - int res = _Py_RemoteDebug_ReadRemoteMemory(handle, address, size, result); - if (res < 0) { - return -1; - } - return 0; -} - -static int -read_unsigned_long(proc_handle_t *handle, uintptr_t address, unsigned long *result) -{ - int res = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(unsigned long), result); + int res = _Py_RemoteDebug_PagedReadRemoteMemory(handle, address, sizeof(char), result); if (res < 0) { return -1; } return 0; } -static int -read_pyobj(proc_handle_t *handle, uintptr_t address, PyObject *ptr_addr) -{ - int res = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(PyObject), ptr_addr); - if (res < 0) { - return -1; - } - return 0; -} +/* ============================================================================ + * PYTHON OBJECT READING FUNCTIONS + * ============================================================================ */ static PyObject * read_py_str( - proc_handle_t *handle, - _Py_DebugOffsets* debug_offsets, + RemoteUnwinderObject *unwinder, 
uintptr_t address, Py_ssize_t max_len ) { PyObject *result = NULL; char *buf = NULL; - Py_ssize_t len; - int res = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + debug_offsets->unicode_object.length, - sizeof(Py_ssize_t), - &len + // Read the entire PyUnicodeObject at once + char unicode_obj[SIZEOF_UNICODE_OBJ]; + int res = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_UNICODE_OBJ, + unicode_obj ); if (res < 0) { goto err; } + Py_ssize_t len = GET_MEMBER(Py_ssize_t, unicode_obj, unwinder->debug_offsets.unicode_object.length); + if (len < 0 || len > max_len) { + PyErr_Format(PyExc_RuntimeError, + "Invalid string length (%zd) at 0x%lx", len, address); + return NULL; + } + buf = (char *)PyMem_RawMalloc(len+1); if (buf == NULL) { PyErr_NoMemory(); return NULL; } - size_t offset = debug_offsets->unicode_object.asciiobject_size; - res = _Py_RemoteDebug_ReadRemoteMemory(handle, address + offset, len, buf); + size_t offset = unwinder->debug_offsets.unicode_object.asciiobject_size; + res = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, address + offset, len, buf); if (res < 0) { goto err; } @@ -202,32 +334,40 @@ read_py_str( static PyObject * read_py_bytes( - proc_handle_t *handle, - _Py_DebugOffsets* debug_offsets, - uintptr_t address + RemoteUnwinderObject *unwinder, + uintptr_t address, + Py_ssize_t max_len ) { PyObject *result = NULL; char *buf = NULL; - Py_ssize_t len; - int res = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + debug_offsets->bytes_object.ob_size, - sizeof(Py_ssize_t), - &len + // Read the entire PyBytesObject at once + char bytes_obj[SIZEOF_BYTES_OBJ]; + int res = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_BYTES_OBJ, + bytes_obj ); if (res < 0) { goto err; } + Py_ssize_t len = GET_MEMBER(Py_ssize_t, bytes_obj, unwinder->debug_offsets.bytes_object.ob_size); + if (len < 0 || len > max_len) { + PyErr_Format(PyExc_RuntimeError, + "Invalid string length (%zd) at 0x%lx", len, address); + return NULL; + } + buf = (char *)PyMem_RawMalloc(len+1); if (buf == NULL) { PyErr_NoMemory(); return NULL; } - size_t offset = debug_offsets->bytes_object.ob_sval; - res = _Py_RemoteDebug_ReadRemoteMemory(handle, address + offset, len, buf); + size_t offset = unwinder->debug_offsets.bytes_object.ob_sval; + res = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, address + offset, len, buf); if (res < 0) { goto err; } @@ -249,45 +389,60 @@ read_py_bytes( return NULL; } - - static long -read_py_long(proc_handle_t *handle, _Py_DebugOffsets* offsets, uintptr_t address) +read_py_long( + RemoteUnwinderObject *unwinder, + uintptr_t address +) { unsigned int shift = PYLONG_BITS_IN_DIGIT; - Py_ssize_t size; - uintptr_t lv_tag; - - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, address + offsets->long_object.lv_tag, - sizeof(uintptr_t), - &lv_tag); + // Read the entire PyLongObject at once + char long_obj[SIZEOF_LONG_OBJ]; + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + unwinder->debug_offsets.long_object.size, + long_obj); if (bytes_read < 0) { return -1; } + uintptr_t lv_tag = GET_MEMBER(uintptr_t, long_obj, unwinder->debug_offsets.long_object.lv_tag); int negative = (lv_tag & 3) == 2; - size = lv_tag >> 3; + Py_ssize_t size = lv_tag >> 3; if (size == 0) { return 0; } - digit *digits = (digit *)PyMem_RawMalloc(size * sizeof(digit)); - if (!digits) { - PyErr_NoMemory(); - return -1; - } + // If the long object has inline digits, use them directly + 
digit *digits; + if (size <= _PY_NSMALLNEGINTS + _PY_NSMALLPOSINTS) { + // For small integers, digits are inline in the long_value.ob_digit array + digits = (digit *)PyMem_RawMalloc(size * sizeof(digit)); + if (!digits) { + PyErr_NoMemory(); + return -1; + } + memcpy(digits, long_obj + unwinder->debug_offsets.long_object.ob_digit, size * sizeof(digit)); + } else { + // For larger integers, we need to read the digits separately + digits = (digit *)PyMem_RawMalloc(size * sizeof(digit)); + if (!digits) { + PyErr_NoMemory(); + return -1; + } - bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + offsets->long_object.ob_digit, - sizeof(digit) * size, - digits - ); - if (bytes_read < 0) { - goto error; + bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address + unwinder->debug_offsets.long_object.ob_digit, + sizeof(digit) * size, + digits + ); + if (bytes_read < 0) { + goto error; + } } long long value = 0; @@ -310,44 +465,115 @@ read_py_long(proc_handle_t *handle, _Py_DebugOffsets* offsets, uintptr_t address return -1; } +/* ============================================================================ + * ASYNCIO DEBUG FUNCTIONS + * ============================================================================ */ + +// Get the PyAsyncioDebug section address for any platform +static uintptr_t +_Py_RemoteDebug_GetAsyncioDebugAddress(proc_handle_t* handle) +{ + uintptr_t address; + +#ifdef MS_WINDOWS + // On Windows, search for asyncio debug in executable or DLL + address = search_windows_map_for_section(handle, "AsyncioD", L"_asyncio"); + if (address == 0) { + // Error out: 'python' substring covers both executable and DLL + PyObject *exc = PyErr_GetRaisedException(); + PyErr_SetString(PyExc_RuntimeError, "Failed to find the AsyncioDebug section in the process."); + _PyErr_ChainExceptions1(exc); + } +#elif defined(__linux__) + // On Linux, search for asyncio debug in executable or DLL + address = search_linux_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); + if (address == 0) { + // Error out: 'python' substring covers both executable and DLL + PyObject *exc = PyErr_GetRaisedException(); + PyErr_SetString(PyExc_RuntimeError, "Failed to find the AsyncioDebug section in the process."); + _PyErr_ChainExceptions1(exc); + } +#elif defined(__APPLE__) && TARGET_OS_OSX + // On macOS, try libpython first, then fall back to python + address = search_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); + if (address == 0) { + PyErr_Clear(); + address = search_map_for_section(handle, "AsyncioDebug", "_asyncio.cpython"); + } + if (address == 0) { + // Error out: 'python' substring covers both executable and DLL + PyObject *exc = PyErr_GetRaisedException(); + PyErr_SetString(PyExc_RuntimeError, "Failed to find the AsyncioDebug section in the process."); + _PyErr_ChainExceptions1(exc); + } +#else + Py_UNREACHABLE(); +#endif + + return address; +} + +static int +read_async_debug( + RemoteUnwinderObject *unwinder +) { + uintptr_t async_debug_addr = _Py_RemoteDebug_GetAsyncioDebugAddress(&unwinder->handle); + if (!async_debug_addr) { + return -1; + } + + size_t size = sizeof(struct _Py_AsyncioModuleDebugOffsets); + int result = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, async_debug_addr, size, &unwinder->async_debug_offsets); + return result; +} + +/* ============================================================================ + * ASYNCIO TASK PARSING FUNCTIONS + * ============================================================================ 
*/ + static PyObject * parse_task_name( - proc_handle_t *handle, - _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, + RemoteUnwinderObject *unwinder, uintptr_t task_address ) { - uintptr_t task_name_addr; - int err = read_py_ptr( - handle, - task_address + async_offsets->asyncio_task_object.task_name, - &task_name_addr); - if (err) { + // Read the entire TaskObj at once + char task_obj[SIZEOF_TASK_OBJ]; + int err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + task_address, + unwinder->async_debug_offsets.asyncio_task_object.size, + task_obj); + if (err < 0) { return NULL; } - // The task name can be a long or a string so we need to check the type + uintptr_t task_name_addr = GET_MEMBER(uintptr_t, task_obj, unwinder->async_debug_offsets.asyncio_task_object.task_name); + task_name_addr &= ~Py_TAG_BITS; - PyObject task_name_obj; - err = read_pyobj( - handle, + // The task name can be a long or a string so we need to check the type + char task_name_obj[SIZEOF_PYOBJECT]; + err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, task_name_addr, - &task_name_obj); - if (err) { + SIZEOF_PYOBJECT, + task_name_obj); + if (err < 0) { return NULL; } - unsigned long flags; - err = read_unsigned_long( - handle, - (uintptr_t)task_name_obj.ob_type + offsets->type_object.tp_flags, - &flags); - if (err) { + // Now read the type object to get the flags + char type_obj[SIZEOF_TYPE_OBJ]; + err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + GET_MEMBER(uintptr_t, task_name_obj, unwinder->debug_offsets.pyobject.ob_type), + SIZEOF_TYPE_OBJ, + type_obj); + if (err < 0) { return NULL; } - if ((flags & Py_TPFLAGS_LONG_SUBCLASS)) { - long res = read_py_long(handle, offsets, task_name_addr); + if ((GET_MEMBER(unsigned long, type_obj, unwinder->debug_offsets.type_object.tp_flags) & Py_TPFLAGS_LONG_SUBCLASS)) { + long res = read_py_long(unwinder, task_name_addr); if (res == -1) { chain_exceptions(PyExc_RuntimeError, "Failed to get task name"); return NULL; @@ -355,355 +581,375 @@ parse_task_name( return PyUnicode_FromFormat("Task-%d", res); } - if(!(flags & Py_TPFLAGS_UNICODE_SUBCLASS)) { + if(!(GET_MEMBER(unsigned long, type_obj, unwinder->debug_offsets.type_object.tp_flags) & Py_TPFLAGS_UNICODE_SUBCLASS)) { PyErr_SetString(PyExc_RuntimeError, "Invalid task name object"); return NULL; } return read_py_str( - handle, - offsets, + unwinder, task_name_addr, 255 ); } -static int -parse_frame_object( - proc_handle_t *handle, - PyObject** result, - struct _Py_DebugOffsets* offsets, - uintptr_t address, - uintptr_t* previous_frame -); - -static int -parse_coro_chain( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, - uintptr_t coro_address, - PyObject *render_to +static int parse_task_awaited_by( + RemoteUnwinderObject *unwinder, + uintptr_t task_address, + PyObject *awaited_by, + int recurse_task ) { - assert((void*)coro_address != NULL); - - uintptr_t gen_type_addr; - int err = read_ptr( - handle, - coro_address + offsets->pyobject.ob_type, - &gen_type_addr); - if (err) { + // Read the entire TaskObj at once + char task_obj[SIZEOF_TASK_OBJ]; + if (_Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, task_address, + unwinder->async_debug_offsets.asyncio_task_object.size, + task_obj) < 0) { return -1; } - PyObject* name = NULL; - uintptr_t prev_frame; - if (parse_frame_object( - handle, - &name, - offsets, - coro_address + offsets->gen_object.gi_iframe, - &prev_frame) - < 0) - { - 
return -1; - } + uintptr_t task_ab_addr = GET_MEMBER(uintptr_t, task_obj, unwinder->async_debug_offsets.asyncio_task_object.task_awaited_by); + task_ab_addr &= ~Py_TAG_BITS; - if (PyList_Append(render_to, name)) { - Py_DECREF(name); - return -1; + if ((void*)task_ab_addr == NULL) { + return 0; } - Py_DECREF(name); - int8_t gi_frame_state; - err = read_sized_int( - handle, - coro_address + offsets->gen_object.gi_frame_state, - &gi_frame_state, - sizeof(int8_t) - ); - if (err) { - return -1; - } + char awaited_by_is_a_set = GET_MEMBER(char, task_obj, unwinder->async_debug_offsets.asyncio_task_object.task_awaited_by_is_set); - if (gi_frame_state == FRAME_SUSPENDED_YIELD_FROM) { - char owner; - err = read_char( - handle, - coro_address + offsets->gen_object.gi_iframe + - offsets->interpreter_frame.owner, - &owner - ); - if (err) { + if (awaited_by_is_a_set) { + if (parse_tasks_in_set(unwinder, task_ab_addr, awaited_by, recurse_task)) { return -1; } - if (owner != FRAME_OWNED_BY_GENERATOR) { - PyErr_SetString( - PyExc_RuntimeError, - "generator doesn't own its frame \\_o_/"); + } else { + if (parse_task(unwinder, task_ab_addr, awaited_by, recurse_task)) { return -1; } + } + + return 0; +} + +static int +handle_yield_from_frame( + RemoteUnwinderObject *unwinder, + uintptr_t gi_iframe_addr, + uintptr_t gen_type_addr, + PyObject *render_to +) { + // Read the entire interpreter frame at once + char iframe[SIZEOF_INTERP_FRAME]; + int err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + gi_iframe_addr, + SIZEOF_INTERP_FRAME, + iframe); + if (err < 0) { + return -1; + } + + if (GET_MEMBER(char, iframe, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_GENERATOR) { + PyErr_SetString( + PyExc_RuntimeError, + "generator doesn't own its frame \\_o_/"); + return -1; + } + + uintptr_t stackpointer_addr = GET_MEMBER(uintptr_t, iframe, unwinder->debug_offsets.interpreter_frame.stackpointer); + stackpointer_addr &= ~Py_TAG_BITS; - uintptr_t stackpointer_addr; + if ((void*)stackpointer_addr != NULL) { + uintptr_t gi_await_addr; err = read_py_ptr( - handle, - coro_address + offsets->gen_object.gi_iframe + - offsets->interpreter_frame.stackpointer, - &stackpointer_addr); + &unwinder->handle, + stackpointer_addr - sizeof(void*), + &gi_await_addr); if (err) { return -1; } - if ((void*)stackpointer_addr != NULL) { - uintptr_t gi_await_addr; - err = read_py_ptr( - handle, - stackpointer_addr - sizeof(void*), - &gi_await_addr); + if ((void*)gi_await_addr != NULL) { + uintptr_t gi_await_addr_type_addr; + err = read_ptr( + &unwinder->handle, + gi_await_addr + unwinder->debug_offsets.pyobject.ob_type, + &gi_await_addr_type_addr); if (err) { return -1; } - if ((void*)gi_await_addr != NULL) { - uintptr_t gi_await_addr_type_addr; - int err = read_ptr( - handle, - gi_await_addr + offsets->pyobject.ob_type, - &gi_await_addr_type_addr); + if (gen_type_addr == gi_await_addr_type_addr) { + /* This needs an explanation. We always start with parsing + native coroutine / generator frames. Ultimately they + are awaiting on something. That something can be + a native coroutine frame or... an iterator. + If it's the latter -- we can't continue building + our chain. So the condition to bail out of this is + to do that when the type of the current coroutine + doesn't match the type of whatever it points to + in its cr_await. + */ + err = parse_coro_chain(unwinder, gi_await_addr, render_to); if (err) { return -1; } - - if (gen_type_addr == gi_await_addr_type_addr) { - /* This needs an explanation. 
We always start with parsing - native coroutine / generator frames. Ultimately they - are awaiting on something. That something can be - a native coroutine frame or... an iterator. - If it's the latter -- we can't continue building - our chain. So the condition to bail out of this is - to do that when the type of the current coroutine - doesn't match the type of whatever it points to - in its cr_await. - */ - err = parse_coro_chain( - handle, - offsets, - async_offsets, - gi_await_addr, - render_to - ); - if (err) { - return -1; - } - } } } - } return 0; } - static int -parse_task_awaited_by( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, - uintptr_t task_address, - PyObject *awaited_by, - int recurse_task -); +parse_coro_chain( + RemoteUnwinderObject *unwinder, + uintptr_t coro_address, + PyObject *render_to +) { + assert((void*)coro_address != NULL); + + // Read the entire generator object at once + char gen_object[SIZEOF_GEN_OBJ]; + int err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + coro_address, + SIZEOF_GEN_OBJ, + gen_object); + if (err < 0) { + return -1; + } + uintptr_t gen_type_addr = GET_MEMBER(uintptr_t, gen_object, unwinder->debug_offsets.pyobject.ob_type); -static int -parse_task( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, + PyObject* name = NULL; + + // Parse the previous frame using the gi_iframe from local copy + uintptr_t prev_frame; + uintptr_t gi_iframe_addr = coro_address + unwinder->debug_offsets.gen_object.gi_iframe; + if (parse_frame_object(unwinder, &name, gi_iframe_addr, &prev_frame) < 0) { + return -1; + } + + if (PyList_Append(render_to, name)) { + Py_DECREF(name); + return -1; + } + Py_DECREF(name); + + if (GET_MEMBER(int8_t, gen_object, unwinder->debug_offsets.gen_object.gi_frame_state) == FRAME_SUSPENDED_YIELD_FROM) { + return handle_yield_from_frame(unwinder, gi_iframe_addr, gen_type_addr, render_to); + } + + return 0; +} + +static PyObject* +create_task_result( + RemoteUnwinderObject *unwinder, uintptr_t task_address, - PyObject *render_to, int recurse_task ) { - char is_task; - int err = read_char( - handle, - task_address + async_offsets->asyncio_task_object.task_is_task, - &is_task); - if (err) { - return -1; - } + PyObject* result = NULL; + PyObject *call_stack = NULL; + PyObject *tn = NULL; + char task_obj[SIZEOF_TASK_OBJ]; + uintptr_t coro_addr; - PyObject* result = PyList_New(0); + result = PyList_New(0); if (result == NULL) { - return -1; + goto error; } - PyObject *call_stack = PyList_New(0); + call_stack = PyList_New(0); if (call_stack == NULL) { - goto err; + goto error; } + if (PyList_Append(result, call_stack)) { - Py_DECREF(call_stack); - goto err; + goto error; } - /* we can operate on a borrowed one to simplify cleanup */ - Py_DECREF(call_stack); + Py_CLEAR(call_stack); - if (is_task) { - PyObject *tn = NULL; - if (recurse_task) { - tn = parse_task_name( - handle, offsets, async_offsets, task_address); - } else { - tn = PyLong_FromUnsignedLongLong(task_address); + if (recurse_task) { + tn = parse_task_name(unwinder, task_address); + } else { + tn = PyLong_FromUnsignedLongLong(task_address); + } + if (tn == NULL) { + goto error; + } + + if (PyList_Append(result, tn)) { + goto error; + } + Py_CLEAR(tn); + + // Parse coroutine chain + if (_Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, task_address, + unwinder->async_debug_offsets.asyncio_task_object.size, + task_obj) < 0) { + 
goto error; + } + + coro_addr = GET_MEMBER(uintptr_t, task_obj, unwinder->async_debug_offsets.asyncio_task_object.task_coro); + coro_addr &= ~Py_TAG_BITS; + + if ((void*)coro_addr != NULL) { + call_stack = PyList_New(0); + if (call_stack == NULL) { + goto error; } - if (tn == NULL) { - goto err; + + if (parse_coro_chain(unwinder, coro_addr, call_stack) < 0) { + Py_DECREF(call_stack); + goto error; } - if (PyList_Append(result, tn)) { - Py_DECREF(tn); - goto err; + + if (PyList_Reverse(call_stack)) { + Py_DECREF(call_stack); + goto error; } - Py_DECREF(tn); - uintptr_t coro_addr; - err = read_py_ptr( - handle, - task_address + async_offsets->asyncio_task_object.task_coro, - &coro_addr); - if (err) { - goto err; + if (PyList_SetItem(result, 0, call_stack) < 0) { + Py_DECREF(call_stack); + goto error; } + } - if ((void*)coro_addr != NULL) { - err = parse_coro_chain( - handle, - offsets, - async_offsets, - coro_addr, - call_stack - ); - if (err) { - goto err; - } + return result; - if (PyList_Reverse(call_stack)) { - goto err; - } +error: + Py_XDECREF(result); + Py_XDECREF(call_stack); + Py_XDECREF(tn); + return NULL; +} + +static int +parse_task( + RemoteUnwinderObject *unwinder, + uintptr_t task_address, + PyObject *render_to, + int recurse_task +) { + char is_task; + PyObject* result = NULL; + PyObject* awaited_by = NULL; + int err; + + err = read_char( + &unwinder->handle, + task_address + unwinder->async_debug_offsets.asyncio_task_object.task_is_task, + &is_task); + if (err) { + goto error; + } + + if (is_task) { + result = create_task_result(unwinder, task_address, recurse_task); + if (!result) { + goto error; + } + } else { + result = PyList_New(0); + if (result == NULL) { + goto error; } } if (PyList_Append(render_to, result)) { - goto err; + goto error; } if (recurse_task) { - PyObject *awaited_by = PyList_New(0); + awaited_by = PyList_New(0); if (awaited_by == NULL) { - goto err; + goto error; } + if (PyList_Append(result, awaited_by)) { - Py_DECREF(awaited_by); - goto err; + goto error; } - /* we can operate on a borrowed one to simplify cleanup */ Py_DECREF(awaited_by); - if (parse_task_awaited_by(handle, offsets, async_offsets, - task_address, awaited_by, 1) - ) { - goto err; + /* awaited_by is borrowed from 'result' to simplify cleanup */ + if (parse_task_awaited_by(unwinder, task_address, awaited_by, 1) < 0) { + // Clear the pointer so the cleanup doesn't try to decref it since + // it's borrowed from 'result' and will be decrefed when result is + // deleted. 
+ awaited_by = NULL; + goto error; } } Py_DECREF(result); return 0; -err: - Py_DECREF(result); +error: + Py_XDECREF(result); + Py_XDECREF(awaited_by); return -1; } static int -parse_tasks_in_set( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, - uintptr_t set_addr, +process_set_entry( + RemoteUnwinderObject *unwinder, + uintptr_t table_ptr, PyObject *awaited_by, int recurse_task ) { - uintptr_t set_obj; - if (read_py_ptr( - handle, - set_addr, - &set_obj) - ) { + uintptr_t key_addr; + if (read_py_ptr(&unwinder->handle, table_ptr, &key_addr)) { return -1; } - Py_ssize_t num_els; - if (read_Py_ssize_t( - handle, - set_obj + offsets->set_object.used, - &num_els) - ) { - return -1; - } + if ((void*)key_addr != NULL) { + Py_ssize_t ref_cnt; + if (read_Py_ssize_t(&unwinder->handle, table_ptr, &ref_cnt)) { + return -1; + } - Py_ssize_t set_len; - if (read_Py_ssize_t( - handle, - set_obj + offsets->set_object.mask, - &set_len) - ) { - return -1; + if (ref_cnt) { + // if 'ref_cnt=0' it's a set dummy marker + if (parse_task(unwinder, key_addr, awaited_by, recurse_task)) { + return -1; + } + return 1; // Successfully processed a valid entry + } } - set_len++; // The set contains the `mask+1` element slots. + return 0; // Entry was NULL or dummy marker +} - uintptr_t table_ptr; - if (read_ptr( - handle, - set_obj + offsets->set_object.table, - &table_ptr) - ) { +static int +parse_tasks_in_set( + RemoteUnwinderObject *unwinder, + uintptr_t set_addr, + PyObject *awaited_by, + int recurse_task +) { + char set_object[SIZEOF_SET_OBJ]; + int err = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + set_addr, + SIZEOF_SET_OBJ, + set_object); + if (err < 0) { return -1; } + Py_ssize_t num_els = GET_MEMBER(Py_ssize_t, set_object, unwinder->debug_offsets.set_object.used); + Py_ssize_t set_len = GET_MEMBER(Py_ssize_t, set_object, unwinder->debug_offsets.set_object.mask) + 1; // The set contains the `mask+1` element slots. 
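The GET_MEMBER accesses used throughout this rewrite assume the whole remote object has already been copied into a local buffer (here `set_object`, filled just above by a single paged read). The macro's definition is not visible in this hunk; judging from its call sites it presumably amounts to an offset-based reinterpretation along these lines, shown only as an illustrative sketch and not as the module's actual definition:

/* Hypothetical sketch of the field-extraction helper: reinterpret `type`
 * at byte `offset` inside a locally copied buffer. The shape is inferred
 * from call sites such as GET_MEMBER(Py_ssize_t, set_object, ...). */
#define GET_MEMBER(type, buf, offset) \
    (*(type *)(((char *)(buf)) + (size_t)(offset)))

Copying the full structure once and slicing fields out of the local buffer is what lets the unwinder replace the many small _Py_RemoteDebug_ReadRemoteMemory calls of the old code with one paged read per object.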
+ uintptr_t table_ptr = GET_MEMBER(uintptr_t, set_object, unwinder->debug_offsets.set_object.table); + Py_ssize_t i = 0; Py_ssize_t els = 0; - while (i < set_len) { - uintptr_t key_addr; - if (read_py_ptr(handle, table_ptr, &key_addr)) { + while (i < set_len && els < num_els) { + int result = process_set_entry(unwinder, table_ptr, awaited_by, recurse_task); + + if (result < 0) { return -1; } - - if ((void*)key_addr != NULL) { - Py_ssize_t ref_cnt; - if (read_Py_ssize_t(handle, table_ptr, &ref_cnt)) { - return -1; - } - - if (ref_cnt) { - // if 'ref_cnt=0' it's a set dummy marker - - if (parse_task( - handle, - offsets, - async_offsets, - key_addr, - awaited_by, - recurse_task - ) - ) { - return -1; - } - - if (++els == num_els) { - break; - } - } + if (result > 0) { + els++; } table_ptr += sizeof(void*) * 2; @@ -714,81 +960,224 @@ parse_tasks_in_set( static int -parse_task_awaited_by( - proc_handle_t *handle, - struct _Py_DebugOffsets* offsets, - struct _Py_AsyncioModuleDebugOffsets* async_offsets, - uintptr_t task_address, - PyObject *awaited_by, - int recurse_task +setup_async_result_structure(PyObject **result, PyObject **calls) +{ + *result = PyList_New(1); + if (*result == NULL) { + return -1; + } + + *calls = PyList_New(0); + if (*calls == NULL) { + Py_DECREF(*result); + *result = NULL; + return -1; + } + + if (PyList_SetItem(*result, 0, *calls)) { /* steals ref to 'calls' */ + Py_DECREF(*calls); + Py_DECREF(*result); + *result = NULL; + *calls = NULL; + return -1; + } + + return 0; +} + +static int +add_task_info_to_result( + RemoteUnwinderObject *self, + PyObject *result, + uintptr_t running_task_addr ) { - uintptr_t task_ab_addr; - int err = read_py_ptr( - handle, - task_address + async_offsets->asyncio_task_object.task_awaited_by, - &task_ab_addr); - if (err) { + PyObject *tn = parse_task_name(self, running_task_addr); + if (tn == NULL) { return -1; } - if ((void*)task_ab_addr == NULL) { - return 0; + if (PyList_Append(result, tn)) { + Py_DECREF(tn); + return -1; } + Py_DECREF(tn); - char awaited_by_is_a_set; - err = read_char( - handle, - task_address + async_offsets->asyncio_task_object.task_awaited_by_is_set, - &awaited_by_is_a_set); - if (err) { + PyObject* awaited_by = PyList_New(0); + if (awaited_by == NULL) { return -1; } - if (awaited_by_is_a_set) { - if (parse_tasks_in_set( - handle, - offsets, - async_offsets, - task_address + async_offsets->asyncio_task_object.task_awaited_by, - awaited_by, - recurse_task - ) - ) { - return -1; - } - } else { - uintptr_t sub_task; - if (read_py_ptr( - handle, - task_address + async_offsets->asyncio_task_object.task_awaited_by, - &sub_task) - ) { - return -1; - } + if (PyList_Append(result, awaited_by)) { + Py_DECREF(awaited_by); + return -1; + } + Py_DECREF(awaited_by); - if (parse_task( - handle, - offsets, - async_offsets, - sub_task, - awaited_by, - recurse_task - ) - ) { - return -1; - } + if (parse_task_awaited_by( + self, running_task_addr, awaited_by, 1) < 0) { + return -1; } return 0; } -typedef struct +static int +process_single_task_node( + RemoteUnwinderObject *unwinder, + uintptr_t task_addr, + PyObject *result +) { + PyObject *tn = NULL; + PyObject *current_awaited_by = NULL; + PyObject *task_id = NULL; + PyObject *result_item = NULL; + + tn = parse_task_name(unwinder, task_addr); + if (tn == NULL) { + goto error; + } + + current_awaited_by = PyList_New(0); + if (current_awaited_by == NULL) { + goto error; + } + + task_id = PyLong_FromUnsignedLongLong(task_addr); + if (task_id == NULL) { + goto error; + } + + result_item = 
PyTuple_New(3); + if (result_item == NULL) { + goto error; + } + + PyTuple_SET_ITEM(result_item, 0, task_id); // steals ref + PyTuple_SET_ITEM(result_item, 1, tn); // steals ref + PyTuple_SET_ITEM(result_item, 2, current_awaited_by); // steals ref + + // References transferred to tuple + task_id = NULL; + tn = NULL; + current_awaited_by = NULL; + + if (PyList_Append(result, result_item)) { + Py_DECREF(result_item); + return -1; + } + Py_DECREF(result_item); + + // Get back current_awaited_by reference for parse_task_awaited_by + current_awaited_by = PyTuple_GET_ITEM(result_item, 2); + if (parse_task_awaited_by(unwinder, task_addr, current_awaited_by, 0) < 0) { + return -1; + } + + return 0; + +error: + Py_XDECREF(tn); + Py_XDECREF(current_awaited_by); + Py_XDECREF(task_id); + Py_XDECREF(result_item); + return -1; +} + +/* ============================================================================ + * TLBC CACHING FUNCTIONS + * ============================================================================ */ + +#ifdef Py_GIL_DISABLED + +typedef struct { + void *tlbc_array; // Local copy of the TLBC array + Py_ssize_t tlbc_array_size; // Size of the TLBC array + uint32_t generation; // Generation when this was cached +} TLBCCacheEntry; + +static void +tlbc_cache_entry_destroy(void *ptr) { - int lineno; - int end_lineno; - int column; - int end_column; -} LocationInfo; + TLBCCacheEntry *entry = (TLBCCacheEntry *)ptr; + if (entry->tlbc_array) { + PyMem_RawFree(entry->tlbc_array); + } + PyMem_RawFree(entry); +} + +static TLBCCacheEntry * +get_tlbc_cache_entry(RemoteUnwinderObject *self, uintptr_t code_addr, uint32_t current_generation) +{ + void *key = (void *)code_addr; + TLBCCacheEntry *entry = _Py_hashtable_get(self->tlbc_cache, key); + + if (entry && entry->generation != current_generation) { + // Entry is stale, remove it by setting to NULL + _Py_hashtable_set(self->tlbc_cache, key, NULL); + entry = NULL; + } + + return entry; +} + +static int +cache_tlbc_array(RemoteUnwinderObject *self, uintptr_t code_addr, uintptr_t tlbc_array_addr, uint32_t generation) +{ + uintptr_t tlbc_array_ptr; + void *tlbc_array = NULL; + TLBCCacheEntry *entry = NULL; + + // Read the TLBC array pointer + if (read_ptr(&self->handle, tlbc_array_addr, &tlbc_array_ptr) != 0 || tlbc_array_ptr == 0) { + return 0; // No TLBC array + } + + // Read the TLBC array size + Py_ssize_t tlbc_size; + if (_Py_RemoteDebug_PagedReadRemoteMemory(&self->handle, tlbc_array_ptr, sizeof(tlbc_size), &tlbc_size) != 0 || tlbc_size <= 0) { + return 0; // Invalid size + } + + // Allocate and read the entire TLBC array + size_t array_data_size = tlbc_size * sizeof(void*); + tlbc_array = PyMem_RawMalloc(sizeof(Py_ssize_t) + array_data_size); + if (!tlbc_array) { + return -1; // Memory error + } + + if (_Py_RemoteDebug_PagedReadRemoteMemory(&self->handle, tlbc_array_ptr, sizeof(Py_ssize_t) + array_data_size, tlbc_array) != 0) { + PyMem_RawFree(tlbc_array); + return 0; // Read error + } + + // Create cache entry + entry = PyMem_RawMalloc(sizeof(TLBCCacheEntry)); + if (!entry) { + PyMem_RawFree(tlbc_array); + return -1; // Memory error + } + + entry->tlbc_array = tlbc_array; + entry->tlbc_array_size = tlbc_size; + entry->generation = generation; + + // Store in cache + void *key = (void *)code_addr; + if (_Py_hashtable_set(self->tlbc_cache, key, entry) < 0) { + tlbc_cache_entry_destroy(entry); + return -1; // Cache error + } + + return 1; // Success +} + + + +#endif + +/* ============================================================================ 
+ * LINE TABLE PARSING FUNCTIONS + * ============================================================================ */ static int scan_varint(const uint8_t **ptr) @@ -818,7 +1207,6 @@ scan_signed_varint(const uint8_t **ptr) } } - static bool parse_linetable(const uintptr_t addrq, const char* linetable, int firstlineno, LocationInfo* info) { @@ -863,7 +1251,9 @@ parse_linetable(const uintptr_t addrq, const char* linetable, int firstlineno, L } default: { uint8_t second_byte = *(ptr++); - assert((second_byte & 128) == 0); + if ((second_byte & 128) != 0) { + return false; + } info->column = code << 3 | (second_byte >> 4); info->end_column = info->column + (second_byte & 15); break; @@ -877,240 +1267,387 @@ parse_linetable(const uintptr_t addrq, const char* linetable, int firstlineno, L return false; } +/* ============================================================================ + * CODE OBJECT AND FRAME PARSING FUNCTIONS + * ============================================================================ */ + static int -read_remote_pointer(proc_handle_t *handle, uintptr_t address, uintptr_t *out_ptr, const char *error_message) +parse_code_object(RemoteUnwinderObject *unwinder, + PyObject **result, + uintptr_t address, + uintptr_t instruction_pointer, + uintptr_t *previous_frame, + int32_t tlbc_index) { - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory(handle, address, sizeof(void *), out_ptr); - if (bytes_read < 0) { - return -1; + void *key = (void *)address; + CachedCodeMetadata *meta = NULL; + PyObject *func = NULL; + PyObject *file = NULL; + PyObject *linetable = NULL; + PyObject *lineno = NULL; + PyObject *tuple = NULL; + +#ifdef Py_GIL_DISABLED + // In free threading builds, code object addresses might have the low bit set + // as a flag, so we need to mask it off to get the real address + uintptr_t real_address = address & (~1); +#else + uintptr_t real_address = address; +#endif + + if (unwinder && unwinder->code_object_cache != NULL) { + meta = _Py_hashtable_get(unwinder->code_object_cache, key); } - if ((void *)(*out_ptr) == NULL) { - PyErr_SetString(PyExc_RuntimeError, error_message); - return -1; + if (meta == NULL) { + char code_object[SIZEOF_CODE_OBJ]; + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, real_address, SIZEOF_CODE_OBJ, code_object) < 0) + { + goto error; + } + + func = read_py_str(unwinder, + GET_MEMBER(uintptr_t, code_object, unwinder->debug_offsets.code_object.qualname), 1024); + if (!func) { + goto error; + } + + file = read_py_str(unwinder, + GET_MEMBER(uintptr_t, code_object, unwinder->debug_offsets.code_object.filename), 1024); + if (!file) { + goto error; + } + + linetable = read_py_bytes(unwinder, + GET_MEMBER(uintptr_t, code_object, unwinder->debug_offsets.code_object.linetable), 4096); + if (!linetable) { + goto error; + } + + meta = PyMem_RawMalloc(sizeof(CachedCodeMetadata)); + if (!meta) { + goto error; + } + + meta->func_name = func; + meta->file_name = file; + meta->linetable = linetable; + meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno); + meta->addr_code_adaptive = real_address + unwinder->debug_offsets.code_object.co_code_adaptive; + + if (unwinder && unwinder->code_object_cache && _Py_hashtable_set(unwinder->code_object_cache, key, meta) < 0) { + cached_code_metadata_destroy(meta); + goto error; + } + + // Ownership transferred to meta + func = NULL; + file = NULL; + linetable = NULL; + } + + uintptr_t ip = instruction_pointer; + ptrdiff_t addrq; + +#ifdef Py_GIL_DISABLED + // 
Handle thread-local bytecode (TLBC) in free threading builds + if (tlbc_index == 0 || unwinder->debug_offsets.code_object.co_tlbc == 0 || unwinder == NULL) { + // No TLBC or no unwinder - use main bytecode directly + addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive; + goto done_tlbc; + } + + // Try to get TLBC data from cache (we'll get generation from the caller) + TLBCCacheEntry *tlbc_entry = get_tlbc_cache_entry(unwinder, real_address, unwinder->tlbc_generation); + + if (!tlbc_entry) { + // Cache miss - try to read and cache TLBC array + if (cache_tlbc_array(unwinder, real_address, real_address + unwinder->debug_offsets.code_object.co_tlbc, unwinder->tlbc_generation) > 0) { + tlbc_entry = get_tlbc_cache_entry(unwinder, real_address, unwinder->tlbc_generation); + } + } + + if (tlbc_entry && tlbc_index < tlbc_entry->tlbc_array_size) { + // Use cached TLBC data + uintptr_t *entries = (uintptr_t *)((char *)tlbc_entry->tlbc_array + sizeof(Py_ssize_t)); + uintptr_t tlbc_bytecode_addr = entries[tlbc_index]; + + if (tlbc_bytecode_addr != 0) { + // Calculate offset from TLBC bytecode + addrq = (uint16_t *)ip - (uint16_t *)tlbc_bytecode_addr; + goto done_tlbc; + } + } + + // Fall back to main bytecode + addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive; + +done_tlbc: +#else + // Non-free-threaded build, always use the main bytecode + (void)tlbc_index; // Suppress unused parameter warning + (void)unwinder; // Suppress unused parameter warning + addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive; +#endif + ; // Empty statement to avoid C23 extension warning + LocationInfo info = {0}; + bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable), + meta->first_lineno, &info); + if (!ok) { + info.lineno = -1; + } + + lineno = PyLong_FromLong(info.lineno); + if (!lineno) { + goto error; + } + + tuple = PyTuple_New(3); + if (!tuple) { + goto error; } + Py_INCREF(meta->func_name); + Py_INCREF(meta->file_name); + PyTuple_SET_ITEM(tuple, 0, meta->func_name); + PyTuple_SET_ITEM(tuple, 1, meta->file_name); + PyTuple_SET_ITEM(tuple, 2, lineno); + + *result = tuple; return 0; + +error: + Py_XDECREF(func); + Py_XDECREF(file); + Py_XDECREF(linetable); + Py_XDECREF(lineno); + Py_XDECREF(tuple); + return -1; } -static int -read_instruction_ptr(proc_handle_t *handle, struct _Py_DebugOffsets *offsets, - uintptr_t current_frame, uintptr_t *instruction_ptr) +/* ============================================================================ + * STACK CHUNK MANAGEMENT FUNCTIONS + * ============================================================================ */ + +static void +cleanup_stack_chunks(StackChunkList *chunks) { - return read_remote_pointer( - handle, - current_frame + offsets->interpreter_frame.instr_ptr, - instruction_ptr, - "No instruction ptr found" - ); + for (size_t i = 0; i < chunks->count; ++i) { + PyMem_RawFree(chunks->chunks[i].local_copy); + } + PyMem_RawFree(chunks->chunks); } static int -parse_code_object(proc_handle_t *handle, - PyObject **result, - struct _Py_DebugOffsets *offsets, - uintptr_t address, - uintptr_t current_frame, - uintptr_t *previous_frame) -{ - uintptr_t addr_func_name, addr_file_name, addr_linetable, instruction_ptr; +process_single_stack_chunk( + proc_handle_t *handle, + uintptr_t chunk_addr, + StackChunkInfo *chunk_info +) { + // Start with default size assumption + size_t current_size = _PY_DATA_STACK_CHUNK_SIZE; - if (read_remote_pointer(handle, address + offsets->code_object.qualname, &addr_func_name, "No function name found") < 0 
|| - read_remote_pointer(handle, address + offsets->code_object.filename, &addr_file_name, "No file name found") < 0 || - read_remote_pointer(handle, address + offsets->code_object.linetable, &addr_linetable, "No linetable found") < 0 || - read_instruction_ptr(handle, offsets, current_frame, &instruction_ptr) < 0) { + char *this_chunk = PyMem_RawMalloc(current_size); + if (!this_chunk) { + PyErr_NoMemory(); return -1; } - int firstlineno; - if (_Py_RemoteDebug_ReadRemoteMemory(handle, - address + offsets->code_object.firstlineno, - sizeof(int), - &firstlineno) < 0) { + if (_Py_RemoteDebug_PagedReadRemoteMemory(handle, chunk_addr, current_size, this_chunk) < 0) { + PyMem_RawFree(this_chunk); return -1; } - PyObject *py_linetable = read_py_bytes(handle, offsets, addr_linetable); - if (!py_linetable) { - return -1; + // Check actual size and reread if necessary + size_t actual_size = GET_MEMBER(size_t, this_chunk, offsetof(_PyStackChunk, size)); + if (actual_size != current_size) { + this_chunk = PyMem_RawRealloc(this_chunk, actual_size); + if (!this_chunk) { + PyErr_NoMemory(); + return -1; + } + + if (_Py_RemoteDebug_PagedReadRemoteMemory(handle, chunk_addr, actual_size, this_chunk) < 0) { + PyMem_RawFree(this_chunk); + return -1; + } + current_size = actual_size; } - uintptr_t addr_code_adaptive = address + offsets->code_object.co_code_adaptive; - ptrdiff_t addrq = (uint16_t *)instruction_ptr - (uint16_t *)addr_code_adaptive; + chunk_info->remote_addr = chunk_addr; + chunk_info->size = current_size; + chunk_info->local_copy = this_chunk; + return 0; +} - LocationInfo info; - parse_linetable(addrq, PyBytes_AS_STRING(py_linetable), firstlineno, &info); - Py_DECREF(py_linetable); // Done with linetable +static int +copy_stack_chunks(RemoteUnwinderObject *unwinder, + uintptr_t tstate_addr, + StackChunkList *out_chunks) +{ + uintptr_t chunk_addr; + StackChunkInfo *chunks = NULL; + size_t count = 0; + size_t max_chunks = 16; - PyObject *py_line = PyLong_FromLong(info.lineno); - if (!py_line) { + if (read_ptr(&unwinder->handle, tstate_addr + unwinder->debug_offsets.thread_state.datastack_chunk, &chunk_addr)) { return -1; } - PyObject *py_func_name = read_py_str(handle, offsets, addr_func_name, 256); - if (!py_func_name) { - Py_DECREF(py_line); + chunks = PyMem_RawMalloc(max_chunks * sizeof(StackChunkInfo)); + if (!chunks) { + PyErr_NoMemory(); return -1; } - PyObject *py_file_name = read_py_str(handle, offsets, addr_file_name, 256); - if (!py_file_name) { - Py_DECREF(py_line); - Py_DECREF(py_func_name); - return -1; + while (chunk_addr != 0) { + // Grow array if needed + if (count >= max_chunks) { + max_chunks *= 2; + StackChunkInfo *new_chunks = PyMem_RawRealloc(chunks, max_chunks * sizeof(StackChunkInfo)); + if (!new_chunks) { + PyErr_NoMemory(); + goto error; + } + chunks = new_chunks; + } + + // Process this chunk + if (process_single_stack_chunk(&unwinder->handle, chunk_addr, &chunks[count]) < 0) { + goto error; + } + + // Get next chunk address and increment count + chunk_addr = GET_MEMBER(uintptr_t, chunks[count].local_copy, offsetof(_PyStackChunk, previous)); + count++; } - PyObject *result_tuple = PyTuple_New(3); - if (!result_tuple) { - Py_DECREF(py_line); - Py_DECREF(py_func_name); - Py_DECREF(py_file_name); - return -1; + out_chunks->chunks = chunks; + out_chunks->count = count; + return 0; + +error: + for (size_t i = 0; i < count; ++i) { + PyMem_RawFree(chunks[i].local_copy); } + PyMem_RawFree(chunks); + return -1; +} - PyTuple_SET_ITEM(result_tuple, 0, py_func_name); // steals ref - 
PyTuple_SET_ITEM(result_tuple, 1, py_file_name); // steals ref - PyTuple_SET_ITEM(result_tuple, 2, py_line); // steals ref +static void * +find_frame_in_chunks(StackChunkList *chunks, uintptr_t remote_ptr) +{ + for (size_t i = 0; i < chunks->count; ++i) { + uintptr_t base = chunks->chunks[i].remote_addr + offsetof(_PyStackChunk, data); + size_t payload = chunks->chunks[i].size - offsetof(_PyStackChunk, data); - *result = result_tuple; - return 0; + if (remote_ptr >= base && remote_ptr < base + payload) { + return (char *)chunks->chunks[i].local_copy + (remote_ptr - chunks->chunks[i].remote_addr); + } + } + return NULL; } static int -parse_frame_object( - proc_handle_t *handle, - PyObject** result, - struct _Py_DebugOffsets* offsets, +parse_frame_from_chunks( + RemoteUnwinderObject *unwinder, + PyObject **result, uintptr_t address, - uintptr_t* previous_frame + uintptr_t *previous_frame, + StackChunkList *chunks ) { - int err; - - Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + offsets->interpreter_frame.previous, - sizeof(void*), - previous_frame - ); - if (bytes_read < 0) { + void *frame_ptr = find_frame_in_chunks(chunks, address); + if (!frame_ptr) { return -1; } - char owner; - if (read_char(handle, address + offsets->interpreter_frame.owner, &owner)) { - return -1; - } + char *frame = (char *)frame_ptr; + *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); - if (owner >= FRAME_OWNED_BY_INTERPRETER) { + if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) >= FRAME_OWNED_BY_INTERPRETER || + !GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable)) { return 0; } - uintptr_t address_of_code_object; - err = read_py_ptr( - handle, - address + offsets->interpreter_frame.executable, - &address_of_code_object - ); - if (err) { - return -1; - } + uintptr_t instruction_pointer = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.instr_ptr); - if ((void*)address_of_code_object == NULL) { - return 0; + // Get tlbc_index for free threading builds + int32_t tlbc_index = 0; +#ifdef Py_GIL_DISABLED + if (unwinder->debug_offsets.interpreter_frame.tlbc_index != 0) { + tlbc_index = GET_MEMBER(int32_t, frame, unwinder->debug_offsets.interpreter_frame.tlbc_index); } +#endif return parse_code_object( - handle, result, offsets, address_of_code_object, address, previous_frame); + unwinder, result, GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable), + instruction_pointer, previous_frame, tlbc_index); } +/* ============================================================================ + * INTERPRETER STATE AND THREAD DISCOVERY FUNCTIONS + * ============================================================================ */ + static int -parse_async_frame_object( - proc_handle_t *handle, - PyObject** result, - struct _Py_DebugOffsets* offsets, - uintptr_t address, - uintptr_t* previous_frame, - uintptr_t* code_object +populate_initial_state_data( + int all_threads, + RemoteUnwinderObject *unwinder, + uintptr_t runtime_start_address, + uintptr_t *interpreter_state, + uintptr_t *tstate ) { - int err; - - Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, - address + offsets->interpreter_frame.previous, - sizeof(void*), - previous_frame - ); - if (bytes_read < 0) { - return -1; - } + uint64_t interpreter_state_list_head = + unwinder->debug_offsets.runtime_state.interpreters_head; - char owner; - bytes_read = 
_Py_RemoteDebug_ReadRemoteMemory( - handle, address + offsets->interpreter_frame.owner, sizeof(char), &owner); + uintptr_t address_of_interpreter_state; + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + runtime_start_address + interpreter_state_list_head, + sizeof(void*), + &address_of_interpreter_state); if (bytes_read < 0) { return -1; } - if (owner == FRAME_OWNED_BY_CSTACK || owner == FRAME_OWNED_BY_INTERPRETER) { - return 0; // C frame - } - - if (owner != FRAME_OWNED_BY_GENERATOR - && owner != FRAME_OWNED_BY_THREAD) { - PyErr_Format(PyExc_RuntimeError, "Unhandled frame owner %d.\n", owner); + if (address_of_interpreter_state == 0) { + PyErr_SetString(PyExc_RuntimeError, "No interpreter state found"); return -1; } - err = read_py_ptr( - handle, - address + offsets->interpreter_frame.executable, - code_object - ); - if (err) { - return -1; - } + *interpreter_state = address_of_interpreter_state; - assert(code_object != NULL); - if ((void*)*code_object == NULL) { + if (all_threads) { + *tstate = 0; return 0; } - if (parse_code_object( - handle, result, offsets, *code_object, address, previous_frame)) { - return -1; - } - - return 1; -} + uintptr_t address_of_thread = address_of_interpreter_state + + unwinder->debug_offsets.interpreter_state.threads_main; -static int -read_async_debug( - proc_handle_t *handle, - struct _Py_AsyncioModuleDebugOffsets* async_debug -) { - uintptr_t async_debug_addr = _Py_RemoteDebug_GetAsyncioDebugAddress(handle); - if (!async_debug_addr) { + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address_of_thread, + sizeof(void*), + tstate) < 0) { return -1; } - size_t size = sizeof(struct _Py_AsyncioModuleDebugOffsets); - int result = _Py_RemoteDebug_ReadRemoteMemory(handle, async_debug_addr, size, async_debug); - return result; + return 0; } static int find_running_frame( - proc_handle_t *handle, + RemoteUnwinderObject *unwinder, uintptr_t runtime_start_address, - _Py_DebugOffsets* local_debug_offsets, uintptr_t *frame ) { uint64_t interpreter_state_list_head = - local_debug_offsets->runtime_state.interpreters_head; + unwinder->debug_offsets.runtime_state.interpreters_head; uintptr_t address_of_interpreter_state; - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, runtime_start_address + interpreter_state_list_head, sizeof(void*), &address_of_interpreter_state); @@ -1124,10 +1661,10 @@ find_running_frame( } uintptr_t address_of_thread; - bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, + bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, address_of_interpreter_state + - local_debug_offsets->interpreter_state.threads_main, + unwinder->debug_offsets.interpreter_state.threads_main, sizeof(void*), &address_of_thread); if (bytes_read < 0) { @@ -1137,8 +1674,8 @@ find_running_frame( // No Python frames are available for us (can happen at tear-down). 
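For orientation, the traversal that find_running_frame() performs can be condensed into the following sketch. It reuses the read_ptr() helper and the debug-offset fields that appear elsewhere in this patch; the function name is purely illustrative, and the real code reads the first two pointers with _Py_RemoteDebug_PagedReadRemoteMemory directly:

/* Illustrative only: resolve the frame currently executing on the main
 * thread by following runtime state -> first interpreter -> main thread
 * state -> current _PyInterpreterFrame. Mirrors find_running_frame(). */
static int
sketch_find_running_frame(RemoteUnwinderObject *unwinder,
                          uintptr_t runtime_start_address,
                          uintptr_t *frame)
{
    uintptr_t interp = 0, tstate = 0;
    if (read_ptr(&unwinder->handle,
                 runtime_start_address
                     + unwinder->debug_offsets.runtime_state.interpreters_head,
                 &interp) < 0) {
        return -1;
    }
    if (read_ptr(&unwinder->handle,
                 interp + unwinder->debug_offsets.interpreter_state.threads_main,
                 &tstate) < 0) {
        return -1;
    }
    if (tstate == 0) {          /* no thread state, e.g. at tear-down */
        *frame = 0;
        return 0;
    }
    return read_ptr(&unwinder->handle,
                    tstate + unwinder->debug_offsets.thread_state.current_frame,
                    frame);
}

The effect is the same as in the function being patched here: three pointer hops, each offset supplied by the _Py_DebugOffsets copied out of the target process.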
if ((void*)address_of_thread != NULL) { int err = read_ptr( - handle, - address_of_thread + local_debug_offsets->thread_state.current_frame, + &unwinder->handle, + address_of_thread + unwinder->debug_offsets.thread_state.current_frame, frame); if (err) { return -1; @@ -1152,21 +1689,18 @@ find_running_frame( static int find_running_task( - proc_handle_t *handle, - uintptr_t runtime_start_address, - _Py_DebugOffsets *local_debug_offsets, - struct _Py_AsyncioModuleDebugOffsets *async_offsets, + RemoteUnwinderObject *unwinder, uintptr_t *running_task_addr ) { *running_task_addr = (uintptr_t)NULL; uint64_t interpreter_state_list_head = - local_debug_offsets->runtime_state.interpreters_head; + unwinder->debug_offsets.runtime_state.interpreters_head; uintptr_t address_of_interpreter_state; - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, - runtime_start_address + interpreter_state_list_head, + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + unwinder->runtime_start_address + interpreter_state_list_head, sizeof(void*), &address_of_interpreter_state); if (bytes_read < 0) { @@ -1179,10 +1713,10 @@ find_running_task( } uintptr_t address_of_thread; - bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - handle, + bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, address_of_interpreter_state + - local_debug_offsets->interpreter_state.threads_head, + unwinder->debug_offsets.interpreter_state.threads_head, sizeof(void*), &address_of_thread); if (bytes_read < 0) { @@ -1196,9 +1730,9 @@ find_running_task( } bytes_read = read_py_ptr( - handle, + &unwinder->handle, address_of_thread - + async_offsets->asyncio_thread_state.asyncio_running_loop, + + unwinder->async_debug_offsets.asyncio_thread_state.asyncio_running_loop, &address_of_running_loop); if (bytes_read == -1) { return -1; @@ -1210,9 +1744,9 @@ find_running_task( } int err = read_ptr( - handle, + &unwinder->handle, address_of_thread - + async_offsets->asyncio_thread_state.asyncio_running_task, + + unwinder->async_debug_offsets.asyncio_thread_state.asyncio_running_task, running_task_addr); if (err) { return -1; @@ -1222,579 +1756,936 @@ find_running_task( } static int -append_awaited_by_for_thread( - proc_handle_t *handle, - uintptr_t head_addr, - struct _Py_DebugOffsets *debug_offsets, - struct _Py_AsyncioModuleDebugOffsets *async_offsets, - PyObject *result +find_running_task_and_coro( + RemoteUnwinderObject *self, + uintptr_t *running_task_addr, + uintptr_t *running_coro_addr, + uintptr_t *running_task_code_obj ) { - struct llist_node task_node; - - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - head_addr, - sizeof(task_node), - &task_node)) - { + *running_task_addr = (uintptr_t)NULL; + if (find_running_task( + self, running_task_addr) < 0) { + chain_exceptions(PyExc_RuntimeError, "Failed to find running task"); return -1; } - size_t iteration_count = 0; - const size_t MAX_ITERATIONS = 2 << 15; // A reasonable upper bound - while ((uintptr_t)task_node.next != head_addr) { - if (++iteration_count > MAX_ITERATIONS) { - PyErr_SetString(PyExc_RuntimeError, "Task list appears corrupted"); - return -1; - } - - if (task_node.next == NULL) { - PyErr_SetString( - PyExc_RuntimeError, - "Invalid linked list structure reading remote memory"); - return -1; - } - - uintptr_t task_addr = (uintptr_t)task_node.next - - async_offsets->asyncio_task_object.task_node; - - PyObject *tn = parse_task_name( - handle, - debug_offsets, - async_offsets, - task_addr); - if (tn == NULL) { - return -1; - } - - 
PyObject *current_awaited_by = PyList_New(0); - if (current_awaited_by == NULL) { - Py_DECREF(tn); - return -1; - } - - PyObject* task_id = PyLong_FromUnsignedLongLong(task_addr); - if (task_id == NULL) { - Py_DECREF(tn); - Py_DECREF(current_awaited_by); - return -1; - } - - PyObject *result_item = PyTuple_New(3); - if (result_item == NULL) { - Py_DECREF(tn); - Py_DECREF(current_awaited_by); - Py_DECREF(task_id); - return -1; - } - - PyTuple_SET_ITEM(result_item, 0, task_id); // steals ref - PyTuple_SET_ITEM(result_item, 1, tn); // steals ref - PyTuple_SET_ITEM(result_item, 2, current_awaited_by); // steals ref - if (PyList_Append(result, result_item)) { - Py_DECREF(result_item); - return -1; - } - Py_DECREF(result_item); - - if (parse_task_awaited_by(handle, debug_offsets, async_offsets, - task_addr, current_awaited_by, 0)) - { - return -1; - } - - // onto the next one... - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - (uintptr_t)task_node.next, - sizeof(task_node), - &task_node)) - { - return -1; - } - } - - return 0; -} - -static int -append_awaited_by( - proc_handle_t *handle, - unsigned long tid, - uintptr_t head_addr, - struct _Py_DebugOffsets *debug_offsets, - struct _Py_AsyncioModuleDebugOffsets *async_offsets, - PyObject *result) -{ - PyObject *tid_py = PyLong_FromUnsignedLong(tid); - if (tid_py == NULL) { + if ((void*)*running_task_addr == NULL) { + PyErr_SetString(PyExc_RuntimeError, "No running task found"); return -1; } - PyObject *result_item = PyTuple_New(2); - if (result_item == NULL) { - Py_DECREF(tid_py); + if (read_py_ptr( + &self->handle, + *running_task_addr + self->async_debug_offsets.asyncio_task_object.task_coro, + running_coro_addr) < 0) { + chain_exceptions(PyExc_RuntimeError, "Failed to read running task coro"); return -1; } - PyObject* awaited_by_for_thread = PyList_New(0); - if (awaited_by_for_thread == NULL) { - Py_DECREF(tid_py); - Py_DECREF(result_item); + if ((void*)*running_coro_addr == NULL) { + PyErr_SetString(PyExc_RuntimeError, "Running task coro is NULL"); return -1; } - PyTuple_SET_ITEM(result_item, 0, tid_py); // steals ref - PyTuple_SET_ITEM(result_item, 1, awaited_by_for_thread); // steals ref - if (PyList_Append(result, result_item)) { - Py_DECREF(result_item); + // note: genobject's gi_iframe is an embedded struct so the address to + // the offset leads directly to its first field: f_executable + if (read_py_ptr( + &self->handle, + *running_coro_addr + self->debug_offsets.gen_object.gi_iframe, + running_task_code_obj) < 0) { return -1; } - Py_DECREF(result_item); - if (append_awaited_by_for_thread( - handle, - head_addr, - debug_offsets, - async_offsets, - awaited_by_for_thread)) - { + if ((void*)*running_task_code_obj == NULL) { + PyErr_SetString(PyExc_RuntimeError, "Running task code object is NULL"); return -1; } return 0; } -static PyObject* -get_all_awaited_by(PyObject* self, PyObject* args) -{ -#if (!defined(__linux__) && !defined(__APPLE__)) && !defined(MS_WINDOWS) || \ - (defined(__linux__) && !HAVE_PROCESS_VM_READV) - PyErr_SetString( - PyExc_RuntimeError, - "get_all_awaited_by is not implemented on this platform"); - return NULL; -#endif - int pid; - if (!PyArg_ParseTuple(args, "i", &pid)) { - return NULL; +/* ============================================================================ + * FRAME PARSING FUNCTIONS + * ============================================================================ */ + +static int +parse_frame_object( + RemoteUnwinderObject *unwinder, + PyObject** result, + uintptr_t address, + uintptr_t* 
previous_frame +) { + char frame[SIZEOF_INTERP_FRAME]; + + Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_INTERP_FRAME, + frame + ); + if (bytes_read < 0) { + return -1; } - proc_handle_t the_handle; - proc_handle_t *handle = &the_handle; - if (_Py_RemoteDebug_InitProcHandle(handle, pid) < 0) { + *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); + + if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) >= FRAME_OWNED_BY_INTERPRETER) { return 0; } - PyObject *result = NULL; - - uintptr_t runtime_start_addr = _Py_RemoteDebug_GetPyRuntimeAddress(handle); - if (runtime_start_addr == 0) { - if (!PyErr_Occurred()) { - PyErr_SetString( - PyExc_RuntimeError, "Failed to get .PyRuntime address"); - } - goto result_err; + if ((void*)GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable) == NULL) { + return 0; } - struct _Py_DebugOffsets local_debug_offsets; - if (_Py_RemoteDebug_ReadDebugOffsets(handle, &runtime_start_addr, &local_debug_offsets)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read debug offsets"); - goto result_err; + uintptr_t instruction_pointer = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.instr_ptr); + + // Get tlbc_index for free threading builds + int32_t tlbc_index = 0; +#ifdef Py_GIL_DISABLED + if (unwinder->debug_offsets.interpreter_frame.tlbc_index != 0) { + tlbc_index = GET_MEMBER(int32_t, frame, unwinder->debug_offsets.interpreter_frame.tlbc_index); } +#endif - struct _Py_AsyncioModuleDebugOffsets local_async_debug; - if (read_async_debug(handle, &local_async_debug)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read asyncio debug offsets"); - goto result_err; + return parse_code_object( + unwinder, result, GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable), + instruction_pointer, previous_frame, tlbc_index); +} + +static int +parse_async_frame_object( + RemoteUnwinderObject *unwinder, + PyObject** result, + uintptr_t address, + uintptr_t* previous_frame, + uintptr_t* code_object +) { + char frame[SIZEOF_INTERP_FRAME]; + + Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_INTERP_FRAME, + frame + ); + if (bytes_read < 0) { + return -1; } - result = PyList_New(0); - if (result == NULL) { - goto result_err; + *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); + + if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) == FRAME_OWNED_BY_CSTACK || + GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) == FRAME_OWNED_BY_INTERPRETER) { + return 0; // C frame } - uint64_t interpreter_state_list_head = - local_debug_offsets.runtime_state.interpreters_head; + if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_GENERATOR + && GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_THREAD) { + PyErr_Format(PyExc_RuntimeError, "Unhandled frame owner %d.\n", + GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner)); + return -1; + } - uintptr_t interpreter_state_addr; - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - runtime_start_addr + interpreter_state_list_head, - sizeof(void*), - &interpreter_state_addr)) - { - goto result_err; + *code_object = GET_MEMBER(uintptr_t, frame, 
unwinder->debug_offsets.interpreter_frame.executable); + // Strip tag bits for consistent comparison + *code_object &= ~Py_TAG_BITS; + + assert(code_object != NULL); + if ((void*)*code_object == NULL) { + return 0; } - uintptr_t thread_state_addr; - unsigned long tid = 0; - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - interpreter_state_addr - + local_debug_offsets.interpreter_state.threads_head, - sizeof(void*), - &thread_state_addr)) - { - goto result_err; + uintptr_t instruction_pointer = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.instr_ptr); + + // Get tlbc_index for free threading builds + int32_t tlbc_index = 0; +#ifdef Py_GIL_DISABLED + if (unwinder->debug_offsets.interpreter_frame.tlbc_index != 0) { + tlbc_index = GET_MEMBER(int32_t, frame, unwinder->debug_offsets.interpreter_frame.tlbc_index); } +#endif - uintptr_t head_addr; - while (thread_state_addr != 0) { - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - thread_state_addr - + local_debug_offsets.thread_state.native_thread_id, - sizeof(tid), - &tid)) - { - goto result_err; + if (parse_code_object( + unwinder, result, *code_object, instruction_pointer, previous_frame, tlbc_index)) { + return -1; + } + + return 1; +} + +static int +parse_async_frame_chain( + RemoteUnwinderObject *self, + PyObject *calls, + uintptr_t running_task_code_obj +) { + uintptr_t address_of_current_frame; + if (find_running_frame(self, self->runtime_start_address, &address_of_current_frame) < 0) { + chain_exceptions(PyExc_RuntimeError, "Failed to find running frame"); + return -1; + } + + uintptr_t address_of_code_object; + while ((void*)address_of_current_frame != NULL) { + PyObject* frame_info = NULL; + int res = parse_async_frame_object( + self, + &frame_info, + address_of_current_frame, + &address_of_current_frame, + &address_of_code_object + ); + + if (res < 0) { + chain_exceptions(PyExc_RuntimeError, "Failed to parse async frame object"); + return -1; } - head_addr = thread_state_addr - + local_async_debug.asyncio_thread_state.asyncio_tasks_head; + if (!frame_info) { + continue; + } - if (append_awaited_by(handle, tid, head_addr, &local_debug_offsets, - &local_async_debug, result)) - { - goto result_err; + if (PyList_Append(calls, frame_info) == -1) { + Py_DECREF(frame_info); + return -1; } - if (0 > _Py_RemoteDebug_ReadRemoteMemory( - handle, - thread_state_addr + local_debug_offsets.thread_state.next, - sizeof(void*), - &thread_state_addr)) - { - goto result_err; + Py_DECREF(frame_info); + + if (address_of_code_object == running_task_code_obj) { + break; } } - head_addr = interpreter_state_addr - + local_async_debug.asyncio_interpreter_state.asyncio_tasks_head; + return 0; +} - // On top of a per-thread task lists used by default by asyncio to avoid - // contention, there is also a fallback per-interpreter list of tasks; - // any tasks still pending when a thread is destroyed will be moved to the - // per-interpreter task list. It's unlikely we'll find anything here, but - // interesting for debugging. 
- if (append_awaited_by(handle, 0, head_addr, &local_debug_offsets, - &local_async_debug, result)) - { - goto result_err; +/* ============================================================================ + * AWAITED BY PARSING FUNCTIONS + * ============================================================================ */ + +static int +append_awaited_by_for_thread( + RemoteUnwinderObject *unwinder, + uintptr_t head_addr, + PyObject *result +) { + char task_node[SIZEOF_LLIST_NODE]; + + if (_Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, head_addr, + sizeof(task_node), task_node) < 0) { + return -1; } - _Py_RemoteDebug_CleanupProcHandle(handle); - return result; + size_t iteration_count = 0; + const size_t MAX_ITERATIONS = 2 << 15; // A reasonable upper bound -result_err: - Py_XDECREF(result); - _Py_RemoteDebug_CleanupProcHandle(handle); - return NULL; -} + while (GET_MEMBER(uintptr_t, task_node, unwinder->debug_offsets.llist_node.next) != head_addr) { + if (++iteration_count > MAX_ITERATIONS) { + PyErr_SetString(PyExc_RuntimeError, "Task list appears corrupted"); + return -1; + } -static PyObject* -get_stack_trace(PyObject* self, PyObject* args) -{ -#if (!defined(__linux__) && !defined(__APPLE__)) && !defined(MS_WINDOWS) || \ - (defined(__linux__) && !HAVE_PROCESS_VM_READV) - PyErr_SetString( - PyExc_RuntimeError, - "get_stack_trace is not supported on this platform"); - return NULL; -#endif + if (GET_MEMBER(uintptr_t, task_node, unwinder->debug_offsets.llist_node.next) == 0) { + PyErr_SetString(PyExc_RuntimeError, + "Invalid linked list structure reading remote memory"); + return -1; + } - int pid; - if (!PyArg_ParseTuple(args, "i", &pid)) { - return NULL; - } + uintptr_t task_addr = (uintptr_t)GET_MEMBER(uintptr_t, task_node, unwinder->debug_offsets.llist_node.next) + - unwinder->async_debug_offsets.asyncio_task_object.task_node; - proc_handle_t the_handle; - proc_handle_t *handle = &the_handle; - if (_Py_RemoteDebug_InitProcHandle(handle, pid) < 0) { - return 0; + if (process_single_task_node(unwinder, task_addr, result) < 0) { + return -1; + } + + // Read next node + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + (uintptr_t)GET_MEMBER(uintptr_t, task_node, unwinder->debug_offsets.llist_node.next), + sizeof(task_node), + task_node) < 0) { + return -1; + } } - PyObject* result = NULL; + return 0; +} - uintptr_t runtime_start_address = _Py_RemoteDebug_GetPyRuntimeAddress(handle); - if (runtime_start_address == 0) { - if (!PyErr_Occurred()) { - PyErr_SetString( - PyExc_RuntimeError, "Failed to get .PyRuntime address"); - } - goto result_err; +static int +append_awaited_by( + RemoteUnwinderObject *unwinder, + unsigned long tid, + uintptr_t head_addr, + PyObject *result) +{ + PyObject *tid_py = PyLong_FromUnsignedLong(tid); + if (tid_py == NULL) { + return -1; } - struct _Py_DebugOffsets local_debug_offsets; - if (_Py_RemoteDebug_ReadDebugOffsets(handle, &runtime_start_address, &local_debug_offsets)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read debug offsets"); - goto result_err; + PyObject *result_item = PyTuple_New(2); + if (result_item == NULL) { + Py_DECREF(tid_py); + return -1; } - uintptr_t address_of_current_frame; - if (find_running_frame( - handle, runtime_start_address, &local_debug_offsets, - &address_of_current_frame) - ) { - goto result_err; + PyObject* awaited_by_for_thread = PyList_New(0); + if (awaited_by_for_thread == NULL) { + Py_DECREF(tid_py); + Py_DECREF(result_item); + return -1; } - result = PyList_New(0); - if (result == NULL) { - 
goto result_err; + PyTuple_SET_ITEM(result_item, 0, tid_py); // steals ref + PyTuple_SET_ITEM(result_item, 1, awaited_by_for_thread); // steals ref + if (PyList_Append(result, result_item)) { + Py_DECREF(result_item); + return -1; } + Py_DECREF(result_item); - while ((void*)address_of_current_frame != NULL) { - PyObject* frame_info = NULL; - if (parse_frame_object( - handle, - &frame_info, - &local_debug_offsets, - address_of_current_frame, - &address_of_current_frame) - < 0) - { - Py_CLEAR(result); - goto result_err; + if (append_awaited_by_for_thread(unwinder, head_addr, awaited_by_for_thread)) + { + return -1; + } + + return 0; +} + +/* ============================================================================ + * STACK UNWINDING FUNCTIONS + * ============================================================================ */ + +static int +process_frame_chain( + RemoteUnwinderObject *unwinder, + uintptr_t initial_frame_addr, + StackChunkList *chunks, + PyObject *frame_info +) { + uintptr_t frame_addr = initial_frame_addr; + uintptr_t prev_frame_addr = 0; + const size_t MAX_FRAMES = 1024; + size_t frame_count = 0; + + while ((void*)frame_addr != NULL) { + PyObject *frame = NULL; + uintptr_t next_frame_addr = 0; + + if (++frame_count > MAX_FRAMES) { + PyErr_SetString(PyExc_RuntimeError, "Too many stack frames (possible infinite loop)"); + return -1; } - if (!frame_info) { - continue; + // Try chunks first, fallback to direct memory read + if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, chunks) < 0) { + PyErr_Clear(); + if (parse_frame_object(unwinder, &frame, frame_addr, &next_frame_addr) < 0) { + return -1; + } } - if (PyList_Append(result, frame_info) == -1) { - Py_CLEAR(result); - goto result_err; + if (!frame) { + break; } - Py_DECREF(frame_info); - frame_info = NULL; + if (prev_frame_addr && frame_addr != prev_frame_addr) { + PyErr_Format(PyExc_RuntimeError, + "Broken frame chain: expected frame at 0x%lx, got 0x%lx", + prev_frame_addr, frame_addr); + Py_DECREF(frame); + return -1; + } + if (PyList_Append(frame_info, frame) == -1) { + Py_DECREF(frame); + return -1; + } + Py_DECREF(frame); + + prev_frame_addr = next_frame_addr; + frame_addr = next_frame_addr; } -result_err: - _Py_RemoteDebug_CleanupProcHandle(handle); - return result; + return 0; } static PyObject* -get_async_stack_trace(PyObject* self, PyObject* args) -{ -#if (!defined(__linux__) && !defined(__APPLE__)) && !defined(MS_WINDOWS) || \ - (defined(__linux__) && !HAVE_PROCESS_VM_READV) - PyErr_SetString( - PyExc_RuntimeError, - "get_stack_trace is not supported on this platform"); - return NULL; -#endif - int pid; +unwind_stack_for_thread( + RemoteUnwinderObject *unwinder, + uintptr_t *current_tstate +) { + PyObject *frame_info = NULL; + PyObject *thread_id = NULL; + PyObject *result = NULL; + StackChunkList chunks = {0}; - if (!PyArg_ParseTuple(args, "i", &pid)) { - return NULL; + char ts[SIZEOF_THREAD_STATE]; + int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, *current_tstate, unwinder->debug_offsets.thread_state.size, ts); + if (bytes_read < 0) { + goto error; } - proc_handle_t the_handle; - proc_handle_t *handle = &the_handle; - if (_Py_RemoteDebug_InitProcHandle(handle, pid) < 0) { - return 0; - } + uintptr_t frame_addr = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.current_frame); - PyObject *result = NULL; + frame_info = PyList_New(0); + if (!frame_info) { + goto error; + } - uintptr_t runtime_start_address = 
_Py_RemoteDebug_GetPyRuntimeAddress(handle); - if (runtime_start_address == 0) { - if (!PyErr_Occurred()) { - PyErr_SetString( - PyExc_RuntimeError, "Failed to get .PyRuntime address"); - } - goto result_err; + if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { + goto error; } - struct _Py_DebugOffsets local_debug_offsets; - if (_Py_RemoteDebug_ReadDebugOffsets(handle, &runtime_start_address, &local_debug_offsets)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read debug offsets"); - goto result_err; + if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info) < 0) { + goto error; } - struct _Py_AsyncioModuleDebugOffsets local_async_debug; - if (read_async_debug(handle, &local_async_debug)) { - chain_exceptions(PyExc_RuntimeError, "Failed to read asyncio debug offsets"); - goto result_err; + *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next); + + thread_id = PyLong_FromLongLong( + GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id)); + if (thread_id == NULL) { + goto error; } - result = PyList_New(1); + result = PyTuple_New(2); if (result == NULL) { - goto result_err; + goto error; } - PyObject* calls = PyList_New(0); - if (calls == NULL) { - goto result_err; + + PyTuple_SET_ITEM(result, 0, thread_id); // Steals reference + PyTuple_SET_ITEM(result, 1, frame_info); // Steals reference + + cleanup_stack_chunks(&chunks); + return result; + +error: + Py_XDECREF(frame_info); + Py_XDECREF(thread_id); + Py_XDECREF(result); + cleanup_stack_chunks(&chunks); + return NULL; +} + + +/* ============================================================================ + * REMOTEUNWINDER CLASS IMPLEMENTATION + * ============================================================================ */ + +/*[clinic input] +class _remote_debugging.RemoteUnwinder "RemoteUnwinderObject *" "&RemoteUnwinder_Type" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=55f164d8803318be]*/ + +/*[clinic input] +_remote_debugging.RemoteUnwinder.__init__ + pid: int + * + all_threads: bool = False + +Initialize a new RemoteUnwinder object for debugging a remote Python process. + +Args: + pid: Process ID of the target Python process to debug + all_threads: If True, initialize state for all threads in the process. + If False, only initialize for the main thread. + +The RemoteUnwinder provides functionality to inspect and debug a running Python +process, including examining thread states, stack frames and other runtime data. 
+ +Raises: + PermissionError: If access to the target process is denied + OSError: If unable to attach to the target process or access its memory + RuntimeError: If unable to read debug information from the target process +[clinic start generated code]*/ + +static int +_remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, + int pid, int all_threads) +/*[clinic end generated code: output=b8027cb247092081 input=6a2056b04e6f050e]*/ +{ + if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) { + return -1; } - if (PyList_SetItem(result, 0, calls)) { /* steals ref to 'calls' */ - Py_DECREF(calls); - goto result_err; + + self->runtime_start_address = _Py_RemoteDebug_GetPyRuntimeAddress(&self->handle); + if (self->runtime_start_address == 0) { + return -1; } - uintptr_t running_task_addr = (uintptr_t)NULL; - if (find_running_task( - handle, runtime_start_address, &local_debug_offsets, &local_async_debug, - &running_task_addr) - ) { - chain_exceptions(PyExc_RuntimeError, "Failed to find running task"); - goto result_err; + if (_Py_RemoteDebug_ReadDebugOffsets(&self->handle, + &self->runtime_start_address, + &self->debug_offsets) < 0) + { + return -1; } - if ((void*)running_task_addr == NULL) { - PyErr_SetString(PyExc_RuntimeError, "No running task found"); - goto result_err; + // Try to read async debug offsets, but don't fail if they're not available + self->async_debug_offsets_available = 1; + if (read_async_debug(self) < 0) { + PyErr_Clear(); + memset(&self->async_debug_offsets, 0, sizeof(self->async_debug_offsets)); + self->async_debug_offsets_available = 0; } - uintptr_t running_coro_addr; - if (read_py_ptr( - handle, - running_task_addr + local_async_debug.asyncio_task_object.task_coro, - &running_coro_addr - )) { - chain_exceptions(PyExc_RuntimeError, "Failed to read running task coro"); - goto result_err; + if (populate_initial_state_data(all_threads, self, self->runtime_start_address, + &self->interpreter_addr ,&self->tstate_addr) < 0) + { + return -1; } - if ((void*)running_coro_addr == NULL) { - PyErr_SetString(PyExc_RuntimeError, "Running task coro is NULL"); - goto result_err; + self->code_object_cache = _Py_hashtable_new_full( + _Py_hashtable_hash_ptr, + _Py_hashtable_compare_direct, + NULL, // keys are stable pointers, don't destroy + cached_code_metadata_destroy, + NULL + ); + if (self->code_object_cache == NULL) { + PyErr_NoMemory(); + return -1; } - // note: genobject's gi_iframe is an embedded struct so the address to - // the offset leads directly to its first field: f_executable - uintptr_t address_of_running_task_code_obj; - if (read_py_ptr( - handle, - running_coro_addr + local_debug_offsets.gen_object.gi_iframe, - &address_of_running_task_code_obj - )) { - goto result_err; +#ifdef Py_GIL_DISABLED + // Initialize TLBC cache + self->tlbc_generation = 0; + self->tlbc_cache = _Py_hashtable_new_full( + _Py_hashtable_hash_ptr, + _Py_hashtable_compare_direct, + NULL, // keys are stable pointers, don't destroy + tlbc_cache_entry_destroy, + NULL + ); + if (self->tlbc_cache == NULL) { + _Py_hashtable_destroy(self->code_object_cache); + PyErr_NoMemory(); + return -1; } +#endif - if ((void*)address_of_running_task_code_obj == NULL) { - PyErr_SetString(PyExc_RuntimeError, "Running task code object is NULL"); - goto result_err; + return 0; +} + +/*[clinic input] +@critical_section +_remote_debugging.RemoteUnwinder.get_stack_trace + +Returns a list of stack traces for all threads in the target process. 
+ +Each element in the returned list is a tuple of (thread_id, frame_list), where: +- thread_id is the OS thread identifier +- frame_list is a list of tuples (function_name, filename, line_number) representing + the Python stack frames for that thread, ordered from most recent to oldest + +Example: + [ + (1234, [ + ('process_data', 'worker.py', 127), + ('run_worker', 'worker.py', 45), + ('main', 'app.py', 23) + ]), + (1235, [ + ('handle_request', 'server.py', 89), + ('serve_forever', 'server.py', 52) + ]) + ] + +Raises: + RuntimeError: If there is an error copying memory from the target process + OSError: If there is an error accessing the target process + PermissionError: If access to the target process is denied + UnicodeDecodeError: If there is an error decoding strings from the target process + +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self) +/*[clinic end generated code: output=666192b90c69d567 input=331dbe370578badf]*/ +{ + PyObject* result = NULL; + // Read interpreter state into opaque buffer + char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &self->handle, + self->interpreter_addr, + INTERP_STATE_BUFFER_SIZE, + interp_state_buffer) < 0) { + goto exit; } - uintptr_t address_of_current_frame; - if (find_running_frame( - handle, runtime_start_address, &local_debug_offsets, - &address_of_current_frame) - ) { - chain_exceptions(PyExc_RuntimeError, "Failed to find running frame"); - goto result_err; + // Get code object generation from buffer + uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer, + self->debug_offsets.interpreter_state.code_object_generation); + + if (code_object_generation != self->code_object_generation) { + self->code_object_generation = code_object_generation; + _Py_hashtable_clear(self->code_object_cache); } - uintptr_t address_of_code_object; - while ((void*)address_of_current_frame != NULL) { - PyObject* frame_info = NULL; - int res = parse_async_frame_object( - handle, - &frame_info, - &local_debug_offsets, - address_of_current_frame, - &address_of_current_frame, - &address_of_code_object - ); +#ifdef Py_GIL_DISABLED + // Check TLBC generation and invalidate cache if needed + uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer, + self->debug_offsets.interpreter_state.tlbc_generation); + if (current_tlbc_generation != self->tlbc_generation) { + self->tlbc_generation = current_tlbc_generation; + _Py_hashtable_clear(self->tlbc_cache); + } +#endif - if (res < 0) { - chain_exceptions(PyExc_RuntimeError, "Failed to parse async frame object"); - goto result_err; - } + uintptr_t current_tstate; + if (self->tstate_addr == 0) { + // Get threads head from buffer + current_tstate = GET_MEMBER(uintptr_t, interp_state_buffer, + self->debug_offsets.interpreter_state.threads_head); + } else { + current_tstate = self->tstate_addr; + } + result = PyList_New(0); + if (!result) { + goto exit; + } + + while (current_tstate != 0) { + PyObject* frame_info = unwind_stack_for_thread(self, ¤t_tstate); if (!frame_info) { - continue; + Py_CLEAR(result); + goto exit; } - if (PyList_Append(calls, frame_info) == -1) { - Py_DECREF(calls); - goto result_err; + if (PyList_Append(result, frame_info) == -1) { + Py_DECREF(frame_info); + Py_CLEAR(result); + goto exit; } - Py_DECREF(frame_info); - frame_info = NULL; - if (address_of_code_object == address_of_running_task_code_obj) { + // We are targeting a single 
tstate, break here + if (self->tstate_addr) { break; } } - PyObject *tn = parse_task_name( - handle, &local_debug_offsets, &local_async_debug, running_task_addr); - if (tn == NULL) { - goto result_err; +exit: + _Py_RemoteDebug_ClearCache(&self->handle); + return result; +} + +/*[clinic input] +@critical_section +_remote_debugging.RemoteUnwinder.get_all_awaited_by + +Get all tasks and their awaited_by relationships from the remote process. + +This provides a tree structure showing which tasks are waiting for other tasks. + +For each task, returns: +1. The call stack frames leading to where the task is currently executing +2. The name of the task +3. A list of tasks that this task is waiting for, with their own frames/names/etc + +Returns a list of [frames, task_name, subtasks] where: +- frames: List of (func_name, filename, lineno) showing the call stack +- task_name: String identifier for the task +- subtasks: List of tasks being awaited by this task, in same format + +Raises: + RuntimeError: If AsyncioDebug section is not available in the remote process + MemoryError: If memory allocation fails + OSError: If reading from the remote process fails + +Example output: +[ + # Task c2_root waiting for two subtasks + [ + # Call stack of c2_root + [("c5", "script.py", 10), ("c4", "script.py", 14)], + "c2_root", + [ + # First subtask (sub_main_2) and what it's waiting for + [ + [("c1", "script.py", 23)], + "sub_main_2", + [...] + ], + # Second subtask and its waiters + [...] + ] + ] +] +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *self) +/*[clinic end generated code: output=6a49cd345e8aec53 input=a452c652bb00701a]*/ +{ + if (!self->async_debug_offsets_available) { + PyErr_SetString(PyExc_RuntimeError, "AsyncioDebug section not available"); + return NULL; } - if (PyList_Append(result, tn)) { - Py_DECREF(tn); + + PyObject *result = PyList_New(0); + if (result == NULL) { goto result_err; } - Py_DECREF(tn); - PyObject* awaited_by = PyList_New(0); - if (awaited_by == NULL) { + uintptr_t thread_state_addr; + unsigned long tid = 0; + if (0 > _Py_RemoteDebug_PagedReadRemoteMemory( + &self->handle, + self->interpreter_addr + + self->debug_offsets.interpreter_state.threads_main, + sizeof(void*), + &thread_state_addr)) + { goto result_err; } - if (PyList_Append(result, awaited_by)) { - Py_DECREF(awaited_by); - goto result_err; + + uintptr_t head_addr; + while (thread_state_addr != 0) { + if (0 > _Py_RemoteDebug_PagedReadRemoteMemory( + &self->handle, + thread_state_addr + + self->debug_offsets.thread_state.native_thread_id, + sizeof(tid), + &tid)) + { + goto result_err; + } + + head_addr = thread_state_addr + + self->async_debug_offsets.asyncio_thread_state.asyncio_tasks_head; + + if (append_awaited_by(self, tid, head_addr, result)) + { + goto result_err; + } + + if (0 > _Py_RemoteDebug_PagedReadRemoteMemory( + &self->handle, + thread_state_addr + self->debug_offsets.thread_state.next, + sizeof(void*), + &thread_state_addr)) + { + goto result_err; + } } - Py_DECREF(awaited_by); - if (parse_task_awaited_by( - handle, &local_debug_offsets, &local_async_debug, - running_task_addr, awaited_by, 1) - ) { + head_addr = self->interpreter_addr + + self->async_debug_offsets.asyncio_interpreter_state.asyncio_tasks_head; + + // On top of a per-thread task lists used by default by asyncio to avoid + // contention, there is also a fallback per-interpreter list of tasks; + // any tasks still pending when a thread is destroyed will be 
moved to the + // per-interpreter task list. It's unlikely we'll find anything here, but + // interesting for debugging. + if (append_awaited_by(self, 0, head_addr, result)) + { goto result_err; } - _Py_RemoteDebug_CleanupProcHandle(handle); + _Py_RemoteDebug_ClearCache(&self->handle); return result; result_err: - _Py_RemoteDebug_CleanupProcHandle(handle); + _Py_RemoteDebug_ClearCache(&self->handle); Py_XDECREF(result); return NULL; } +/*[clinic input] +@critical_section +_remote_debugging.RemoteUnwinder.get_async_stack_trace -static PyMethodDef methods[] = { - {"get_stack_trace", get_stack_trace, METH_VARARGS, - "Get the Python stack from a given pid"}, - {"get_async_stack_trace", get_async_stack_trace, METH_VARARGS, - "Get the asyncio stack from a given pid"}, - {"get_all_awaited_by", get_all_awaited_by, METH_VARARGS, - "Get all tasks and their awaited_by from a given pid"}, - {NULL, NULL, 0, NULL}, +Returns information about the currently running async task and its stack trace. + +Returns a tuple of (task_info, stack_frames) where: +- task_info is a tuple of (task_id, task_name) identifying the task +- stack_frames is a list of tuples (function_name, filename, line_number) representing + the Python stack frames for the task, ordered from most recent to oldest + +Example: + ((4345585712, 'Task-1'), [ + ('run_echo_server', 'server.py', 127), + ('serve_forever', 'server.py', 45), + ('main', 'app.py', 23) + ]) + +Raises: + RuntimeError: If AsyncioDebug section is not available in the target process + RuntimeError: If there is an error copying memory from the target process + OSError: If there is an error accessing the target process + PermissionError: If access to the target process is denied + UnicodeDecodeError: If there is an error decoding strings from the target process + +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject *self) +/*[clinic end generated code: output=6433d52b55e87bbe input=11b7150c59d4c60f]*/ +{ + if (!self->async_debug_offsets_available) { + PyErr_SetString(PyExc_RuntimeError, "AsyncioDebug section not available"); + return NULL; + } + + PyObject *result = NULL; + PyObject *calls = NULL; + + if (setup_async_result_structure(&result, &calls) < 0) { + goto cleanup; + } + + uintptr_t running_task_addr, running_coro_addr, running_task_code_obj; + if (find_running_task_and_coro(self, &running_task_addr, + &running_coro_addr, &running_task_code_obj) < 0) { + goto cleanup; + } + + if (parse_async_frame_chain(self, calls, running_task_code_obj) < 0) { + goto cleanup; + } + + if (add_task_info_to_result(self, result, running_task_addr) < 0) { + goto cleanup; + } + + _Py_RemoteDebug_ClearCache(&self->handle); + return result; + +cleanup: + _Py_RemoteDebug_ClearCache(&self->handle); + Py_XDECREF(result); + return NULL; +} + +static PyMethodDef RemoteUnwinder_methods[] = { + _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF + _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF + _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF + {NULL, NULL} }; -static struct PyModuleDef module = { - .m_base = PyModuleDef_HEAD_INIT, - .m_name = "_remote_debugging", - .m_size = -1, - .m_methods = methods, +static void +RemoteUnwinder_dealloc(RemoteUnwinderObject *self) +{ + PyTypeObject *tp = Py_TYPE(self); + if (self->code_object_cache) { + _Py_hashtable_destroy(self->code_object_cache); + } +#ifdef Py_GIL_DISABLED + if (self->tlbc_cache) { + 
_Py_hashtable_destroy(self->tlbc_cache); + } +#endif + if (self->handle.pid != 0) { + _Py_RemoteDebug_ClearCache(&self->handle); + _Py_RemoteDebug_CleanupProcHandle(&self->handle); + } + PyObject_Del(self); + Py_DECREF(tp); +} + +static PyType_Slot RemoteUnwinder_slots[] = { + {Py_tp_doc, (void *)"RemoteUnwinder(pid): Inspect stack of a remote Python process."}, + {Py_tp_methods, RemoteUnwinder_methods}, + {Py_tp_init, _remote_debugging_RemoteUnwinder___init__}, + {Py_tp_dealloc, RemoteUnwinder_dealloc}, + {0, NULL} }; -PyMODINIT_FUNC -PyInit__remote_debugging(void) +static PyType_Spec RemoteUnwinder_spec = { + .name = "_remote_debugging.RemoteUnwinder", + .basicsize = sizeof(RemoteUnwinderObject), + .flags = Py_TPFLAGS_DEFAULT, + .slots = RemoteUnwinder_slots, +}; + +/* ============================================================================ + * MODULE INITIALIZATION + * ============================================================================ */ + +static int +_remote_debugging_exec(PyObject *m) { - PyObject* mod = PyModule_Create(&module); - if (mod == NULL) { - return NULL; + RemoteDebuggingState *st = RemoteDebugging_GetState(m); +#define CREATE_TYPE(mod, type, spec) \ + do { \ + type = (PyTypeObject *)PyType_FromMetaclass(NULL, mod, spec, NULL); \ + if (type == NULL) { \ + return -1; \ + } \ + } while (0) + + CREATE_TYPE(m, st->RemoteDebugging_Type, &RemoteUnwinder_spec); + + if (PyModule_AddType(m, st->RemoteDebugging_Type) < 0) { + return -1; } #ifdef Py_GIL_DISABLED - PyUnstable_Module_SetGIL(mod, Py_MOD_GIL_NOT_USED); + PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); #endif - int rc = PyModule_AddIntConstant( - mod, "PROCESS_VM_READV_SUPPORTED", HAVE_PROCESS_VM_READV); + int rc = PyModule_AddIntConstant(m, "PROCESS_VM_READV_SUPPORTED", HAVE_PROCESS_VM_READV); if (rc < 0) { - Py_DECREF(mod); - return NULL; + return -1; } - return mod; + if (RemoteDebugging_InitState(st) < 0) { + return -1; + } + return 0; } + +static int +remote_debugging_traverse(PyObject *mod, visitproc visit, void *arg) +{ + RemoteDebuggingState *state = RemoteDebugging_GetState(mod); + Py_VISIT(state->RemoteDebugging_Type); + return 0; +} + +static int +remote_debugging_clear(PyObject *mod) +{ + RemoteDebuggingState *state = RemoteDebugging_GetState(mod); + Py_CLEAR(state->RemoteDebugging_Type); + return 0; +} + +static void +remote_debugging_free(void *mod) +{ + (void)remote_debugging_clear((PyObject *)mod); +} + +static PyModuleDef_Slot remote_debugging_slots[] = { + {Py_mod_exec, _remote_debugging_exec}, + {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, + {0, NULL}, +}; + +static PyMethodDef remote_debugging_methods[] = { + {NULL, NULL, 0, NULL}, +}; + +static struct PyModuleDef remote_debugging_module = { + PyModuleDef_HEAD_INIT, + .m_name = "_remote_debugging", + .m_size = sizeof(RemoteDebuggingState), + .m_methods = remote_debugging_methods, + .m_slots = remote_debugging_slots, + .m_traverse = remote_debugging_traverse, + .m_clear = remote_debugging_clear, + .m_free = remote_debugging_free, +}; + +PyMODINIT_FUNC +PyInit__remote_debugging(void) +{ + return PyModuleDef_Init(&remote_debugging_module); +} + diff --git a/Modules/clinic/_remote_debugging_module.c.h b/Modules/clinic/_remote_debugging_module.c.h new file mode 100644 index 00000000000000..e83e2fd7fd2b5b --- /dev/null +++ b/Modules/clinic/_remote_debugging_module.c.h @@ -0,0 +1,243 @@ +/*[clinic input] +preserve +[clinic start generated code]*/ + +#if defined(Py_BUILD_CORE) && 
!defined(Py_BUILD_CORE_MODULE) +# include "pycore_gc.h" // PyGC_Head +# include "pycore_runtime.h" // _Py_ID() +#endif +#include "pycore_critical_section.h"// Py_BEGIN_CRITICAL_SECTION() +#include "pycore_modsupport.h" // _PyArg_UnpackKeywords() + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, +"RemoteUnwinder(pid, *, all_threads=False)\n" +"--\n" +"\n" +"Initialize a new RemoteUnwinder object for debugging a remote Python process.\n" +"\n" +"Args:\n" +" pid: Process ID of the target Python process to debug\n" +" all_threads: If True, initialize state for all threads in the process.\n" +" If False, only initialize for the main thread.\n" +"\n" +"The RemoteUnwinder provides functionality to inspect and debug a running Python\n" +"process, including examining thread states, stack frames and other runtime data.\n" +"\n" +"Raises:\n" +" PermissionError: If access to the target process is denied\n" +" OSError: If unable to attach to the target process or access its memory\n" +" RuntimeError: If unable to read debug information from the target process"); + +static int +_remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, + int pid, int all_threads); + +static int +_remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObject *kwargs) +{ + int return_value = -1; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"pid", "all_threads", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "RemoteUnwinder", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject * const *fastargs; + Py_ssize_t nargs = PyTuple_GET_SIZE(args); + Py_ssize_t noptargs = nargs + (kwargs ? 
PyDict_GET_SIZE(kwargs) : 0) - 1; + int pid; + int all_threads = 0; + + fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!fastargs) { + goto exit; + } + pid = PyLong_AsInt(fastargs[0]); + if (pid == -1 && PyErr_Occurred()) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + all_threads = PyObject_IsTrue(fastargs[1]); + if (all_threads < 0) { + goto exit; + } +skip_optional_kwonly: + return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stack_trace__doc__, +"get_stack_trace($self, /)\n" +"--\n" +"\n" +"Returns a list of stack traces for all threads in the target process.\n" +"\n" +"Each element in the returned list is a tuple of (thread_id, frame_list), where:\n" +"- thread_id is the OS thread identifier\n" +"- frame_list is a list of tuples (function_name, filename, line_number) representing\n" +" the Python stack frames for that thread, ordered from most recent to oldest\n" +"\n" +"Example:\n" +" [\n" +" (1234, [\n" +" (\'process_data\', \'worker.py\', 127),\n" +" (\'run_worker\', \'worker.py\', 45),\n" +" (\'main\', \'app.py\', 23)\n" +" ]),\n" +" (1235, [\n" +" (\'handle_request\', \'server.py\', 89),\n" +" (\'serve_forever\', \'server.py\', 52)\n" +" ])\n" +" ]\n" +"\n" +"Raises:\n" +" RuntimeError: If there is an error copying memory from the target process\n" +" OSError: If there is an error accessing the target process\n" +" PermissionError: If access to the target process is denied\n" +" UnicodeDecodeError: If there is an error decoding strings from the target process"); + +#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF \ + {"get_stack_trace", (PyCFunction)_remote_debugging_RemoteUnwinder_get_stack_trace, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_stack_trace__doc__}, + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self); + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stack_trace(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_RemoteUnwinder_get_stack_trace_impl((RemoteUnwinderObject *)self); + Py_END_CRITICAL_SECTION(); + + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_all_awaited_by__doc__, +"get_all_awaited_by($self, /)\n" +"--\n" +"\n" +"Get all tasks and their awaited_by relationships from the remote process.\n" +"\n" +"This provides a tree structure showing which tasks are waiting for other tasks.\n" +"\n" +"For each task, returns:\n" +"1. The call stack frames leading to where the task is currently executing\n" +"2. The name of the task\n" +"3. 
A list of tasks that this task is waiting for, with their own frames/names/etc\n" +"\n" +"Returns a list of [frames, task_name, subtasks] where:\n" +"- frames: List of (func_name, filename, lineno) showing the call stack\n" +"- task_name: String identifier for the task\n" +"- subtasks: List of tasks being awaited by this task, in same format\n" +"\n" +"Raises:\n" +" RuntimeError: If AsyncioDebug section is not available in the remote process\n" +" MemoryError: If memory allocation fails\n" +" OSError: If reading from the remote process fails\n" +"\n" +"Example output:\n" +"[\n" +" [\n" +" [(\"c5\", \"script.py\", 10), (\"c4\", \"script.py\", 14)],\n" +" \"c2_root\",\n" +" [\n" +" [\n" +" [(\"c1\", \"script.py\", 23)],\n" +" \"sub_main_2\",\n" +" [...]\n" +" ],\n" +" [...]\n" +" ]\n" +" ]\n" +"]"); + +#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF \ + {"get_all_awaited_by", (PyCFunction)_remote_debugging_RemoteUnwinder_get_all_awaited_by, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_all_awaited_by__doc__}, + +static PyObject * +_remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *self); + +static PyObject * +_remote_debugging_RemoteUnwinder_get_all_awaited_by(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl((RemoteUnwinderObject *)self); + Py_END_CRITICAL_SECTION(); + + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_async_stack_trace__doc__, +"get_async_stack_trace($self, /)\n" +"--\n" +"\n" +"Returns information about the currently running async task and its stack trace.\n" +"\n" +"Returns a tuple of (task_info, stack_frames) where:\n" +"- task_info is a tuple of (task_id, task_name) identifying the task\n" +"- stack_frames is a list of tuples (function_name, filename, line_number) representing\n" +" the Python stack frames for the task, ordered from most recent to oldest\n" +"\n" +"Example:\n" +" ((4345585712, \'Task-1\'), [\n" +" (\'run_echo_server\', \'server.py\', 127),\n" +" (\'serve_forever\', \'server.py\', 45),\n" +" (\'main\', \'app.py\', 23)\n" +" ])\n" +"\n" +"Raises:\n" +" RuntimeError: If AsyncioDebug section is not available in the target process\n" +" RuntimeError: If there is an error copying memory from the target process\n" +" OSError: If there is an error accessing the target process\n" +" PermissionError: If access to the target process is denied\n" +" UnicodeDecodeError: If there is an error decoding strings from the target process"); + +#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF \ + {"get_async_stack_trace", (PyCFunction)_remote_debugging_RemoteUnwinder_get_async_stack_trace, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_async_stack_trace__doc__}, + +static PyObject * +_remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject *self); + +static PyObject * +_remote_debugging_RemoteUnwinder_get_async_stack_trace(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl((RemoteUnwinderObject *)self); + Py_END_CRITICAL_SECTION(); + + return return_value; +} +/*[clinic end generated code: output=654772085f1f4bf6 input=a9049054013a1b77]*/ diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 4f06a36a130207..ee869d991d93cd 100644 --- 
a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2364,6 +2364,8 @@ free_monitoring_data(_PyCoMonitoringData *data) static void code_dealloc(PyObject *self) { + PyThreadState *tstate = PyThreadState_GET(); + _Py_atomic_add_uint64(&tstate->interp->_code_object_generation, 1); PyCodeObject *co = _PyCodeObject_CAST(self); _PyObject_ResurrectStart(self); notify_code_watchers(PY_CODE_EVENT_DESTROY, co); diff --git a/Python/index_pool.c b/Python/index_pool.c index 007c81a0fc16ec..520a65938ec6c7 100644 --- a/Python/index_pool.c +++ b/Python/index_pool.c @@ -172,6 +172,9 @@ _PyIndexPool_AllocIndex(_PyIndexPool *pool) else { index = heap_pop(free_indices); } + + pool->tlbc_generation++; + UNLOCK_POOL(pool); return index; } @@ -180,6 +183,7 @@ void _PyIndexPool_FreeIndex(_PyIndexPool *pool, int32_t index) { LOCK_POOL(pool); + pool->tlbc_generation++; heap_add(&pool->free_indices, index); UNLOCK_POOL(pool); } diff --git a/Python/pystate.c b/Python/pystate.c index 4144e6edefc073..0544b15aad1cc8 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -567,6 +567,7 @@ init_interpreter(PyInterpreterState *interp, } interp->sys_profile_initialized = false; interp->sys_trace_initialized = false; + interp->_code_object_generation = 0; interp->jit = false; interp->executor_list_head = NULL; interp->executor_deletion_list_head = NULL; @@ -777,6 +778,10 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) for (int t = 0; t < PY_MONITORING_TOOL_IDS; t++) { Py_CLEAR(interp->monitoring_tool_names[t]); } + interp->_code_object_generation = 0; +#ifdef Py_GIL_DISABLED + interp->tlbc_indices.tlbc_generation = 0; +#endif PyConfig_Clear(&interp->config); _PyCodec_Fini(interp); @@ -1346,9 +1351,6 @@ tstate_is_alive(PyThreadState *tstate) // lifecycle //---------- -/* Minimum size of data stack chunk */ -#define DATA_STACK_CHUNK_SIZE (16*1024) - static _PyStackChunk* allocate_chunk(int size_in_bytes, _PyStackChunk* previous) { @@ -2897,7 +2899,7 @@ _PyInterpreterState_HasFeature(PyInterpreterState *interp, unsigned long feature static PyObject ** push_chunk(PyThreadState *tstate, int size) { - int allocate_size = DATA_STACK_CHUNK_SIZE; + int allocate_size = _PY_DATA_STACK_CHUNK_SIZE; while (allocate_size < (int)sizeof(PyObject*)*(size + MINIMUM_OVERHEAD)) { allocate_size *= 2; } diff --git a/Python/remote_debug.h b/Python/remote_debug.h index edc77c302916ca..dbc6bdd09a693f 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -73,19 +73,71 @@ extern "C" { # define HAVE_PROCESS_VM_READV 0 #endif +static inline size_t +get_page_size(void) { + size_t page_size = 0; + if (page_size == 0) { +#ifdef MS_WINDOWS + SYSTEM_INFO si; + GetSystemInfo(&si); + page_size = si.dwPageSize; +#else + page_size = (size_t)getpagesize(); +#endif + } + return page_size; +} + +typedef struct page_cache_entry { + uintptr_t page_addr; // page-aligned base address + char *data; + int valid; + struct page_cache_entry *next; +} page_cache_entry_t; + +#define MAX_PAGES 1024 + // Define a platform-independent process handle structure typedef struct { pid_t pid; -#ifdef MS_WINDOWS +#if defined(__APPLE__) + mach_port_t task; +#elif defined(MS_WINDOWS) HANDLE hProcess; #endif + page_cache_entry_t pages[MAX_PAGES]; + Py_ssize_t page_size; } proc_handle_t; +static void +_Py_RemoteDebug_FreePageCache(proc_handle_t *handle) +{ + for (int i = 0; i < MAX_PAGES; i++) { + PyMem_RawFree(handle->pages[i].data); + handle->pages[i].data = NULL; + handle->pages[i].valid = 0; + } +} + +void +_Py_RemoteDebug_ClearCache(proc_handle_t 
*handle) +{ + for (int i = 0; i < MAX_PAGES; i++) { + handle->pages[i].valid = 0; + } +} + +#if defined(__APPLE__) && TARGET_OS_OSX +static mach_port_t pid_to_task(pid_t pid); +#endif + // Initialize the process handle static int _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) { handle->pid = pid; -#ifdef MS_WINDOWS +#if defined(__APPLE__) + handle->task = pid_to_task(handle->pid); +#elif defined(MS_WINDOWS) handle->hProcess = OpenProcess( PROCESS_VM_READ | PROCESS_VM_WRITE | PROCESS_VM_OPERATION | PROCESS_QUERY_INFORMATION, FALSE, pid); @@ -94,6 +146,11 @@ _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) { return -1; } #endif + handle->page_size = get_page_size(); + for (int i = 0; i < MAX_PAGES; i++) { + handle->pages[i].data = NULL; + handle->pages[i].valid = 0; + } return 0; } @@ -107,6 +164,7 @@ _Py_RemoteDebug_CleanupProcHandle(proc_handle_t *handle) { } #endif handle->pid = 0; + _Py_RemoteDebug_FreePageCache(handle); } #if defined(__APPLE__) && TARGET_OS_OSX @@ -755,7 +813,7 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address #elif defined(__APPLE__) && TARGET_OS_OSX Py_ssize_t result = -1; kern_return_t kr = mach_vm_read_overwrite( - pid_to_task(handle->pid), + handle->task, (mach_vm_address_t)remote_address, len, (mach_vm_address_t)dst, @@ -780,6 +838,59 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address #endif } +int +_Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, + uintptr_t addr, + size_t size, + void *out) +{ + size_t page_size = handle->page_size; + uintptr_t page_base = addr & ~(page_size - 1); + size_t offset_in_page = addr - page_base; + + if (offset_in_page + size > page_size) { + return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); + } + + // Search for valid cached page + for (int i = 0; i < MAX_PAGES; i++) { + page_cache_entry_t *entry = &handle->pages[i]; + if (entry->valid && entry->page_addr == page_base) { + memcpy(out, entry->data + offset_in_page, size); + return 0; + } + } + + // Find reusable slot + for (int i = 0; i < MAX_PAGES; i++) { + page_cache_entry_t *entry = &handle->pages[i]; + if (!entry->valid) { + if (entry->data == NULL) { + entry->data = PyMem_RawMalloc(page_size); + if (entry->data == NULL) { + PyErr_NoMemory(); + return -1; + } + } + + if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) { + // Try to just copy the exact ammount as a fallback + PyErr_Clear(); + goto fallback; + } + + entry->page_addr = page_base; + entry->valid = 1; + memcpy(out, entry->data + offset_in_page, size); + return 0; + } + } + +fallback: + // Cache full — fallback to uncached read + return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); +} + static int _Py_RemoteDebug_ReadDebugOffsets( proc_handle_t *handle, From 7b1a70023127666e98a1fc7f7a3ee04f7bdc4028 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 25 May 2025 21:37:15 +0100 Subject: [PATCH 8/8] Heavily comment Python/perf_jit_trampoline.c to improve maintainability (#134527) Signed-off-by: Pablo Galindo --- Python/perf_jit_trampoline.c | 1414 +++++++++++++++++++++++++--------- 1 file changed, 1029 insertions(+), 385 deletions(-) diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 1211e0e9f112b7..5c7cb5b0a9913c 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -1,241 +1,354 @@ +/* + * Python Perf Trampoline Support - JIT Dump Implementation + * + * This file implements the perf 
jitdump API for Python's performance profiling + * integration. It allows perf (Linux performance analysis tool) to understand + * and profile dynamically generated Python bytecode by creating JIT dump files + * that perf can inject into its analysis. + * + * + * IMPORTANT: This file exports specific callback functions that are part of + * Python's internal API. Do not modify the function signatures or behavior + * of exported functions without coordinating with the Python core team. + * + * Usually the binary and libraries are mapped in separate region like below: + * + * address -> + * --+---------------------+--//--+---------------------+-- + * | .text | .data | ... | | .text | .data | ... | + * --+---------------------+--//--+---------------------+-- + * myprog libc.so + * + * So it'd be easy and straight-forward to find a mapped binary or library from an + * address. + * + * But for JIT code, the code arena only cares about the code section. But the + * resulting DSOs (which is generated by perf inject -j) contain ELF headers and + * unwind info too. Then it'd generate following address space with synthesized + * MMAP events. Let's say it has a sample between address B and C. + * + * sample + * | + * address -> A B v C + * --------------------------------------------------------------------------------------------------- + * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | + * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | + * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | + * ... + * --------------------------------------------------------------------------------------------------- + * + * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see + * the unwind info. If it maps both .text section and unwind sections, the sample + * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing + * which one is right. So to make perf happy we have non-overlapping ranges for each + * DSO: + * + * address -> + * ------------------------------------------------------------------------------------------------------- + * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | + * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | + * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | + * ... + * ------------------------------------------------------------------------------------------------------- + * + * As the trampolines are constant, we add a constant padding but in general the padding needs to have the + * size of the unwind info rounded to 16 bytes. 
In general, for our trampolines this is 0x50 + */ + + + #include "Python.h" #include "pycore_ceval.h" // _PyPerf_Callbacks #include "pycore_frame.h" #include "pycore_interp.h" #include "pycore_runtime.h" // _PyRuntime - #ifdef PY_HAVE_PERF_TRAMPOLINE -#include -#include -#include -#include // mmap() -#include -#include // sysconf() -#include // gettimeofday() -#include - -// ---------------------------------- -// Perf jitdump API -// ---------------------------------- - -typedef struct { - FILE* perf_map; - PyThread_type_lock map_lock; - void* mapped_buffer; - size_t mapped_size; - int code_id; -} PerfMapJitState; - -static PerfMapJitState perf_jit_map_state; +/* Standard library includes for perf jitdump implementation */ +#include // ELF architecture constants +#include // File control operations +#include // Standard I/O operations +#include // Standard library functions +#include // Memory mapping functions (mmap) +#include // System data types +#include // System calls (sysconf, getpid) +#include // Time functions (gettimeofday) +#include // System call interface + +// ============================================================================= +// CONSTANTS AND CONFIGURATION +// ============================================================================= /* -Usually the binary and libraries are mapped in separate region like below: - - address -> - --+---------------------+--//--+---------------------+-- - | .text | .data | ... | | .text | .data | ... | - --+---------------------+--//--+---------------------+-- - myprog libc.so - -So it'd be easy and straight-forward to find a mapped binary or library from an -address. - -But for JIT code, the code arena only cares about the code section. But the -resulting DSOs (which is generated by perf inject -j) contain ELF headers and -unwind info too. Then it'd generate following address space with synthesized -MMAP events. Let's say it has a sample between address B and C. - - sample - | - address -> A B v C - --------------------------------------------------------------------------------------------------- - /tmp/jitted-PID-0.so | (headers) | .text | unwind info | - /tmp/jitted-PID-1.so | (headers) | .text | unwind info | - /tmp/jitted-PID-2.so | (headers) | .text | unwind info | - ... - --------------------------------------------------------------------------------------------------- - -If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see -the unwind info. If it maps both .text section and unwind sections, the sample -could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing -which one is right. So to make perf happy we have non-overlapping ranges for each -DSO: - - address -> - ------------------------------------------------------------------------------------------------------- - /tmp/jitted-PID-0.so | (headers) | .text | unwind info | - /tmp/jitted-PID-1.so | (headers) | .text | unwind info | - /tmp/jitted-PID-2.so | (headers) | .text | unwind info | - ... - ------------------------------------------------------------------------------------------------------- - -As the trampolines are constant, we add a constant padding but in general the padding needs to have the -size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50 + * Memory layout considerations for perf jitdump: + * + * Perf expects non-overlapping memory regions for each JIT-compiled function. 
+ * When perf processes the jitdump file, it creates synthetic DSO (Dynamic + * Shared Object) files that contain: + * - ELF headers + * - .text section (actual machine code) + * - Unwind information (for stack traces) + * + * To ensure proper address space layout, we add padding between code regions. + * This prevents address conflicts when perf maps the synthesized DSOs. + * + * Memory layout example: + * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding] + * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding] + * + * The padding size (0x100) is chosen to accommodate typical unwind info sizes + * while maintaining 16-byte alignment requirements. */ - #define PERF_JIT_CODE_PADDING 0x100 -#define trampoline_api _PyRuntime.ceval.perf.trampoline_api - -typedef uint64_t uword; -typedef const char* CodeComments; -#define Pd "d" -#define MB (1024 * 1024) - -#define EM_386 3 -#define EM_X86_64 62 -#define EM_ARM 40 -#define EM_AARCH64 183 -#define EM_RISCV 243 +/* Convenient access to the global trampoline API state */ +#define trampoline_api _PyRuntime.ceval.perf.trampoline_api -#define TARGET_ARCH_IA32 0 -#define TARGET_ARCH_X64 0 -#define TARGET_ARCH_ARM 0 -#define TARGET_ARCH_ARM64 0 -#define TARGET_ARCH_RISCV32 0 -#define TARGET_ARCH_RISCV64 0 +/* Type aliases for clarity and portability */ +typedef uint64_t uword; // Word-sized unsigned integer +typedef const char* CodeComments; // Code comment strings -#define FLAG_generate_perf_jitdump 0 -#define FLAG_write_protect_code 0 -#define FLAG_write_protect_vm_isolate 0 -#define FLAG_code_comments 0 +/* Memory size constants */ +#define MB (1024 * 1024) // 1 Megabyte for buffer sizing -#define UNREACHABLE() +// ============================================================================= +// ARCHITECTURE-SPECIFIC DEFINITIONS +// ============================================================================= -static uword GetElfMachineArchitecture(void) { -#if TARGET_ARCH_IA32 - return EM_386; -#elif TARGET_ARCH_X64 +/* + * Returns the ELF machine architecture constant for the current platform. + * This is required for the jitdump header to correctly identify the target + * architecture for perf processing. + * + */ +static uint64_t GetElfMachineArchitecture(void) { +#if defined(__x86_64__) || defined(_M_X64) return EM_X86_64; -#elif TARGET_ARCH_ARM - return EM_ARM; -#elif TARGET_ARCH_ARM64 +#elif defined(__i386__) || defined(_M_IX86) + return EM_386; +#elif defined(__aarch64__) return EM_AARCH64; -#elif TARGET_ARCH_RISCV32 || TARGET_ARCH_RISCV64 +#elif defined(__arm__) || defined(_M_ARM) + return EM_ARM; +#elif defined(__riscv) return EM_RISCV; #else - UNREACHABLE(); + Py_UNREACHABLE(); // Unsupported architecture - should never reach here return 0; #endif } +// ============================================================================= +// PERF JITDUMP DATA STRUCTURES +// ============================================================================= + +/* + * Perf jitdump file format structures + * + * These structures define the binary format that perf expects for JIT dump files. + * The format is documented in the Linux perf tools source code and must match + * exactly for proper perf integration. 
+ */ + +/* + * Jitdump file header - written once at the beginning of each jitdump file + * Contains metadata about the process and jitdump format version + */ typedef struct { - uint32_t magic; - uint32_t version; - uint32_t size; - uint32_t elf_mach_target; - uint32_t reserved; - uint32_t process_id; - uint64_t time_stamp; - uint64_t flags; + uint32_t magic; // Magic number (0x4A695444 = "JiTD") + uint32_t version; // Jitdump format version (currently 1) + uint32_t size; // Size of this header structure + uint32_t elf_mach_target; // Target architecture (from GetElfMachineArchitecture) + uint32_t reserved; // Reserved field (must be 0) + uint32_t process_id; // Process ID of the JIT compiler + uint64_t time_stamp; // Timestamp when jitdump was created + uint64_t flags; // Feature flags (currently unused) } Header; - enum PerfEvent { - PerfLoad = 0, - PerfMove = 1, - PerfDebugInfo = 2, - PerfClose = 3, - PerfUnwindingInfo = 4 +/* + * Perf event types supported by the jitdump format + * Each event type has a corresponding structure format + */ +enum PerfEvent { + PerfLoad = 0, // Code load event (new JIT function) + PerfMove = 1, // Code move event (function relocated) + PerfDebugInfo = 2, // Debug information event + PerfClose = 3, // JIT session close event + PerfUnwindingInfo = 4 // Stack unwinding information event }; +/* + * Base event structure - common header for all perf events + * Every event in the jitdump file starts with this structure + */ struct BaseEvent { - uint32_t event; - uint32_t size; - uint64_t time_stamp; - }; + uint32_t event; // Event type (from PerfEvent enum) + uint32_t size; // Total size of this event including payload + uint64_t time_stamp; // Timestamp when event occurred +}; +/* + * Code load event - indicates a new JIT-compiled function is available + * This is the most important event type for Python profiling + */ typedef struct { - struct BaseEvent base; - uint32_t process_id; - uint32_t thread_id; - uint64_t vma; - uint64_t code_address; - uint64_t code_size; - uint64_t code_id; + struct BaseEvent base; // Common event header + uint32_t process_id; // Process ID where code was generated + uint32_t thread_id; // Thread ID where code was generated + uint64_t vma; // Virtual memory address where code is loaded + uint64_t code_address; // Address of the actual machine code + uint64_t code_size; // Size of the machine code in bytes + uint64_t code_id; // Unique identifier for this code region + /* Followed by: + * - null-terminated function name string + * - raw machine code bytes + */ } CodeLoadEvent; +/* + * Code unwinding information event - provides DWARF data for stack traces + * Essential for proper stack unwinding during profiling + */ typedef struct { - struct BaseEvent base; - uint64_t unwind_data_size; - uint64_t eh_frame_hdr_size; - uint64_t mapped_size; + struct BaseEvent base; // Common event header + uint64_t unwind_data_size; // Size of the unwinding data + uint64_t eh_frame_hdr_size; // Size of the EH frame header + uint64_t mapped_size; // Total mapped size (with padding) + /* Followed by: + * - EH frame header + * - DWARF unwinding information + * - Padding to alignment boundary + */ } CodeUnwindingInfoEvent; -static const intptr_t nanoseconds_per_second = 1000000000; - -// Dwarf encoding constants +// ============================================================================= +// GLOBAL STATE MANAGEMENT +// ============================================================================= -static const uint8_t DwarfUData4 = 0x03; -static 
const uint8_t DwarfSData4 = 0x0b; -static const uint8_t DwarfPcRel = 0x10; -static const uint8_t DwarfDataRel = 0x30; -// static uint8_t DwarfOmit = 0xff; +/* + * Global state for the perf jitdump implementation + * + * This structure maintains all the state needed for generating jitdump files. + * It's designed as a singleton since there's typically only one jitdump file + * per Python process. + */ typedef struct { - unsigned char version; - unsigned char eh_frame_ptr_enc; - unsigned char fde_count_enc; - unsigned char table_enc; - int32_t eh_frame_ptr; - int32_t eh_fde_count; - int32_t from; - int32_t to; -} EhFrameHeader; + FILE* perf_map; // File handle for the jitdump file + PyThread_type_lock map_lock; // Thread synchronization lock + void* mapped_buffer; // Memory-mapped region (signals perf we're active) + size_t mapped_size; // Size of the mapped region + int code_id; // Counter for unique code region identifiers +} PerfMapJitState; + +/* Global singleton instance */ +static PerfMapJitState perf_jit_map_state; + +// ============================================================================= +// TIME UTILITIES +// ============================================================================= +/* Time conversion constant */ +static const intptr_t nanoseconds_per_second = 1000000000; + +/* + * Get current monotonic time in nanoseconds + * + * Monotonic time is preferred for event timestamps because it's not affected + * by system clock adjustments. This ensures consistent timing relationships + * between events even if the system clock is changed. + * + * Returns: Current monotonic time in nanoseconds since an arbitrary epoch + */ static int64_t get_current_monotonic_ticks(void) { struct timespec ts; if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { - UNREACHABLE(); + Py_UNREACHABLE(); // Should never fail on supported systems return 0; } - // Convert to nanoseconds. + + /* Convert to nanoseconds for maximum precision */ int64_t result = ts.tv_sec; result *= nanoseconds_per_second; result += ts.tv_nsec; return result; } +/* + * Get current wall clock time in microseconds + * + * Used for the jitdump file header timestamp. Unlike monotonic time, + * this represents actual wall clock time that can be correlated with + * other system events. + * + * Returns: Current time in microseconds since Unix epoch + */ static int64_t get_current_time_microseconds(void) { - // gettimeofday has microsecond resolution. - struct timeval tv; - if (gettimeofday(&tv, NULL) < 0) { - UNREACHABLE(); - return 0; - } - return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; + struct timeval tv; + if (gettimeofday(&tv, NULL) < 0) { + Py_UNREACHABLE(); // Should never fail on supported systems + return 0; + } + return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; } +// ============================================================================= +// UTILITY FUNCTIONS +// ============================================================================= +/* + * Round up a value to the next multiple of a given number + * + * This is essential for maintaining proper alignment requirements in the + * jitdump format. Many structures need to be aligned to specific boundaries + * (typically 8 or 16 bytes) for efficient processing by perf. 
+ * + * Args: + * value: The value to round up + * multiple: The multiple to round up to + * + * Returns: The smallest value >= input that is a multiple of 'multiple' + */ static size_t round_up(int64_t value, int64_t multiple) { if (multiple == 0) { - // Avoid division by zero - return value; + return value; // Avoid division by zero } int64_t remainder = value % multiple; if (remainder == 0) { - // Value is already a multiple of 'multiple' - return value; + return value; // Already aligned } - // Calculate the difference to the next multiple + /* Calculate how much to add to reach the next multiple */ int64_t difference = multiple - remainder; - - // Add the difference to the value int64_t rounded_up_value = value + difference; return rounded_up_value; } +// ============================================================================= +// FILE I/O UTILITIES +// ============================================================================= +/* + * Write data to the jitdump file with error handling + * + * This function ensures that all data is written to the file, handling + * partial writes that can occur with large buffers or when the system + * is under load. + * + * Args: + * buffer: Pointer to data to write + * size: Number of bytes to write + */ static void perf_map_jit_write_fully(const void* buffer, size_t size) { FILE* out_file = perf_jit_map_state.perf_map; const char* ptr = (const char*)(buffer); + while (size > 0) { const size_t written = fwrite(ptr, 1, size, out_file); if (written == 0) { - UNREACHABLE(); + Py_UNREACHABLE(); // Write failure - should be very rare break; } size -= written; @@ -243,284 +356,720 @@ static void perf_map_jit_write_fully(const void* buffer, size_t size) { } } +/* + * Write the jitdump file header + * + * The header must be written exactly once at the beginning of each jitdump + * file. It provides metadata that perf uses to parse the rest of the file. + * + * Args: + * pid: Process ID to include in the header + * out_file: File handle to write to (currently unused, uses global state) + */ static void perf_map_jit_write_header(int pid, FILE* out_file) { Header header; - header.magic = 0x4A695444; - header.version = 1; - header.size = sizeof(Header); - header.elf_mach_target = GetElfMachineArchitecture(); - header.process_id = pid; - header.time_stamp = get_current_time_microseconds(); - header.flags = 0; - perf_map_jit_write_fully(&header, sizeof(header)); -} -static void* perf_map_jit_init(void) { - char filename[100]; - int pid = getpid(); - snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); - const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); - if (fd == -1) { - return NULL; - } + /* Initialize header with required values */ + header.magic = 0x4A695444; // "JiTD" magic number + header.version = 1; // Current jitdump version + header.size = sizeof(Header); // Header size for validation + header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture + header.process_id = pid; // Process identifier + header.time_stamp = get_current_time_microseconds(); // Creation time + header.flags = 0; // No special flags currently used - const long page_size = sysconf(_SC_PAGESIZE); // NOLINT(runtime/int) - if (page_size == -1) { - close(fd); - return NULL; - } - - // The perf jit interface forces us to map the first page of the file - // to signal that we are using the interface. 
- perf_jit_map_state.mapped_buffer = mmap(NULL, page_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0); - if (perf_jit_map_state.mapped_buffer == NULL) { - close(fd); - return NULL; - } - perf_jit_map_state.mapped_size = page_size; - perf_jit_map_state.perf_map = fdopen(fd, "w+"); - if (perf_jit_map_state.perf_map == NULL) { - close(fd); - return NULL; - } - setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); - perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); - - perf_jit_map_state.map_lock = PyThread_allocate_lock(); - if (perf_jit_map_state.map_lock == NULL) { - fclose(perf_jit_map_state.perf_map); - return NULL; - } - perf_jit_map_state.code_id = 0; - - trampoline_api.code_padding = PERF_JIT_CODE_PADDING; - return &perf_jit_map_state; + perf_map_jit_write_fully(&header, sizeof(header)); } -/* DWARF definitions. */ +// ============================================================================= +// DWARF CONSTANTS AND UTILITIES +// ============================================================================= + +/* + * DWARF (Debug With Arbitrary Record Formats) constants + * + * DWARF is a debugging data format used to provide stack unwinding information. + * These constants define the various encoding types and opcodes used in + * DWARF Call Frame Information (CFI) records. + */ +/* DWARF Call Frame Information version */ #define DWRF_CIE_VERSION 1 +/* DWARF CFA (Call Frame Address) opcodes */ enum { - DWRF_CFA_nop = 0x0, - DWRF_CFA_offset_extended = 0x5, - DWRF_CFA_def_cfa = 0xc, - DWRF_CFA_def_cfa_offset = 0xe, - DWRF_CFA_offset_extended_sf = 0x11, - DWRF_CFA_advance_loc = 0x40, - DWRF_CFA_offset = 0x80 + DWRF_CFA_nop = 0x0, // No operation + DWRF_CFA_offset_extended = 0x5, // Extended offset instruction + DWRF_CFA_def_cfa = 0xc, // Define CFA rule + DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset + DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset + DWRF_CFA_advance_loc = 0x40, // Advance location counter + DWRF_CFA_offset = 0x80 // Simple offset instruction }; -enum - { - DWRF_EH_PE_absptr = 0x00, - DWRF_EH_PE_omit = 0xff, - - /* FDE data encoding. */ - DWRF_EH_PE_uleb128 = 0x01, - DWRF_EH_PE_udata2 = 0x02, - DWRF_EH_PE_udata4 = 0x03, - DWRF_EH_PE_udata8 = 0x04, - DWRF_EH_PE_sleb128 = 0x09, - DWRF_EH_PE_sdata2 = 0x0a, - DWRF_EH_PE_sdata4 = 0x0b, - DWRF_EH_PE_sdata8 = 0x0c, - DWRF_EH_PE_signed = 0x08, - - /* FDE flags. 
*/ - DWRF_EH_PE_pcrel = 0x10, - DWRF_EH_PE_textrel = 0x20, - DWRF_EH_PE_datarel = 0x30, - DWRF_EH_PE_funcrel = 0x40, - DWRF_EH_PE_aligned = 0x50, - - DWRF_EH_PE_indirect = 0x80 - }; +/* DWARF Exception Handling pointer encodings */ +enum { + DWRF_EH_PE_absptr = 0x00, // Absolute pointer + DWRF_EH_PE_omit = 0xff, // Omitted value + + /* Data type encodings */ + DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 + DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte + DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte + DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte + DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 + DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte + DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte + DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte + DWRF_EH_PE_signed = 0x08, // Signed flag + + /* Reference type encodings */ + DWRF_EH_PE_pcrel = 0x10, // PC-relative + DWRF_EH_PE_textrel = 0x20, // Text-relative + DWRF_EH_PE_datarel = 0x30, // Data-relative + DWRF_EH_PE_funcrel = 0x40, // Function-relative + DWRF_EH_PE_aligned = 0x50, // Aligned + DWRF_EH_PE_indirect = 0x80 // Indirect +}; +/* Additional DWARF constants for debug information */ enum { DWRF_TAG_compile_unit = 0x11 }; - enum { DWRF_children_no = 0, DWRF_children_yes = 1 }; +enum { + DWRF_AT_name = 0x03, // Name attribute + DWRF_AT_stmt_list = 0x10, // Statement list + DWRF_AT_low_pc = 0x11, // Low PC address + DWRF_AT_high_pc = 0x12 // High PC address +}; +enum { + DWRF_FORM_addr = 0x01, // Address form + DWRF_FORM_data4 = 0x06, // 4-byte data + DWRF_FORM_string = 0x08 // String form +}; -enum { DWRF_AT_name = 0x03, DWRF_AT_stmt_list = 0x10, DWRF_AT_low_pc = 0x11, DWRF_AT_high_pc = 0x12 }; - -enum { DWRF_FORM_addr = 0x01, DWRF_FORM_data4 = 0x06, DWRF_FORM_string = 0x08 }; - -enum { DWRF_LNS_extended_op = 0, DWRF_LNS_copy = 1, DWRF_LNS_advance_pc = 2, DWRF_LNS_advance_line = 3 }; +/* Line number program opcodes */ +enum { + DWRF_LNS_extended_op = 0, // Extended opcode + DWRF_LNS_copy = 1, // Copy operation + DWRF_LNS_advance_pc = 2, // Advance program counter + DWRF_LNS_advance_line = 3 // Advance line number +}; -enum { DWRF_LNE_end_sequence = 1, DWRF_LNE_set_address = 2 }; +/* Line number extended opcodes */ +enum { + DWRF_LNE_end_sequence = 1, // End of sequence + DWRF_LNE_set_address = 2 // Set address +}; +/* + * Architecture-specific DWARF register numbers + * + * These constants define the register numbering scheme used by DWARF + * for each supported architecture. The numbers must match the ABI + * specification for proper stack unwinding. + */ enum { #ifdef __x86_64__ - /* Yes, the order is strange, but correct. 
*/ - DWRF_REG_AX, - DWRF_REG_DX, - DWRF_REG_CX, - DWRF_REG_BX, - DWRF_REG_SI, - DWRF_REG_DI, - DWRF_REG_BP, - DWRF_REG_SP, - DWRF_REG_8, - DWRF_REG_9, - DWRF_REG_10, - DWRF_REG_11, - DWRF_REG_12, - DWRF_REG_13, - DWRF_REG_14, - DWRF_REG_15, - DWRF_REG_RA, + /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ + DWRF_REG_AX, // RAX + DWRF_REG_DX, // RDX + DWRF_REG_CX, // RCX + DWRF_REG_BX, // RBX + DWRF_REG_SI, // RSI + DWRF_REG_DI, // RDI + DWRF_REG_BP, // RBP + DWRF_REG_SP, // RSP + DWRF_REG_8, // R8 + DWRF_REG_9, // R9 + DWRF_REG_10, // R10 + DWRF_REG_11, // R11 + DWRF_REG_12, // R12 + DWRF_REG_13, // R13 + DWRF_REG_14, // R14 + DWRF_REG_15, // R15 + DWRF_REG_RA, // Return address (RIP) #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - DWRF_REG_SP = 31, - DWRF_REG_RA = 30, + /* AArch64 register numbering */ + DWRF_REG_FP = 29, // Frame Pointer + DWRF_REG_RA = 30, // Link register (return address) + DWRF_REG_SP = 31, // Stack pointer #else # error "Unsupported target architecture" #endif }; -typedef struct ELFObjectContext -{ - uint8_t* p; /* Pointer to next address in obj.space. */ - uint8_t* startp; /* Pointer to start address in obj.space. */ - uint8_t* eh_frame_p; /* Pointer to start address in obj.space. */ - uint32_t code_size; /* Size of machine code. */ +/* DWARF encoding constants used in EH frame headers */ +static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data +static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data +static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding +static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding + +// ============================================================================= +// ELF OBJECT CONTEXT +// ============================================================================= + +/* + * Context for building ELF/DWARF structures + * + * This structure maintains state while constructing DWARF unwind information. + * It acts as a simple buffer manager with pointers to track current position + * and important landmarks within the buffer. + */ +typedef struct ELFObjectContext { + uint8_t* p; // Current write position in buffer + uint8_t* startp; // Start of buffer (for offset calculations) + uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) + uint32_t code_size; // Size of the code being described } ELFObjectContext; -/* Append a null-terminated string. */ -static uint32_t -elfctx_append_string(ELFObjectContext* ctx, const char* str) -{ +/* + * EH Frame Header structure for DWARF unwinding + * + * This structure provides metadata about the DWARF unwinding information + * that follows. It's required by the perf jitdump format to enable proper + * stack unwinding during profiling. 
+ */ +typedef struct { + unsigned char version; // EH frame version (always 1) + unsigned char eh_frame_ptr_enc; // Encoding of EH frame pointer + unsigned char fde_count_enc; // Encoding of FDE count + unsigned char table_enc; // Encoding of table entries + int32_t eh_frame_ptr; // Pointer to EH frame data + int32_t eh_fde_count; // Number of FDEs (Frame Description Entries) + int32_t from; // Start address of code range + int32_t to; // End address of code range +} EhFrameHeader; + +// ============================================================================= +// DWARF GENERATION UTILITIES +// ============================================================================= + +/* + * Append a null-terminated string to the ELF context buffer + * + * Args: + * ctx: ELF object context + * str: String to append (must be null-terminated) + * + * Returns: Offset from start of buffer where string was written + */ +static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { uint8_t* p = ctx->p; uint32_t ofs = (uint32_t)(p - ctx->startp); + + /* Copy string including null terminator */ do { *p++ = (uint8_t)*str; } while (*str++); + ctx->p = p; return ofs; } -/* Append a SLEB128 value. */ -static void -elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) -{ +/* + * Append a SLEB128 (Signed Little Endian Base 128) value + * + * SLEB128 is a variable-length encoding used extensively in DWARF. + * It efficiently encodes small numbers in fewer bytes. + * + * Args: + * ctx: ELF object context + * v: Signed value to encode + */ +static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { - *p++ = (uint8_t)((v & 0x7f) | 0x80); + *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit } - *p++ = (uint8_t)(v & 0x7f); + *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit + ctx->p = p; } -/* Append a ULEB128 to buffer. */ -static void -elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) -{ +/* + * Append a ULEB128 (Unsigned Little Endian Base 128) value + * + * Similar to SLEB128 but for unsigned values. + * + * Args: + * ctx: ELF object context + * v: Unsigned value to encode + */ +static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ for (; v >= 0x80; v >>= 7) { - *p++ = (char)((v & 0x7f) | 0x80); + *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit } - *p++ = (char)v; + *p++ = (char)v; // Final byte without continuation bit + ctx->p = p; } -/* Shortcuts to generate DWARF structures. */ -#define DWRF_U8(x) (*p++ = (x)) -#define DWRF_I8(x) (*(int8_t*)p = (x), p++) -#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) -#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) -#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) -#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) -#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) -#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) -#define DWRF_ALIGNNOP(s) \ - while ((uintptr_t)p & ((s)-1)) { \ - *p++ = DWRF_CFA_nop; \ +/* + * Macros for generating DWARF structures + * + * These macros provide a convenient way to write various data types + * to the DWARF buffer while automatically advancing the pointer. 
+ */ +#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit +#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit +#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit +#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit +#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address +#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 +#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 +#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string + +/* Align to specified boundary with NOP instructions */ +#define DWRF_ALIGNNOP(s) \ + while ((uintptr_t)p & ((s)-1)) { \ + *p++ = DWRF_CFA_nop; \ } -#define DWRF_SECTION(name, stmt) \ - { \ - uint32_t* szp_##name = (uint32_t*)p; \ - p += 4; \ - stmt; \ - *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ + +/* Write a DWARF section with automatic size calculation */ +#define DWRF_SECTION(name, stmt) \ + { \ + uint32_t* szp_##name = (uint32_t*)p; \ + p += 4; \ + stmt; \ + *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ } -/* Initialize .eh_frame section. */ -static void -elf_init_ehframe(ELFObjectContext* ctx) -{ +// ============================================================================= +// DWARF EH FRAME GENERATION +// ============================================================================= + +/* + * Initialize DWARF .eh_frame section for a code region + * + * The .eh_frame section contains Call Frame Information (CFI) that describes + * how to unwind the stack at any point in the code. This is essential for + * proper profiling as it allows perf to generate accurate call graphs. + * + * The function generates two main components: + * 1. CIE (Common Information Entry) - describes calling conventions + * 2. FDE (Frame Description Entry) - describes specific function unwinding + * + * Args: + * ctx: ELF object context containing code size and buffer pointers + */ +static void elf_init_ehframe(ELFObjectContext* ctx) { uint8_t* p = ctx->p; - uint8_t* framep = p; - - /* Emit DWARF EH CIE. */ - DWRF_SECTION(CIE, DWRF_U32(0); /* Offset to CIE itself. */ - DWRF_U8(DWRF_CIE_VERSION); - DWRF_STR("zR"); /* Augmentation. */ - DWRF_UV(1); /* Code alignment factor. */ - DWRF_SV(-(int64_t)sizeof(uintptr_t)); /* Data alignment factor. */ - DWRF_U8(DWRF_REG_RA); /* Return address register. */ - DWRF_UV(1); - DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); /* Augmentation data. */ - DWRF_U8(DWRF_CFA_def_cfa); DWRF_UV(DWRF_REG_SP); DWRF_UV(sizeof(uintptr_t)); - DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); DWRF_UV(1); - DWRF_ALIGNNOP(sizeof(uintptr_t)); + uint8_t* framep = p; // Remember start of frame data + + /* + * DWARF Unwind Table for Trampoline Function + * + * This section defines DWARF Call Frame Information (CFI) using encoded macros + * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function + * preserves and restores registers. This is used by profiling tools (e.g., `perf`) + * and debuggers for stack unwinding in JIT-compiled code. + * + * ------------------------------------------------- + * TO REGENERATE THIS TABLE FROM GCC OBJECTS: + * ------------------------------------------------- + * + * 1. 
Create a trampoline source file (e.g., `trampoline.c`): + * + * #include <Python.h> + * typedef PyObject* (*py_evaluator)(void*, void*, int); + * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { + * return evaluator(ts, f, throwflag); + * } + * + * 2. Compile to an object file with frame pointer preservation: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * 3. Extract DWARF unwind info from the object file: + * + * readelf -w trampoline.o + * + * Example output from `.eh_frame`: + * + * 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * 00000014 FDE cie=00000000 pc=0..14 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. + * + * ---------------------------------- + * HOW TO TRANSLATE TO DWRF_* MACROS: + * ---------------------------------- + * + * After compiling your trampoline with: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * run: + * + * readelf -w trampoline.o + * + * to inspect the generated `.eh_frame` data. You will see two main components: + * + * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. + * 2. An FDE (Frame Description Entry): function-specific unwind instructions. + * + * --------------------- + * Translating the CIE: + * --------------------- + * From `readelf -w`, you might see: + * + * 00000000 0000000000000010 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * Augmentation data: 1b + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * Map this to: + * + * DWRF_SECTION(CIE, + * DWRF_U32(0); // CIE ID (always 0 for CIEs) + * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 + * DWRF_STR("zR"); // Augmentation string "zR" + * DWRF_UV(4); // Code alignment factor = 4 + * DWRF_SV(-8); // Data alignment factor = -8 + * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) + * DWRF_UV(1); // Augmentation data length = 1 + * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers + * + * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa + * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) + * DWRF_UV(0); // Offset = 0 + * + * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary + * ) + * + * Notes: + * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. + * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. 
+ * + * --------------------- + * Translating the FDE: + * --------------------- + * From `readelf -w`: + * + * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * Map the FDE header and instructions to: + * + * DWRF_SECTION(FDE, + * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) + * DWRF_U32(-0x30); // Initial PC-relative location of the code + * DWRF_U32(ctx->code_size); // Code range covered by this FDE + * DWRF_U8(0); // Augmentation data length (none) + * + * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + * DWRF_UV(16); + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) + * DWRF_UV(2); // At offset 2 * 8 = 16 bytes + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) + * DWRF_UV(1); // At offset 1 * 8 = 8 bytes + * + * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 + * + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + * DWRF_UV(0); + * ) + * + * To regenerate: + * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. + * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as + * the code is in a different address space every time. + * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: + * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) + * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) + * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset + * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) + * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. + * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. + */ + + /* + * Emit DWARF EH CIE (Common Information Entry) + * + * The CIE describes the calling conventions and basic unwinding rules + * that apply to all functions in this compilation unit. + */ + DWRF_SECTION(CIE, + DWRF_U32(0); // CIE ID (0 indicates this is a CIE) + DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) + DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) + DWRF_UV(1); // Code alignment factor + DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) + DWRF_U8(DWRF_REG_RA); // Return address register number + DWRF_UV(1); // Augmentation data length + DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding + + /* Initial CFI instructions - describe default calling convention */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) + DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size + DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved + DWRF_UV(1); // At offset 1 from CFA + + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary ) - ctx->eh_frame_p = p; - - /* Emit DWARF EH FDE. */ - DWRF_SECTION(FDE, DWRF_U32((uint32_t)(p - framep)); /* Offset to CIE. 
 */ - DWRF_U32(-0x30); /* Machine code offset relative to .text. */ - DWRF_U32(ctx->code_size); /* Machine code length. */ - DWRF_U8(0); /* Augmentation data. */ - /* Registers saved in CFRAME. */ + ctx->eh_frame_p = p; // Remember start of FDE data + + /* + * Emit DWARF EH FDE (Frame Description Entry) + * + * The FDE describes unwinding information specific to this function. + * It references the CIE and provides function-specific CFI instructions. + */ + DWRF_SECTION(FDE, + DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) + DWRF_U32(-0x30); // Machine code offset relative to .text + DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) + DWRF_U8(0); // Augmentation data length (none) + + /* + * Architecture-specific CFI instructions + * + * These instructions describe how registers are saved and restored + * during function calls. Each architecture has different calling + * conventions and register usage patterns. + */ #ifdef __x86_64__ - DWRF_U8(DWRF_CFA_advance_loc | 4); - DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16); - DWRF_U8(DWRF_CFA_advance_loc | 6); - DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(8); - /* Extra registers saved for JIT-compiled code. */ + /* x86_64 calling convention unwinding rules */ + DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance location by 4 bytes + DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset + DWRF_UV(16); // New offset: SP + 16 + DWRF_U8(DWRF_CFA_advance_loc | 6); // Advance location by 6 bytes + DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset + DWRF_UV(8); // New offset: SP + 8 #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - DWRF_U8(DWRF_CFA_advance_loc | 1); - DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16); - DWRF_U8(DWRF_CFA_offset | 29); DWRF_UV(2); - DWRF_U8(DWRF_CFA_offset | 30); DWRF_UV(1); - DWRF_U8(DWRF_CFA_advance_loc | 3); - DWRF_U8(DWRF_CFA_offset | -(64 - 29)); - DWRF_U8(DWRF_CFA_offset | -(64 - 30)); - DWRF_U8(DWRF_CFA_def_cfa_offset); - DWRF_UV(0); + /* AArch64 calling convention unwinding rules */ + DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 instruction (stp x29, x30) + DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset + DWRF_UV(16); // CFA = SP + 16 (stack pointer after push) + DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Frame pointer (x29) saved + DWRF_UV(2); // At offset 2 from CFA (2 * 8 = 16 bytes) + DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Link register (x30) saved + DWRF_UV(1); // At offset 1 from CFA (1 * 8 = 8 bytes) + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...) + DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore frame pointer (x29) + DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore link register (x30) + DWRF_U8(DWRF_CFA_def_cfa_offset); // Final CFA adjustment + DWRF_UV(0); // CFA = SP + 0 (stack restored) + #else # error "Unsupported target architecture" #endif - DWRF_ALIGNNOP(sizeof(uintptr_t));) - ctx->p = p; + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary + ) + + ctx->p = p; // Update context pointer to end of generated data +} + +// ============================================================================= +// JITDUMP INITIALIZATION +// ============================================================================= + +/* + * Initialize the perf jitdump interface + * + * This function sets up everything needed to generate jitdump files: + * 1. Creates the jitdump file with a unique name + * 2. 
Maps the first page to signal perf that we're using the interface + * 3. Writes the jitdump header + * 4. Initializes synchronization primitives + * + * The memory mapping is crucial - perf detects jitdump files by scanning + * for processes that have mapped files matching the pattern /tmp/jit-*.dump + * + * Returns: Pointer to initialized state, or NULL on failure + */ +static void* perf_map_jit_init(void) { + char filename[100]; + int pid = getpid(); + + /* Create unique filename based on process ID */ + snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); + + /* Create/open the jitdump file with appropriate permissions */ + const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); + if (fd == -1) { + return NULL; // Failed to create file + } + + /* Get system page size for memory mapping */ + const long page_size = sysconf(_SC_PAGESIZE); + if (page_size == -1) { + close(fd); + return NULL; // Failed to get page size + } + + /* + * Map the first page of the jitdump file + * + * This memory mapping serves as a signal to perf that this process + * is generating JIT code. Perf scans /proc/.../maps looking for mapped + * files that match the jitdump naming pattern. + * + * The mapping must be PROT_READ | PROT_EXEC to be detected by perf. + */ + perf_jit_map_state.mapped_buffer = mmap( + NULL, // Let kernel choose address + page_size, // Map one page + PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf) + MAP_PRIVATE, // Private mapping + fd, // File descriptor + 0 // Offset 0 (first page) + ); + + if (perf_jit_map_state.mapped_buffer == NULL) { + close(fd); + return NULL; // Memory mapping failed + } + + perf_jit_map_state.mapped_size = page_size; + + /* Convert file descriptor to FILE* for easier I/O operations */ + perf_jit_map_state.perf_map = fdopen(fd, "w+"); + if (perf_jit_map_state.perf_map == NULL) { + close(fd); + return NULL; // Failed to create FILE* + } + + /* + * Set up file buffering for better performance + * + * We use a large buffer (2MB) because jitdump files can be written + * frequently during program execution. Buffering reduces system call + * overhead and improves overall performance. + */ + setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); + + /* Write the jitdump file header */ + perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); + + /* + * Initialize thread synchronization lock + * + * Multiple threads may attempt to write to the jitdump file + * simultaneously. This lock ensures thread-safe access to the + * global jitdump state. + */ + perf_jit_map_state.map_lock = PyThread_allocate_lock(); + if (perf_jit_map_state.map_lock == NULL) { + fclose(perf_jit_map_state.perf_map); + return NULL; // Failed to create lock + } + + /* Initialize code ID counter */ + perf_jit_map_state.code_id = 0; + + /* Configure trampoline API with padding information */ + trampoline_api.code_padding = PERF_JIT_CODE_PADDING; + + return &perf_jit_map_state; } +// ============================================================================= +// MAIN JITDUMP ENTRY WRITING +// ============================================================================= + +/* + * Write a complete jitdump entry for a Python function + * + * This is the main function called by Python's trampoline system whenever + * a new piece of JIT-compiled code needs to be recorded. It writes both + * the unwinding information and the code load event to the jitdump file. + * + * The function performs these steps: + * 1. 
Initialize jitdump system if not already done + * 2. Extract function name and filename from Python code object + * 3. Generate DWARF unwinding information + * 4. Write unwinding info event to jitdump file + * 5. Write code load event to jitdump file + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * code_addr: Address where the compiled code resides + * code_size: Size of the compiled code in bytes + * co: Python code object containing metadata + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. + */ static void perf_map_jit_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) + unsigned int code_size, PyCodeObject *co) { - + /* Initialize jitdump system on first use */ if (perf_jit_map_state.perf_map == NULL) { void* ret = perf_map_jit_init(); if(ret == NULL){ - return; + return; // Initialization failed, silently abort } } + /* + * Extract function information from Python code object + * + * We create a human-readable function name by combining the qualified + * name (includes class/module context) with the filename. This helps + * developers identify functions in perf reports. + */ const char *entry = ""; if (co->co_qualname != NULL) { entry = PyUnicode_AsUTF8(co->co_qualname); } + const char *filename = ""; if (co->co_filename != NULL) { filename = PyUnicode_AsUTF8(co->co_filename); } - + /* + * Create formatted function name for perf display + * + * Format: "py::<function_name>:<filename>" + * The "py::" prefix helps identify Python functions in mixed-language + * profiles (e.g., when profiling C extensions alongside Python code). + */ size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); if (perf_map_entry == NULL) { - return; + return; // Memory allocation failed } snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); @@ -528,90 +1077,185 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, uword base = (uword)code_addr; uword size = code_size; - // Write the code unwinding info event. - - // Create unwinding information (eh frame) + /* + * Generate DWARF unwinding information + * + * DWARF data is essential for proper stack unwinding during profiling. + * Without it, perf cannot generate accurate call graphs, especially + * in optimized code where frame pointers may be omitted. + */ ELFObjectContext ctx; - char buffer[1024]; + char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) ctx.code_size = code_size; ctx.startp = ctx.p = (uint8_t*)buffer; + + /* Generate EH frame (Exception Handling frame) data */ elf_init_ehframe(&ctx); int eh_frame_size = ctx.p - ctx.startp; - // Populate the unwind info event for perf + /* + * Write Code Unwinding Information Event + * + * This event must be written before the code load event to ensure + * perf has the unwinding information available when it processes + * the code region. 
+ */ CodeUnwindingInfoEvent ev2; ev2.base.event = PerfUnwindingInfo; ev2.base.time_stamp = get_current_monotonic_ticks(); ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; - // Ensure we have enough space between DSOs when perf maps them + + /* Verify we don't exceed our padding budget */ assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING); + ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); - ev2.mapped_size = round_up(ev2.unwind_data_size, 16); + ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment + + /* Calculate total event size with padding */ int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; - int padding_size = round_up(content_size, 8) - content_size; + int padding_size = round_up(content_size, 8) - content_size; // 8-byte align ev2.base.size = content_size + padding_size; - perf_map_jit_write_fully(&ev2, sizeof(ev2)); + /* Write the unwinding info event header */ + perf_map_jit_write_fully(&ev2, sizeof(ev2)); - // Populate the eh Frame header + /* + * Write EH Frame Header + * + * The EH frame header provides metadata about the DWARF unwinding + * information that follows. It includes pointers and counts that + * help perf navigate the unwinding data efficiently. + */ EhFrameHeader f; f.version = 1; - f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; - f.fde_count_enc = DwarfUData4; - f.table_enc = DwarfSData4 | DwarfDataRel; + f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte + f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count + f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte + + /* Calculate relative offsets for EH frame navigation */ f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); - f.eh_fde_count = 1; + f.eh_fde_count = 1; // We generate exactly one FDE per function f.from = -(round_up(code_size, 8) + eh_frame_size); + int cie_size = ctx.eh_frame_p - ctx.startp; f.to = -(eh_frame_size - cie_size); + /* Write EH frame data and header */ perf_map_jit_write_fully(ctx.startp, eh_frame_size); perf_map_jit_write_fully(&f, sizeof(f)); + /* Write padding to maintain alignment */ char padding_bytes[] = "\0\0\0\0\0\0\0\0"; perf_map_jit_write_fully(&padding_bytes, padding_size); - // Write the code load event. + /* + * Write Code Load Event + * + * This event tells perf about the new code region. 
It includes: + * - Memory addresses and sizes + * - Process and thread identification + * - Function name for symbol resolution + * - The actual machine code bytes + */ CodeLoadEvent ev; ev.base.event = PerfLoad; ev.base.size = sizeof(ev) + (name_length+1) + size; ev.base.time_stamp = get_current_monotonic_ticks(); ev.process_id = getpid(); - ev.thread_id = syscall(SYS_gettid); - ev.vma = base; - ev.code_address = base; + ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call + ev.vma = base; // Virtual memory address + ev.code_address = base; // Same as VMA for our use case ev.code_size = size; + + /* Assign unique code ID and increment counter */ perf_jit_map_state.code_id += 1; ev.code_id = perf_jit_map_state.code_id; + /* Write code load event and associated data */ perf_map_jit_write_fully(&ev, sizeof(ev)); - perf_map_jit_write_fully(perf_map_entry, name_length+1); - perf_map_jit_write_fully((void*)(base), size); - return; + perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator + perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code + + /* Clean up allocated memory */ + PyMem_RawFree(perf_map_entry); } +// ============================================================================= +// CLEANUP AND FINALIZATION +// ============================================================================= + +/* + * Finalize and cleanup the perf jitdump system + * + * This function is called when Python is shutting down or when the + * perf trampoline system is being disabled. It ensures all resources + * are properly released and all buffered data is flushed to disk. + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * + * Returns: 0 on success + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. + */ static int perf_map_jit_fini(void* state) { + /* + * Close jitdump file with proper synchronization + * + * We need to acquire the lock to ensure no other threads are + * writing to the file when we close it. This prevents corruption + * and ensures all data is properly flushed. + */ if (perf_jit_map_state.perf_map != NULL) { - // close the file PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); - fclose(perf_jit_map_state.perf_map); + fclose(perf_jit_map_state.perf_map); // This also flushes buffers PyThread_release_lock(perf_jit_map_state.map_lock); - // clean up the lock and state + /* Clean up synchronization primitive */ PyThread_free_lock(perf_jit_map_state.map_lock); perf_jit_map_state.perf_map = NULL; } + + /* + * Unmap the memory region + * + * This removes the signal to perf that we were generating JIT code. + * After this point, perf will no longer detect this process as + * having JIT capabilities. + */ if (perf_jit_map_state.mapped_buffer != NULL) { munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); + perf_jit_map_state.mapped_buffer = NULL; } + + /* Clear global state reference */ trampoline_api.state = NULL; - return 0; + + return 0; // Success } +// ============================================================================= +// PUBLIC API EXPORT +// ============================================================================= + +/* + * Python Perf Callbacks Structure + * + * This structure defines the callback interface that Python's trampoline + * system uses to integrate with perf profiling. 
It contains function + * pointers for initialization, event writing, and cleanup. + * + * CRITICAL: This structure and its contents are part of Python's internal + * API. The function signatures and behavior must remain stable to maintain + * compatibility with the Python interpreter's perf integration system. + * + * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h + */ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { - &perf_map_jit_init, - &perf_map_jit_write_entry, - &perf_map_jit_fini, + &perf_map_jit_init, // Initialization function + &perf_map_jit_write_entry, // Event writing function + &perf_map_jit_fini, // Cleanup function }; -#endif +#endif /* PY_HAVE_PERF_TRAMPOLINE */ \ No newline at end of file
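
As an illustration of the jitdump layout described in the comments of this patch (not part of the patch itself), the sketch below writes a header plus a single PerfLoad record with the same field layout: the 0x4A695444 magic, version 1, the /tmp/jit-PID.dump naming, wall-clock microseconds for the header timestamp, and CLOCK_MONOTONIC nanoseconds for the event timestamp all come from the code above. The symbol name "py::example:demo.py", the four code bytes, and the hard-coded EM_X86_64 value are made-up placeholders. A real producer additionally mmaps the first page of the file PROT_READ | PROT_EXEC into its own address space, as perf_map_jit_init() does; this sketch only shows the byte layout.

/* jitdump_sketch.c - illustrative only; mirrors the record layout above. */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>

typedef struct {
    uint32_t magic, version, size, elf_mach_target, reserved, process_id;
    uint64_t time_stamp, flags;
} Header;                                    /* same layout as in the patch */

typedef struct { uint32_t event, size; uint64_t time_stamp; } BaseEvent;

typedef struct {
    BaseEvent base;
    uint32_t process_id, thread_id;
    uint64_t vma, code_address, code_size, code_id;
} CodeLoadEvent;

static uint64_t monotonic_ns(void) {         /* event timestamps: CLOCK_MONOTONIC */
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

int main(void) {
    char path[64];
    snprintf(path, sizeof(path), "/tmp/jit-%d.dump", getpid());
    FILE *out = fopen(path, "wb");
    if (out == NULL) {
        return 1;
    }

    struct timeval tv;
    gettimeofday(&tv, NULL);                 /* header timestamp: wall clock, microseconds */

    Header h = {0};
    h.magic = 0x4A695444;                    /* "JiTD" */
    h.version = 1;
    h.size = sizeof(h);
    h.elf_mach_target = 62;                  /* EM_X86_64; placeholder, adjust per platform */
    h.process_id = (uint32_t)getpid();
    h.time_stamp = (uint64_t)tv.tv_sec * 1000000 + (uint64_t)tv.tv_usec;
    fwrite(&h, sizeof(h), 1, out);

    const char name[] = "py::example:demo.py";           /* placeholder symbol name */
    const uint8_t code[] = {0x90, 0x90, 0x90, 0xc3};     /* placeholder code bytes  */

    CodeLoadEvent ev = {0};
    ev.base.event = 0;                       /* PerfLoad */
    ev.base.size = sizeof(ev) + sizeof(name) + sizeof(code);
    ev.base.time_stamp = monotonic_ns();
    ev.process_id = (uint32_t)getpid();
    ev.thread_id = (uint32_t)syscall(SYS_gettid);
    ev.vma = ev.code_address = (uint64_t)(uintptr_t)code;
    ev.code_size = sizeof(code);
    ev.code_id = 0;
    fwrite(&ev, sizeof(ev), 1, out);
    fwrite(name, sizeof(name), 1, out);      /* name including NUL terminator */
    fwrite(code, sizeof(code), 1, out);      /* the "machine code" bytes */

    fclose(out);
    return 0;
}

With a real producer, the usual consumption workflow is perf record -k 1 (so the recorded clock matches the CLOCK_MONOTONIC timestamps in the events) followed by perf inject --jit, which materializes the synthetic jitted-PID-N.so DSOs described at the top of this file.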
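The ULEB128/SLEB128 helpers in the patch only encode values. As a self-contained sanity check of the encoding rule documented there (seven data bits per byte, continuation bit set on every byte except the last), here is a small round-trip sketch; it re-implements the unsigned encoder with the same loop as elfctx_append_uleb128() rather than calling into CPython, so it is purely illustrative.

/* uleb128_roundtrip.c - illustrative only; mirrors elfctx_append_uleb128(). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Encode: 7 data bits per byte, MSB set while more bytes follow. */
static size_t uleb128_encode(uint8_t *p, uint32_t v) {
    size_t n = 0;
    for (; v >= 0x80; v >>= 7) {
        p[n++] = (uint8_t)((v & 0x7f) | 0x80);   /* continuation bit */
    }
    p[n++] = (uint8_t)v;                         /* final byte */
    return n;
}

/* Decode: accumulate 7 bits at a time until a byte without the MSB. */
static uint32_t uleb128_decode(const uint8_t *p, size_t *consumed) {
    uint32_t v = 0;
    unsigned shift = 0;
    size_t n = 0;
    uint8_t byte;
    do {
        byte = p[n++];
        v |= (uint32_t)(byte & 0x7f) << shift;
        shift += 7;
    } while (byte & 0x80);
    *consumed = n;
    return v;
}

int main(void) {
    uint8_t buf[8];
    const uint32_t samples[] = {0, 1, 127, 128, 300, 1000000};
    for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        size_t enc = uleb128_encode(buf, samples[i]);
        size_t dec = 0;
        uint32_t back = uleb128_decode(buf, &dec);
        assert(enc == dec && back == samples[i]);
        printf("%lu encodes to %zu byte(s)\n", (unsigned long)samples[i], enc);
    }
    return 0;
}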