Skip to content

Commit 81c8eb8

Browse files
authored
gh-138122: Add blocking mode for accurate stack traces in Tachyon (#142998)
1 parent f9704f1 commit 81c8eb8

File tree

15 files changed

+855
-77
lines changed

15 files changed

+855
-77
lines changed

Doc/library/profiling.sampling.rst

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,8 @@ The default configuration works well for most use cases:
342342
- Disabled
343343
* - Default for ``--subprocesses``
344344
- Disabled
345+
* - Default for ``--blocking``
346+
- Disabled (non-blocking sampling)
345347

346348

347349
Sampling interval and duration
@@ -392,6 +394,50 @@ This option is particularly useful when investigating concurrency issues or
392394
when work is distributed across a thread pool.
393395

394396

397+
.. _blocking-mode:
398+
399+
Blocking mode
400+
-------------
401+
402+
By default, Tachyon reads the target process's memory without stopping it.
403+
This non-blocking approach is ideal for most profiling scenarios because it
404+
imposes virtually zero overhead on the target application: the profiled
405+
program runs at full speed and is unaware it is being observed.
406+
407+
However, non-blocking sampling can occasionally produce incomplete or
408+
inconsistent stack traces in applications with many generators or coroutines
409+
that rapidly switch between yield points, or in programs with very fast-changing
410+
call stacks where functions enter and exit between the start and end of a single
411+
stack read, resulting in reconstructed stacks that mix frames from different
412+
execution states or that never actually existed.
413+
414+
For these cases, the :option:`--blocking` option stops the target process during
415+
each sample::
416+
417+
python -m profiling.sampling run --blocking script.py
418+
python -m profiling.sampling attach --blocking 12345
419+
420+
When blocking mode is enabled, the profiler suspends the target process,
421+
reads its stack, then resumes it. This guarantees that each captured stack
422+
represents a real, consistent snapshot of what the process was doing at that
423+
instant. The trade-off is that the target process runs slower because it is
424+
repeatedly paused.
425+
426+
.. warning::
427+
428+
Do not use very high sample rates (low ``--interval`` values) with blocking
429+
mode. Suspending and resuming a process takes time, and if the sampling
430+
interval is too short, the target will spend more time stopped than running.
431+
For blocking mode, intervals of 1000 microseconds (1 millisecond) or higher
432+
are recommended. The default 100 microsecond interval may cause noticeable
433+
slowdown in the target application.
434+
435+
Use blocking mode only when you observe inconsistent stacks in your profiles,
436+
particularly with generator-heavy or coroutine-heavy code. For most
437+
applications, the default non-blocking mode provides accurate results with
438+
zero impact on the target process.
439+
440+
395441
Special frames
396442
--------------
397443

@@ -1383,6 +1429,13 @@ Sampling options
13831429
Also profile subprocesses. Each subprocess gets its own profiler
13841430
instance and output file. Incompatible with ``--live``.
13851431

1432+
.. option:: --blocking
1433+
1434+
Pause the target process during each sample. This ensures consistent
1435+
stack traces at the cost of slowing down the target. Use with longer
1436+
intervals (1000 µs or higher) to minimize impact. See :ref:`blocking-mode`
1437+
for details.
1438+
13861439

13871440
Mode options
13881441
------------

Lib/profiling/sampling/cli.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,13 @@ def _add_sampling_options(parser):
347347
action="store_true",
348348
help="Also profile subprocesses. Each subprocess gets its own profiler and output file.",
349349
)
350+
sampling_group.add_argument(
351+
"--blocking",
352+
action="store_true",
353+
help="Stop all threads in target process before sampling to get consistent snapshots. "
354+
"Uses thread_suspend on macOS and ptrace on Linux. Adds overhead but ensures memory "
355+
"reads are from a frozen state.",
356+
)
350357

351358

352359
def _add_mode_options(parser):
@@ -585,6 +592,15 @@ def _validate_args(args, parser):
585592
if getattr(args, 'command', None) == "replay":
586593
return
587594

595+
# Warn about blocking mode with aggressive sampling intervals
596+
if args.blocking and args.interval < 100:
597+
print(
598+
f"Warning: --blocking with a {args.interval} µs interval will stop all threads "
599+
f"{1_000_000 // args.interval} times per second. "
600+
"Consider using --interval 1000 or higher to reduce overhead.",
601+
file=sys.stderr
602+
)
603+
588604
# Check if live mode is available
589605
if hasattr(args, 'live') and args.live and LiveStatsCollector is None:
590606
parser.error(
@@ -861,6 +877,7 @@ def _handle_attach(args):
861877
native=args.native,
862878
gc=args.gc,
863879
opcodes=args.opcodes,
880+
blocking=args.blocking,
864881
)
865882
_handle_output(collector, args, args.pid, mode)
866883

@@ -939,6 +956,7 @@ def _handle_run(args):
939956
native=args.native,
940957
gc=args.gc,
941958
opcodes=args.opcodes,
959+
blocking=args.blocking,
942960
)
943961
_handle_output(collector, args, process.pid, mode)
944962
finally:
@@ -984,6 +1002,7 @@ def _handle_live_attach(args, pid):
9841002
native=args.native,
9851003
gc=args.gc,
9861004
opcodes=args.opcodes,
1005+
blocking=args.blocking,
9871006
)
9881007

9891008

@@ -1031,6 +1050,7 @@ def _handle_live_run(args):
10311050
native=args.native,
10321051
gc=args.gc,
10331052
opcodes=args.opcodes,
1053+
blocking=args.blocking,
10341054
)
10351055
finally:
10361056
# Clean up the subprocess

Lib/profiling/sampling/sample.py

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import _remote_debugging
2+
import contextlib
23
import os
34
import statistics
45
import sys
@@ -7,7 +8,26 @@
78
from collections import deque
89
from _colorize import ANSIColors
910

11+
from .pstats_collector import PstatsCollector
12+
from .stack_collector import CollapsedStackCollector, FlamegraphCollector
13+
from .heatmap_collector import HeatmapCollector
14+
from .gecko_collector import GeckoCollector
1015
from .binary_collector import BinaryCollector
16+
17+
18+
@contextlib.contextmanager
19+
def _pause_threads(unwinder, blocking):
20+
"""Context manager to pause/resume threads around sampling if blocking is True."""
21+
if blocking:
22+
unwinder.pause_threads()
23+
try:
24+
yield
25+
finally:
26+
unwinder.resume_threads()
27+
else:
28+
yield
29+
30+
1131
from .constants import (
1232
PROFILING_MODE_WALL,
1333
PROFILING_MODE_CPU,
@@ -25,12 +45,13 @@
2545

2646

2747
class SampleProfiler:
28-
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, opcodes=False, skip_non_matching_threads=True, collect_stats=False):
48+
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, opcodes=False, skip_non_matching_threads=True, collect_stats=False, blocking=False):
2949
self.pid = pid
3050
self.sample_interval_usec = sample_interval_usec
3151
self.all_threads = all_threads
3252
self.mode = mode # Store mode for later use
3353
self.collect_stats = collect_stats
54+
self.blocking = blocking
3455
try:
3556
self.unwinder = self._new_unwinder(native, gc, opcodes, skip_non_matching_threads)
3657
except RuntimeError as err:
@@ -60,12 +81,11 @@ def sample(self, collector, duration_sec=10, *, async_aware=False):
6081
running_time = 0
6182
num_samples = 0
6283
errors = 0
84+
interrupted = False
6385
start_time = next_time = time.perf_counter()
6486
last_sample_time = start_time
6587
realtime_update_interval = 1.0 # Update every second
6688
last_realtime_update = start_time
67-
interrupted = False
68-
6989
try:
7090
while running_time < duration_sec:
7191
# Check if live collector wants to stop
@@ -75,14 +95,15 @@ def sample(self, collector, duration_sec=10, *, async_aware=False):
7595
current_time = time.perf_counter()
7696
if next_time < current_time:
7797
try:
78-
if async_aware == "all":
79-
stack_frames = self.unwinder.get_all_awaited_by()
80-
elif async_aware == "running":
81-
stack_frames = self.unwinder.get_async_stack_trace()
82-
else:
83-
stack_frames = self.unwinder.get_stack_trace()
84-
collector.collect(stack_frames)
85-
except ProcessLookupError:
98+
with _pause_threads(self.unwinder, self.blocking):
99+
if async_aware == "all":
100+
stack_frames = self.unwinder.get_all_awaited_by()
101+
elif async_aware == "running":
102+
stack_frames = self.unwinder.get_async_stack_trace()
103+
else:
104+
stack_frames = self.unwinder.get_stack_trace()
105+
collector.collect(stack_frames)
106+
except ProcessLookupError as e:
86107
duration_sec = current_time - start_time
87108
break
88109
except (RuntimeError, UnicodeDecodeError, MemoryError, OSError):
@@ -350,6 +371,7 @@ def sample(
350371
native=False,
351372
gc=True,
352373
opcodes=False,
374+
blocking=False,
353375
):
354376
"""Sample a process using the provided collector.
355377
@@ -365,6 +387,7 @@ def sample(
365387
native: Whether to include native frames
366388
gc: Whether to include GC frames
367389
opcodes: Whether to include opcode information
390+
blocking: Whether to stop all threads before sampling for consistent snapshots
368391
369392
Returns:
370393
The collector with collected samples
@@ -390,6 +413,7 @@ def sample(
390413
opcodes=opcodes,
391414
skip_non_matching_threads=skip_non_matching_threads,
392415
collect_stats=realtime_stats,
416+
blocking=blocking,
393417
)
394418
profiler.realtime_stats = realtime_stats
395419

@@ -411,6 +435,7 @@ def sample_live(
411435
native=False,
412436
gc=True,
413437
opcodes=False,
438+
blocking=False,
414439
):
415440
"""Sample a process in live/interactive mode with curses TUI.
416441
@@ -426,6 +451,7 @@ def sample_live(
426451
native: Whether to include native frames
427452
gc: Whether to include GC frames
428453
opcodes: Whether to include opcode information
454+
blocking: Whether to stop all threads before sampling for consistent snapshots
429455
430456
Returns:
431457
The collector with collected samples
@@ -451,6 +477,7 @@ def sample_live(
451477
opcodes=opcodes,
452478
skip_non_matching_threads=skip_non_matching_threads,
453479
collect_stats=realtime_stats,
480+
blocking=blocking,
454481
)
455482
profiler.realtime_stats = realtime_stats
456483

Lib/test/test_external_inspection.py

Lines changed: 39 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -2931,24 +2931,24 @@ def top():
29312931
"Test only runs on Linux with process_vm_readv support",
29322932
)
29332933
def test_partial_stack_reuse(self):
2934-
"""Test that unchanged bottom frames are reused when top changes (A→B→C to A→B→D)."""
2934+
"""Test that unchanged parent frames are reused from cache when top frame moves."""
29352935
script_body = """\
2936-
def func_c():
2937-
sock.sendall(b"at_c")
2936+
def level4():
2937+
sock.sendall(b"sync1")
29382938
sock.recv(16)
2939-
2940-
def func_d():
2941-
sock.sendall(b"at_d")
2939+
sock.sendall(b"sync2")
29422940
sock.recv(16)
29432941
2944-
def func_b():
2945-
func_c()
2946-
func_d()
2942+
def level3():
2943+
level4()
29472944
2948-
def func_a():
2949-
func_b()
2945+
def level2():
2946+
level3()
2947+
2948+
def level1():
2949+
level2()
29502950
2951-
func_a()
2951+
level1()
29522952
"""
29532953

29542954
with self._target_process(script_body) as (
@@ -2958,55 +2958,51 @@ def func_a():
29582958
):
29592959
unwinder = make_unwinder(cache_frames=True)
29602960

2961-
# Sample at C: stack is A→B→C
2962-
frames_c = self._sample_frames(
2961+
# Sample 1: level4 at first sendall
2962+
frames1 = self._sample_frames(
29632963
client_socket,
29642964
unwinder,
2965-
b"at_c",
2965+
b"sync1",
29662966
b"ack",
2967-
{"func_a", "func_b", "func_c"},
2967+
{"level1", "level2", "level3", "level4"},
29682968
)
2969-
# Sample at D: stack is A→B→D (C returned, D called)
2970-
frames_d = self._sample_frames(
2969+
# Sample 2: level4 at second sendall (same stack, different line)
2970+
frames2 = self._sample_frames(
29712971
client_socket,
29722972
unwinder,
2973-
b"at_d",
2973+
b"sync2",
29742974
b"done",
2975-
{"func_a", "func_b", "func_d"},
2975+
{"level1", "level2", "level3", "level4"},
29762976
)
29772977

2978-
self.assertIsNotNone(frames_c)
2979-
self.assertIsNotNone(frames_d)
2978+
self.assertIsNotNone(frames1)
2979+
self.assertIsNotNone(frames2)
29802980

2981-
# Find func_a and func_b frames in both samples
29822981
def find_frame(frames, funcname):
29832982
for f in frames:
29842983
if f.funcname == funcname:
29852984
return f
29862985
return None
29872986

2988-
frame_a_in_c = find_frame(frames_c, "func_a")
2989-
frame_b_in_c = find_frame(frames_c, "func_b")
2990-
frame_a_in_d = find_frame(frames_d, "func_a")
2991-
frame_b_in_d = find_frame(frames_d, "func_b")
2992-
2993-
self.assertIsNotNone(frame_a_in_c)
2994-
self.assertIsNotNone(frame_b_in_c)
2995-
self.assertIsNotNone(frame_a_in_d)
2996-
self.assertIsNotNone(frame_b_in_d)
2997-
2998-
# The bottom frames (A, B) should be the SAME objects (cache reuse)
2999-
self.assertIs(
3000-
frame_a_in_c,
3001-
frame_a_in_d,
3002-
"func_a frame should be reused from cache",
3003-
)
3004-
self.assertIs(
3005-
frame_b_in_c,
3006-
frame_b_in_d,
3007-
"func_b frame should be reused from cache",
2987+
# level4 should have different line numbers (it moved)
2988+
l4_1 = find_frame(frames1, "level4")
2989+
l4_2 = find_frame(frames2, "level4")
2990+
self.assertIsNotNone(l4_1)
2991+
self.assertIsNotNone(l4_2)
2992+
self.assertNotEqual(
2993+
l4_1.location.lineno,
2994+
l4_2.location.lineno,
2995+
"level4 should be at different lines",
30082996
)
30092997

2998+
# Parent frames (level1, level2, level3) should be reused from cache
2999+
for name in ["level1", "level2", "level3"]:
3000+
f1 = find_frame(frames1, name)
3001+
f2 = find_frame(frames2, name)
3002+
self.assertIsNotNone(f1, f"{name} missing from sample 1")
3003+
self.assertIsNotNone(f2, f"{name} missing from sample 2")
3004+
self.assertIs(f1, f2, f"{name} should be reused from cache")
3005+
30103006
@skip_if_not_supported
30113007
@unittest.skipIf(
30123008
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,

0 commit comments

Comments
 (0)