Skip to content

Commit c70e876

Browse files
authored
feat: Add flamegraph support to transpiler (#216)
This PR adds flamegraph generation support to the transpiler. I've manually verified it by integrating it into the airbender-platform (locally for now). It does so without depending on `risc_v_simulator`, since one of the goals is to remove the obsolete `risc_v_simulator` completely. Once this is merged, I'll start working on removing `risc_v_simulator` from the repository.
1 parent 60da8cd commit c70e876

File tree

10 files changed

+613
-0
lines changed

10 files changed

+613
-0
lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,13 +175,16 @@ gpu_prover = { path = "./gpu_prover", default-features = false }
175175

176176
serde = { version = "1", default-features = false, features = ["derive", "alloc"] }
177177
clap = { version = "4.5.21", features = ["derive"] }
178+
addr2line_new = { package = "addr2line", version = "0.25" }
179+
object = "0.37"
178180
# rand = {version = "0.8", default-features = false, features = ["std_rng"] }
179181
rand = { version = "0.9", default-features = false }
180182
unroll = "0.1"
181183
seq-macro = "0.3"
182184
super-seq-macro = "0.3"
183185
arrayvec = { version = "0.7", default-features = false }
184186
itertools = { version = "0.14" }
187+
inferno = "0.12"
185188
log = "0.4"
186189
sha3 = { version = "*", default-features = false }
187190

riscv_transpiler/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ risc_v_simulator = { workspace = true, features = ["delegation"] }
1818
worker = { workspace = true }
1919
serde = { workspace = true }
2020
seq-macro = { workspace = true }
21+
inferno = { workspace = true, optional = true }
22+
addr2line_new = { workspace = true, optional = true }
23+
object = { workspace = true, optional = true }
2124
serde-big-array = "*"
2225
keccak = "*"
2326

@@ -27,6 +30,7 @@ capstone = {version = "0.13", optional = true }
2730

2831
[features]
2932
jit = ["dynasmrt", "riscv-decode", "capstone"]
33+
flamegraph = ["dep:inferno", "dep:addr2line_new", "dep:object"]
3034
default = ["jit"]
3135

3236
[dev-dependencies]
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
use std::collections::HashMap;
2+
3+
use super::symbolizer::Addr2LineContext;
4+
5+
/// Converts raw sampled stacks into `inferno`'s collapsed stack format.
6+
pub(super) fn build_collapsed_stack_lines(
7+
frames: &[(u32, Vec<u32>)],
8+
symbolizer: &Addr2LineContext<'_>,
9+
) -> Vec<String> {
10+
// Hot PCs appear in many samples, so we resolve symbols once per address.
11+
let mut symbol_cache: HashMap<u32, Vec<String>> = HashMap::new();
12+
let mut collapsed_line_counts: HashMap<String, usize> = HashMap::new();
13+
// Temporary merged stack for one sample before collapsing into a line.
14+
let mut buffer = Vec::with_capacity(64);
15+
16+
for (pc, callsites) in frames.iter() {
17+
buffer.clear();
18+
19+
if let Some(names) = try_frames_for_pc(symbolizer, &mut symbol_cache, *pc) {
20+
append_frames_with_overlap(&mut buffer, names);
21+
}
22+
23+
// Stack unwinding and DWARF frame expansion can produce overlapping
24+
// frame sequences. We merge overlaps to avoid duplicate path segments.
25+
for callsite_pc in callsites.iter().copied().skip(1) {
26+
let Some(names) = try_frames_for_pc(symbolizer, &mut symbol_cache, callsite_pc) else {
27+
continue;
28+
};
29+
append_frames_with_overlap(&mut buffer, names);
30+
}
31+
32+
if buffer.is_empty() {
33+
continue;
34+
}
35+
36+
// `inferno` collapsed format expects root-first paths separated by `;`.
37+
// Our merged buffer is leaf-first, so we reverse it at serialization.
38+
let mut line = String::with_capacity(buffer.len() * 16 + 12);
39+
for (idx, el) in buffer.iter().rev().enumerate() {
40+
if idx > 0 {
41+
line.push(';');
42+
}
43+
line.push_str(el);
44+
}
45+
46+
*collapsed_line_counts.entry(line).or_default() += 1;
47+
}
48+
49+
let mut remapped = Vec::with_capacity(collapsed_line_counts.len());
50+
for (line, count) in collapsed_line_counts.into_iter() {
51+
let mut line_with_count = line;
52+
line_with_count.push(' ');
53+
line_with_count.push_str(&count.to_string());
54+
remapped.push(line_with_count);
55+
}
56+
57+
remapped
58+
}
59+
60+
#[inline(always)]
61+
fn try_frames_for_pc<'a>(
62+
symbolizer: &Addr2LineContext<'_>,
63+
symbol_cache: &'a mut HashMap<u32, Vec<String>>,
64+
pc: u32,
65+
) -> Option<&'a [String]> {
66+
// Unaligned PCs are not valid instruction addresses in this VM and are
67+
// usually artifacts of incomplete stack data.
68+
if pc % 4 != 0 {
69+
return None;
70+
}
71+
72+
if symbol_cache.contains_key(&pc) == false {
73+
let frames = symbolizer.collect_frames(pc);
74+
symbol_cache.insert(pc, frames);
75+
}
76+
77+
let names = symbol_cache
78+
.get(&pc)
79+
.expect("symbol cache must contain a value");
80+
if names.is_empty() {
81+
None
82+
} else {
83+
Some(names)
84+
}
85+
}
86+
87+
/// Appends `names` to `buffer` without repeating frames the two share.
///
/// The longest prefix of `names` that is also a suffix of `buffer` is treated
/// as overlap; only the remainder of `names` is cloned onto the end of
/// `buffer`. An empty `names` leaves `buffer` untouched, and an empty
/// `buffer` receives all of `names`.
#[inline(always)]
fn append_frames_with_overlap(buffer: &mut Vec<String>, names: &[String]) {
    // An overlap can never be longer than the shorter of the two sequences.
    let longest_candidate = names.len().min(buffer.len());

    // Try candidates from longest to shortest so the first hit is the maximum.
    let mut shared = 0;
    for len in (1..=longest_candidate).rev() {
        if buffer.ends_with(&names[..len]) {
            shared = len;
            break;
        }
    }

    buffer.extend_from_slice(&names[shared..]);
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
use std::path::PathBuf;
2+
3+
/// Settings that drive VM flamegraph generation.
///
/// Sample collection (cheap, during execution) is deliberately kept apart
/// from symbolization/rendering (heavier, after execution); the fields below
/// tune both phases.
#[derive(Debug, Clone)]
pub struct FlamegraphConfig {
    /// Path of the ELF/object file supplying the symbols and DWARF info used
    /// to resolve PCs.
    pub symbols_path: PathBuf,
    /// Path of the SVG file written by `inferno`.
    pub output_path: PathBuf,
    /// Whether stacks are rendered in reverse order.
    pub reverse_graph: bool,
    /// Sampling period: one sample is taken every `frequency_recip` VM cycles.
    ///
    /// Larger values reduce runtime overhead but may hide short-lived frames.
    pub frequency_recip: usize,
}

impl FlamegraphConfig {
    /// Builds a configuration for the given symbol and output paths.
    ///
    /// The remaining knobs get defaults biased toward low overhead while
    /// keeping a readable graph: stacks are not reversed and one sample is
    /// taken every 100 cycles.
    pub fn new(symbols_path: PathBuf, output_path: PathBuf) -> Self {
        let reverse_graph = false;
        let frequency_recip = 100;

        FlamegraphConfig {
            symbols_path,
            output_path,
            reverse_graph,
            frequency_recip,
        }
    }
}
33+
34+
/// Counters describing how effective sampling was.
///
/// Comparing the two fields shows how many sampling points actually yielded
/// a usable stack.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct FlamegraphSampleStats {
    /// Number of sampling points that were attempted.
    pub samples_total: usize,
    /// Number of sampling points that produced a non-empty stack trace.
    pub samples_collected: usize,
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
mod collapse;
2+
mod config;
3+
mod profiler;
4+
mod ram;
5+
mod stacktrace;
6+
mod symbolizer;
7+
8+
pub use self::config::{FlamegraphConfig, FlamegraphSampleStats};
9+
pub use self::profiler::VmFlamegraphProfiler;
10+
pub use self::ram::FlamegraphReadableRam;
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
use super::collapse::build_collapsed_stack_lines;
2+
use super::config::{FlamegraphConfig, FlamegraphSampleStats};
3+
use super::ram::FlamegraphReadableRam;
4+
use super::stacktrace::collect_stacktrace_raw;
5+
use super::symbolizer::Addr2LineContext;
6+
use crate::vm::{Counters, State};
7+
8+
/// Coordinates the flamegraph pipeline:
9+
/// 1) collect lightweight raw samples during execution,
10+
/// 2) symbolize and render once execution finishes.
11+
pub struct VmFlamegraphProfiler {
12+
config: FlamegraphConfig,
13+
symbol_binary: Vec<u8>,
14+
raw_frames: Vec<(u32, Vec<u32>)>,
15+
stats: FlamegraphSampleStats,
16+
}
17+
18+
impl VmFlamegraphProfiler {
19+
pub fn new(config: FlamegraphConfig) -> std::io::Result<Self> {
20+
// Zero would both disable progress and cause division-by-zero.
21+
if config.frequency_recip == 0 {
22+
return Err(std::io::Error::new(
23+
std::io::ErrorKind::InvalidInput,
24+
"frequency_recip must be greater than zero",
25+
));
26+
}
27+
28+
let symbol_binary = std::fs::read(&config.symbols_path)?;
29+
30+
Ok(Self {
31+
config,
32+
symbol_binary,
33+
raw_frames: Vec::new(),
34+
stats: FlamegraphSampleStats::default(),
35+
})
36+
}
37+
38+
pub fn stats(&self) -> FlamegraphSampleStats {
39+
self.stats
40+
}
41+
42+
#[inline(always)]
43+
pub fn sample_cycle<C: Counters, R: FlamegraphReadableRam>(
44+
&mut self,
45+
state: &State<C>,
46+
ram: &R,
47+
cycle: usize,
48+
) {
49+
// Sampling is on the VM hot path, so we keep this branch and data
50+
// collection minimal and defer expensive work to finalization.
51+
if cycle % self.config.frequency_recip != 0 {
52+
return;
53+
}
54+
55+
self.stats.samples_total += 1;
56+
57+
let (pc, frames) = collect_stacktrace_raw(state, ram);
58+
if frames.is_empty() == false {
59+
// Empty stacks are expected when we cannot reconstruct a valid frame
60+
// chain; they are tracked via stats but not emitted.
61+
self.stats.samples_collected += 1;
62+
self.raw_frames.push((pc, frames));
63+
}
64+
}
65+
66+
pub fn write_flamegraph(&mut self) -> std::io::Result<()> {
67+
// Symbolization is deferred to here to keep execution-time sampling
68+
// overhead predictable and low.
69+
let symbolizer = Addr2LineContext::new(&self.symbol_binary)?;
70+
71+
let collapsed_lines = build_collapsed_stack_lines(&self.raw_frames, &symbolizer);
72+
73+
let collapsed_lines = if collapsed_lines.is_empty() {
74+
// Produce a minimal graph instead of failing when no usable samples
75+
// were collected.
76+
vec![String::from("no_samples 1")]
77+
} else {
78+
collapsed_lines
79+
};
80+
81+
let output_file = std::fs::File::create(&self.config.output_path)?;
82+
let mut options = inferno::flamegraph::Options::default();
83+
options.reverse_stack_order = self.config.reverse_graph;
84+
inferno::flamegraph::from_lines(
85+
&mut options,
86+
collapsed_lines.iter().map(String::as_str),
87+
output_file,
88+
)
89+
.map_err(|error| {
90+
std::io::Error::new(
91+
std::io::ErrorKind::Other,
92+
format!("while attempting to generate flamegraph: {error}"),
93+
)
94+
})?;
95+
96+
// The profiler can be reused across VM runs with the same config.
97+
self.raw_frames.clear();
98+
99+
Ok(())
100+
}
101+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
use crate::vm::{RamPeek, RamWithRomRegion};
2+
3+
/// Minimal RAM contract needed by the flamegraph unwinder.
4+
///
5+
/// `RamPeek` is intentionally low-level and uses debug assertions for bounds,
6+
/// so flamegraph collection adds an explicit checked read API to keep profiling
7+
/// robust even when frame-pointer metadata is malformed.
8+
pub trait FlamegraphReadableRam: RamPeek {
9+
fn total_words_for_flamegraph(&self) -> usize;
10+
11+
#[inline(always)]
12+
fn try_peek_word_for_flamegraph(&self, address: u32) -> Option<u32> {
13+
if address % 4 != 0 {
14+
return None;
15+
}
16+
17+
let word_idx = (address / 4) as usize;
18+
if word_idx >= self.total_words_for_flamegraph() {
19+
return None;
20+
}
21+
22+
Some(self.peek_word(address))
23+
}
24+
}
25+
26+
impl<const N: usize> FlamegraphReadableRam for [u32; N] {
27+
#[inline(always)]
28+
fn total_words_for_flamegraph(&self) -> usize {
29+
N
30+
}
31+
}
32+
33+
impl FlamegraphReadableRam for [u32] {
34+
#[inline(always)]
35+
fn total_words_for_flamegraph(&self) -> usize {
36+
self.len()
37+
}
38+
}
39+
40+
impl<const ROM_BOUND_SECOND_WORD_BITS: usize> FlamegraphReadableRam
41+
for RamWithRomRegion<ROM_BOUND_SECOND_WORD_BITS>
42+
{
43+
#[inline(always)]
44+
fn total_words_for_flamegraph(&self) -> usize {
45+
self.backing.len()
46+
}
47+
}

0 commit comments

Comments
 (0)