Skip to content

Commit 289abce

Browse files
fix(perf-metrics): perf instrumentation slow (#1936)
The per-instruction call to calculate memory access adapter heights is very slow because it iterates linearly through the entire adapter record. I changed it now to only do this at the end. The downside is that we won't be able to see the contribution to access adapters on a per-instruction basis, but this could be extracted more efficiently in a different way. Proof it works: https://github.com/axiom-crypto/openvm-reth-benchmark/actions/runs/16767319820
1 parent 3fb163e commit 289abce

File tree

7 files changed

+104
-69
lines changed

7 files changed

+104
-69
lines changed

ci/scripts/metric_unify/flamegraph.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ def get_stack_lines(metrics_dict, group_by_kvs, stack_keys, metric_name, sum_met
6060
function_symbols = [get_function_symbol(string_table, offset) for offset in symbol_offsets]
6161
stack_values.extend(function_symbols)
6262
else:
63+
# don't make a stack frame for empty label
64+
if labels[key] == '':
65+
continue
6366
stack_values.append(labels[key])
6467
if filter:
6568
continue

crates/vm/src/arch/interpreter_preflight.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ where
5353
self.execute_instruction(state)?;
5454
state.instret += 1;
5555
}
56+
5657
Ok(())
5758
}
5859

@@ -106,7 +107,7 @@ where
106107

107108
#[cfg(feature = "metrics")]
108109
{
109-
crate::metrics::update_instruction_metrics(state, executor, pc_entry);
110+
crate::metrics::update_instruction_metrics(state, executor, pc, pc_entry);
110111
}
111112

112113
Ok(())

crates/vm/src/arch/vm.rs

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -446,9 +446,14 @@ where
446446
let mut exec_state = VmExecState::new(vm_state, ctx);
447447
execute_spanned!("execute_e3", instance, &mut exec_state)?;
448448
let filtered_exec_frequencies = instance.handler.filtered_execution_frequencies();
449-
let mut memory = exec_state.vm_state.memory;
450-
let touched_memory = memory.finalize::<Val<E::SC>>(system_config.continuation_enabled);
449+
let touched_memory = exec_state
450+
.vm_state
451+
.memory
452+
.finalize::<Val<E::SC>>(system_config.continuation_enabled);
453+
#[cfg(feature = "perf-metrics")]
454+
crate::metrics::end_segment_metrics(&mut exec_state);
451455

456+
let memory = exec_state.vm_state.memory;
452457
let to_state = ExecutionState::new(exec_state.vm_state.pc, memory.timestamp());
453458
let public_values = system_config
454459
.has_public_values_chip()
@@ -507,7 +512,12 @@ where
507512
state.metrics.debug_infos = exe.program.debug_infos();
508513
}
509514
#[cfg(feature = "perf-metrics")]
510-
state.metrics.set_pk_info(&self.pk);
515+
{
516+
state.metrics.set_pk_info(&self.pk);
517+
state.metrics.num_sys_airs = self.config().as_ref().num_airs();
518+
state.metrics.access_adapter_offset =
519+
self.config().as_ref().access_adapter_air_id_offset();
520+
}
511521
state
512522
}
513523

@@ -1195,32 +1205,31 @@ mod vm_metrics {
11951205
use metrics::counter;
11961206

11971207
use super::*;
1198-
use crate::metrics::get_dyn_trace_heights_from_arenas;
1208+
use crate::arch::Arena;
11991209

12001210
impl<E, VB> VirtualMachine<E, VB>
12011211
where
12021212
E: StarkEngine,
12031213
VB: VmBuilder<E>,
12041214
{
1205-
/// See [`metrics::get_trace_heights_from_arenas`](crate::metrics::get_trace_heights_from_arenas).
1206-
/// In addition, this function includes the constant trace heights and the used height of
1207-
/// the program trace.
1215+
/// Assumed that `record_arenas` has length equal to number of AIRs.
1216+
///
1217+
/// Best effort calculation of the used trace heights per chip without padding to powers of
1218+
/// two. This is best effort because some periphery chips may not have record arenas to
1219+
/// instrument. This function includes the constant trace heights, and the used height of
1220+
/// the program trace. It does not include the memory access adapter trace heights,
1221+
/// which is included in `SystemChipComplex::finalize_trace_heights`.
12081222
pub(crate) fn get_trace_heights_from_arenas(
12091223
&self,
12101224
system_records: &SystemRecords<Val<E::SC>>,
12111225
record_arenas: &[VB::RecordArena],
12121226
) -> Vec<usize> {
12131227
let num_airs = self.num_airs();
12141228
assert_eq!(num_airs, record_arenas.len());
1215-
let sys_config = self.config().as_ref();
1216-
let num_sys_airs = sys_config.num_airs();
1217-
let access_adapter_offset = sys_config.access_adapter_air_id_offset();
1218-
let mut heights = get_dyn_trace_heights_from_arenas::<Val<E::SC>, _>(
1219-
num_sys_airs,
1220-
access_adapter_offset,
1221-
&system_records.access_adapter_records,
1222-
record_arenas,
1223-
);
1229+
let mut heights: Vec<usize> = record_arenas
1230+
.iter()
1231+
.map(|arena| arena.current_trace_height())
1232+
.collect();
12241233
// If there are any constant trace heights, set them
12251234
for (pk, height) in zip(&self.pk.per_air, &mut heights) {
12261235
if let Some(constant_height) =
@@ -1250,7 +1259,8 @@ mod vm_metrics {
12501259
counter!("main_cells_used").absolute(main_cells_used as u64);
12511260
counter!("total_cells_used").absolute(total_cells_used as u64);
12521261

1253-
if self.config().as_ref().profiling {
1262+
#[cfg(feature = "perf-metrics")]
1263+
{
12541264
for (name, value) in zip(self.air_names(), heights) {
12551265
let labels = [("air_name", name.to_string())];
12561266
counter!("rows_used", &labels).absolute(*value as u64);

crates/vm/src/metrics/mod.rs

Lines changed: 55 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,8 @@ use openvm_instructions::{
1111
use openvm_stark_backend::prover::{hal::ProverBackend, types::DeviceMultiStarkProvingKey};
1212

1313
use crate::{
14-
arch::{
15-
execution_mode::tracegen::TracegenCtx, Arena, DenseRecordArena, PreflightExecutor,
16-
VmExecState,
17-
},
18-
system::{
19-
memory::{adapter::AccessAdapterInventory, online::TracingMemory},
20-
program::PcEntry,
21-
},
14+
arch::{execution_mode::tracegen::TracegenCtx, Arena, PreflightExecutor, VmExecState},
15+
system::{memory::online::TracingMemory, program::PcEntry},
2216
};
2317

2418
pub mod cycle_tracker;
@@ -60,6 +54,7 @@ pub struct VmMetrics {
6054
pub fn update_instruction_metrics<F, RA, Executor>(
6155
state: &mut VmExecState<F, TracingMemory, TracegenCtx<RA>>,
6256
executor: &mut Executor,
57+
prev_pc: u32, // the pc of the instruction executed, state.pc is next pc
6358
pc_entry: &PcEntry<F>,
6459
) where
6560
F: Clone + Send + Sync,
@@ -80,59 +75,68 @@ pub fn update_instruction_metrics<F, RA, Executor>(
8075
let opcode = pc_entry.insn.opcode;
8176
let opcode_name = executor.get_opcode_name(opcode.as_usize());
8277

83-
let num_sys_airs = state.metrics.num_sys_airs;
84-
let access_adapter_offset = state.metrics.access_adapter_offset;
85-
let debug_info = state.metrics.debug_infos.get(pc);
78+
let debug_info = state.metrics.debug_infos.get(prev_pc);
8679
let dsl_instr = debug_info.as_ref().map(|info| info.dsl_instruction.clone());
8780

88-
let now_trace_heights = get_dyn_trace_heights_from_arenas::<F, _>(
89-
num_sys_airs,
90-
access_adapter_offset,
91-
&state.memory.access_adapter_records,
92-
&state.ctx.arenas,
93-
);
94-
let mut now_trace_cells = now_trace_heights;
95-
96-
let metrics = &mut state.metrics;
97-
for (main_width, cell_count) in zip(&metrics.main_widths, &mut now_trace_cells) {
98-
*cell_count *= main_width;
99-
}
100-
metrics.update_trace_cells(now_trace_cells, opcode_name, dsl_instr);
101-
102-
metrics.update_current_fn(pc);
81+
let now_trace_heights: Vec<usize> = state
82+
.ctx
83+
.arenas
84+
.iter()
85+
.map(|arena| arena.current_trace_height())
86+
.collect();
87+
let now_trace_cells = zip(&state.metrics.main_widths, &now_trace_heights)
88+
.map(|(main_width, h)| main_width * h)
89+
.collect_vec();
90+
state
91+
.metrics
92+
.update_trace_cells(now_trace_cells, opcode_name, dsl_instr);
93+
94+
state.metrics.update_current_fn(pc);
10395
}
10496
}
10597

106-
/// Assumed that `record_arenas` has length equal to number of AIRs.
107-
///
108-
/// Best effort calculation of the used trace heights per chip without padding to powers of
109-
/// two. This is best effort because some periphery chips may not have record arenas
110-
/// to instrument.
111-
///
112-
/// Does not include constant trace heights or the used program trace height.
113-
pub(crate) fn get_dyn_trace_heights_from_arenas<F, RA>(
114-
num_sys_airs: usize,
115-
access_adapter_offset: usize,
116-
access_adapter_records: &DenseRecordArena,
117-
record_arenas: &[RA],
118-
) -> Vec<usize>
98+
// Memory access adapter height calculation is slow, so only do it if this is the end of
99+
// execution.
100+
// We also clear the current trace cell counts so there aren't negative diffs at the start of the
101+
// next segment.
102+
#[cfg(feature = "perf-metrics")]
103+
pub fn end_segment_metrics<F, RA>(state: &mut VmExecState<F, TracingMemory, TracegenCtx<RA>>)
119104
where
120105
F: Clone + Send + Sync,
121106
RA: Arena,
122107
{
123-
// First, get used heights from record arenas
124-
let mut heights: Vec<usize> = record_arenas
125-
.iter()
126-
.map(|arena| arena.current_trace_height())
127-
.collect();
128-
// Memory is special case, so extract the memory AIR's trace heights from the special
129-
// arena
108+
use std::iter::zip;
109+
110+
use crate::system::memory::adapter::AccessAdapterInventory;
111+
112+
let access_adapter_offset = state.metrics.access_adapter_offset;
113+
let num_sys_airs = state.metrics.num_sys_airs;
114+
let mut now_heights = vec![0; num_sys_airs - access_adapter_offset];
130115
AccessAdapterInventory::<F>::compute_heights_from_arena(
131-
access_adapter_records,
132-
&mut heights[access_adapter_offset..num_sys_airs],
116+
&state.memory.access_adapter_records,
117+
&mut now_heights,
133118
);
134-
135-
heights
119+
let now_trace_cells = zip(
120+
&state.metrics.main_widths[access_adapter_offset..],
121+
&now_heights,
122+
)
123+
.map(|(main_width, h)| main_width * h)
124+
.collect_vec();
125+
for (air_name, &now_value) in itertools::izip!(
126+
&state.metrics.air_names[access_adapter_offset..],
127+
&now_trace_cells,
128+
) {
129+
if now_value != 0 {
130+
let labels = [
131+
("air_name", air_name.clone()),
132+
("opcode", String::default()),
133+
("dsl_ir", String::default()),
134+
("cycle_tracker_span", "memory_access_adapters".to_owned()),
135+
];
136+
counter!("cells_used", &labels).increment(now_value as u64);
137+
}
138+
}
139+
state.metrics.current_trace_cells.fill(0);
136140
}
137141

138142
impl VmMetrics {
@@ -151,6 +155,7 @@ impl VmMetrics {
151155
self.air_names = air_names;
152156
self.main_widths = main_widths;
153157
self.total_widths = total_widths;
158+
self.current_trace_cells = vec![0; self.air_names.len()];
154159
}
155160

156161
pub fn update_trace_cells(

crates/vm/src/system/memory/adapter/mod.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ pub struct AccessAdapterInventory<F> {
5050
chips: Vec<GenericAccessAdapterChip<F>>,
5151
#[getset(set = "pub")]
5252
arena: DenseRecordArena,
53+
#[cfg(feature = "metrics")]
54+
pub(crate) trace_heights: Vec<usize>,
5355
}
5456

5557
impl<F: Clone + Send + Sync> AccessAdapterInventory<F> {
@@ -77,6 +79,8 @@ impl<F: Clone + Send + Sync> AccessAdapterInventory<F> {
7779
memory_config,
7880
chips,
7981
arena: DenseRecordArena::with_byte_capacity(0),
82+
#[cfg(feature = "metrics")]
83+
trace_heights: Vec::new(),
8084
}
8185
}
8286

@@ -177,6 +181,10 @@ impl<F: Clone + Send + Sync> AccessAdapterInventory<F> {
177181
.zip(heights.iter())
178182
.map(|(&width, &height)| RowMajorMatrix::new(vec![F::ZERO; width * height], width))
179183
.collect::<Vec<_>>();
184+
#[cfg(feature = "metrics")]
185+
{
186+
self.trace_heights = heights;
187+
}
180188

181189
let mut trace_ptrs = vec![0; num_adapters];
182190

crates/vm/src/system/memory/controller/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ pub struct MemoryController<F: Field> {
7878
pub range_checker: SharedVariableRangeCheckerChip,
7979
// Store separately to avoid smart pointer reference each time
8080
range_checker_bus: VariableRangeCheckerBus,
81-
access_adapter_inventory: AccessAdapterInventory<F>,
81+
pub(crate) access_adapter_inventory: AccessAdapterInventory<F>,
8282
pub(crate) hasher_chip: Option<Arc<Poseidon2PeripheryChip<F>>>,
8383
}
8484

crates/vm/src/system/mod.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,7 @@ where
477477
use crate::system::memory::interface::MemoryInterface;
478478

479479
let boundary_idx = PUBLIC_VALUES_AIR_ID + usize::from(self.public_values_chip.is_some());
480+
let mut access_adapter_offset = boundary_idx + 1;
480481
match &self.memory_controller.interface_chip {
481482
MemoryInterface::Volatile { boundary_chip } => {
482483
let boundary_height = boundary_chip
@@ -494,6 +495,7 @@ where
494495
let boundary_height = 2 * boundary_chip.touched_labels.len();
495496
heights[boundary_idx] = boundary_height;
496497
heights[boundary_idx + 1] = merkle_chip.current_height;
498+
access_adapter_offset += 1;
497499

498500
// Poseidon2Periphery height also varies based on memory, so set it now even though
499501
// it's not a system chip:
@@ -505,6 +507,12 @@ where
505507
heights[poseidon_idx] = poseidon_height;
506508
}
507509
}
510+
let access_heights = &self
511+
.memory_controller
512+
.access_adapter_inventory
513+
.trace_heights;
514+
heights[access_adapter_offset..access_adapter_offset + access_heights.len()]
515+
.copy_from_slice(access_heights);
508516
}
509517
}
510518

0 commit comments

Comments
 (0)