Skip to content

Commit 186302a

Browse files
authored
[OVM 1.4] 70% off APC preflight execution overhead (#3437)
As seen in the test results below (first being current main, second being this PR, and third being APC=0 base line, each run 3+ times so should be robust), preflight execution overhead from APC should be mostly gone (7851 --> 4699 with a baseline of 4622). However, it's extremely perplexing that tracegen time increased by almost a full second (7370 --> 8233 with a baseline of 7171) despite nothing much changed there. Tried multiple ways to dissect and benchmark where the increase comes from but with no avail. The net effect should still be savings, as now preflight + tracegen should have very small overhead `(4699 + 8233) / (4622 + 7171) = ~+10%.` Disclaimer: these stats are for 100 APCs, so can be quite different for other # of APCs. Calling this 70% off preflight because it eliminates most preflight overhead but somehow adds to tracegen... ``` filename num_segments app_proof_cells app_proof_cols total_proof_time_ms app_proof_time_ms app_execute_preflight_time_ms app_execute_metered_time_ms app_trace_gen_time_ms leaf_proof_time_ms inner_recursion_proof_time_ms normal_instruction_ratio openvm_precompile_ratio powdr_ratio powdr_rows /home/steve/openvm-reth-benchmark/apc_100_new.json 19 13856523983 354152 31097 31097 7851 708 7370 0 0 0.307127 0.540265 0.152608 14033237 /home/steve/openvm-reth-benchmark/apc_100_preflight.json 19 13856523983 354152 31166 31166 4699 701 8233 0 0 0.307127 0.540265 0.152608 14033237 ../openvm-reth-benchmark/metrics_apc0.json 26 20019740816 216005 42660 42660 4622 749 7171 0 0 0.612871 0.387129 0.000000 0 ```
1 parent 45eb86a commit 186302a

File tree

4 files changed

+106
-58
lines changed

4 files changed

+106
-58
lines changed

openvm/src/extraction_utils.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,9 @@ impl<F> OriginalAirs<F> {
136136
pub fn record_arena_dimension_by_air_name_per_apc_call<F>(
137137
apc: &Apc<F, Instr<F>>,
138138
air_by_opcode_id: &OriginalAirs<F>,
139-
) -> HashMap<String, RecordArenaDimension> {
139+
) -> BTreeMap<String, RecordArenaDimension> {
140140
apc.instructions().iter().map(|instr| &instr.0.opcode).fold(
141-
HashMap::new(),
141+
BTreeMap::new(),
142142
|mut acc, opcode| {
143143
// Get the air name for this opcode
144144
let air_name = air_by_opcode_id.opcode_to_air.get(opcode).unwrap();

openvm/src/powdr_extension/executor/mod.rs

Lines changed: 102 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use crate::{
1313
Instr,
1414
};
1515

16+
use itertools::Itertools;
1617
use openvm_circuit::arch::{
1718
execution_mode::{ExecutionCtx, MeteredCtx},
1819
Arena, DenseRecordArena, E2PreCompute, MatrixRecordArena, PreflightExecutor,
@@ -33,7 +34,6 @@ use openvm_circuit::{
3334
},
3435
system::memory::online::{GuestMemory, TracingMemory},
3536
};
36-
use powdr_autoprecompiles::InstructionHandler;
3737

3838
/// A struct which holds the state of the execution based on the original instructions in this block and a dummy inventory.
3939
/// It holds arenas for each original use for both cpu and gpu execution, so that this struct can be agnostic to the execution backend.
@@ -45,6 +45,7 @@ pub struct PowdrExecutor {
4545
pub original_arenas_cpu: Rc<RefCell<OriginalArenas<MatrixRecordArena<BabyBear>>>>,
4646
pub original_arenas_gpu: Rc<RefCell<OriginalArenas<DenseRecordArena>>>,
4747
pub height_change: u32,
48+
cached_instructions_meta: Vec<CachedInstructionMeta>,
4849
}
4950

5051
/// A shared mutable reference to the arenas used to store the traces of the original instructions, accessed during preflight execution and trace generation.
@@ -82,21 +83,21 @@ impl<A: Arena> OriginalArenas<A> {
8283
}
8384
}
8485

85-
/// Returns a mutable reference to the arenas.
86+
/// Returns a mutable reference to the arena of the given vector index.
8687
/// - Panics if the arenas are not initialized.
87-
pub fn arenas_mut(&mut self) -> &mut HashMap<String, A> {
88+
pub fn arena_mut_by_index(&mut self, index: usize) -> &mut A {
8889
match self {
8990
OriginalArenas::Uninitialized => panic!("original arenas are uninitialized"),
90-
OriginalArenas::Initialized(initialized) => &mut initialized.arenas,
91+
OriginalArenas::Initialized(initialized) => initialized.arena_mut_by_index(index),
9192
}
9293
}
9394

94-
/// Returns a reference to the arenas.
95+
/// Returns the arena of the given air name.
9596
/// - Panics if the arenas are not initialized.
96-
pub fn arenas(&self) -> &HashMap<String, A> {
97+
pub fn take_arena(&mut self, air_name: &str) -> Option<A> {
9798
match self {
9899
OriginalArenas::Uninitialized => panic!("original arenas are uninitialized"),
99-
OriginalArenas::Initialized(initialized) => &initialized.arenas,
100+
OriginalArenas::Initialized(initialized) => initialized.take_arena(air_name),
100101
}
101102
}
102103

@@ -123,7 +124,8 @@ impl<A: Arena> OriginalArenas<A> {
123124
/// and how many calls to each air are made per APC call.
124125
#[derive(Default)]
125126
pub struct InitializedOriginalArenas<A> {
126-
pub arenas: HashMap<String, A>,
127+
arenas: Vec<Option<A>>,
128+
air_name_to_arena_index: HashMap<String, usize>,
127129
pub number_of_calls: usize,
128130
}
129131

@@ -136,28 +138,49 @@ impl<A: Arena> InitializedOriginalArenas<A> {
136138
) -> Self {
137139
let record_arena_dimensions =
138140
record_arena_dimension_by_air_name_per_apc_call(apc, original_airs);
139-
Self {
140-
arenas: record_arena_dimensions
141-
.iter()
142-
.map(
143-
|(
141+
let (air_name_to_arena_index, arenas) =
142+
record_arena_dimensions.into_iter().enumerate().fold(
143+
(HashMap::new(), Vec::new()),
144+
|(mut air_name_to_arena_index, mut arenas),
145+
(
146+
idx,
147+
(
144148
air_name,
145149
RecordArenaDimension {
146150
height: num_calls,
147151
width: air_width,
148152
},
149-
)| {
150-
(
151-
air_name.clone(),
152-
A::with_capacity(*num_calls * apc_call_count_estimate, *air_width),
153-
)
154-
},
155-
)
156-
.collect(),
153+
),
154+
)| {
155+
air_name_to_arena_index.insert(air_name, idx);
156+
arenas.push(Some(A::with_capacity(
157+
num_calls * apc_call_count_estimate,
158+
air_width,
159+
)));
160+
(air_name_to_arena_index, arenas)
161+
},
162+
);
163+
164+
Self {
165+
arenas,
166+
air_name_to_arena_index,
157167
// This is the actual number of calls, which we don't know yet. It will be updated during preflight execution.
158168
number_of_calls: 0,
159169
}
160170
}
171+
172+
#[inline]
173+
fn arena_mut_by_index(&mut self, index: usize) -> &mut A {
174+
self.arenas
175+
.get_mut(index)
176+
.and_then(|arena| arena.as_mut())
177+
.expect("arena missing for index")
178+
}
179+
180+
fn take_arena(&mut self, air_name: &str) -> Option<A> {
181+
let index = *self.air_name_to_arena_index.get(air_name)?;
182+
self.arenas[index].take()
183+
}
161184
}
162185

163186
/// The dimensions of a record arena for a given air name, used to initialize the arenas.
@@ -166,6 +189,12 @@ pub struct RecordArenaDimension {
166189
pub width: usize,
167190
}
168191

192+
#[derive(Clone, Copy)]
193+
struct CachedInstructionMeta {
194+
executor_index: usize,
195+
arena_index: usize,
196+
}
197+
169198
/// A struct to interpret the pre-compute data as for PowdrExecutor.
170199
#[derive(AlignedBytesBorrow, Clone)]
171200
#[repr(C)]
@@ -426,20 +455,15 @@ impl PreflightExecutor<BabyBear, MatrixRecordArena<BabyBear>> for PowdrExecutor
426455
let apc_call_count = || ctx.trace_buffer.len() / ctx.width;
427456

428457
original_arenas.ensure_initialized(apc_call_count, &self.air_by_opcode_id, &self.apc);
429-
430-
let arenas = original_arenas.arenas_mut();
431-
432458
// execute the original instructions one by one
433-
for instruction in self.apc.instructions() {
434-
let executor = self
435-
.executor_inventory
436-
.get_executor(instruction.0.opcode)
437-
.unwrap();
438-
439-
let air_name = self
440-
.air_by_opcode_id
441-
.get_instruction_air_and_id(instruction)
442-
.0;
459+
for (instruction, cached_meta) in self
460+
.apc
461+
.instructions()
462+
.iter()
463+
.zip_eq(&self.cached_instructions_meta)
464+
{
465+
let executor = &self.executor_inventory.executors[cached_meta.executor_index];
466+
let ctx_arena = original_arenas.arena_mut_by_index(cached_meta.arena_index);
443467

444468
let state = VmStateMut {
445469
pc,
@@ -448,7 +472,7 @@ impl PreflightExecutor<BabyBear, MatrixRecordArena<BabyBear>> for PowdrExecutor
448472
rng,
449473
custom_pvs,
450474
// We execute in the context of the relevant original table
451-
ctx: arenas.get_mut(&air_name).unwrap(),
475+
ctx: ctx_arena,
452476
// TODO: should we pass around the same metrics object, or snapshot it at the beginning of this method and apply a single update at the end?
453477
#[cfg(feature = "metrics")]
454478
metrics,
@@ -501,20 +525,15 @@ impl PreflightExecutor<BabyBear, DenseRecordArena> for PowdrExecutor {
501525
};
502526

503527
original_arenas.ensure_initialized(apc_call_count, &self.air_by_opcode_id, &self.apc);
504-
505-
let arenas = original_arenas.arenas_mut();
506-
507528
// execute the original instructions one by one
508-
for instruction in self.apc.instructions() {
509-
let executor = self
510-
.executor_inventory
511-
.get_executor(instruction.0.opcode)
512-
.unwrap();
513-
514-
let air_name = self
515-
.air_by_opcode_id
516-
.get_instruction_air_and_id(instruction)
517-
.0;
529+
for (instruction, cached_meta) in self
530+
.apc
531+
.instructions()
532+
.iter()
533+
.zip(&self.cached_instructions_meta)
534+
{
535+
let executor = &self.executor_inventory.executors[cached_meta.executor_index];
536+
let ctx_arena = original_arenas.arena_mut_by_index(cached_meta.arena_index);
518537

519538
let state = VmStateMut {
520539
pc,
@@ -523,7 +542,7 @@ impl PreflightExecutor<BabyBear, DenseRecordArena> for PowdrExecutor {
523542
rng,
524543
custom_pvs,
525544
// We execute in the context of the relevant original table
526-
ctx: arenas.get_mut(&air_name).unwrap(),
545+
ctx: ctx_arena,
527546
// TODO: should we pass around the same metrics object, or snapshot it at the beginning of this method and apply a single update at the end?
528547
#[cfg(feature = "metrics")]
529548
metrics,
@@ -552,13 +571,46 @@ impl PowdrExecutor {
552571
record_arena_by_air_name_gpu: Rc<RefCell<OriginalArenas<DenseRecordArena>>>,
553572
height_change: u32,
554573
) -> Self {
574+
let executor_inventory = base_config.sdk_config.sdk.create_executors().unwrap();
575+
576+
let arena_index_by_name =
577+
record_arena_dimension_by_air_name_per_apc_call(apc.as_ref(), &air_by_opcode_id)
578+
.iter()
579+
.enumerate()
580+
.map(|(idx, (name, _))| (name.clone(), idx))
581+
.collect::<HashMap<_, _>>();
582+
583+
let cached_instructions_meta = apc
584+
.instructions()
585+
.iter()
586+
.map(|instruction| {
587+
let executor_index = *executor_inventory
588+
.instruction_lookup
589+
.get(&instruction.0.opcode)
590+
.expect("missing executor for opcode")
591+
as usize;
592+
let air_name = air_by_opcode_id
593+
.opcode_to_air
594+
.get(&instruction.0.opcode)
595+
.expect("missing air for opcode");
596+
let arena_index = *arena_index_by_name
597+
.get(air_name)
598+
.expect("missing arena for air");
599+
CachedInstructionMeta {
600+
executor_index,
601+
arena_index,
602+
}
603+
})
604+
.collect();
605+
555606
Self {
556607
air_by_opcode_id,
557-
executor_inventory: base_config.sdk_config.sdk.create_executors().unwrap(),
608+
executor_inventory,
558609
apc,
559610
original_arenas_cpu: record_arena_by_air_name_cpu,
560611
original_arenas_gpu: record_arena_by_air_name_gpu,
561612
height_change,
613+
cached_instructions_meta,
562614
}
563615
}
564616
}

openvm/src/powdr_extension/trace_generator/cpu/mod.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,6 @@ impl PowdrTraceGeneratorCpu {
119119
.inventory
120120
};
121121

122-
let arenas = original_arenas.arenas_mut();
123-
124122
let dummy_trace_by_air_name: HashMap<String, SharedCpuTrace<BabyBear>> = chip_inventory
125123
.chips()
126124
.iter()
@@ -130,7 +128,7 @@ impl PowdrTraceGeneratorCpu {
130128
let air_name = chip_inventory.airs().ext_airs()[insertion_idx].name();
131129

132130
let record_arena = {
133-
match arenas.remove(&air_name) {
131+
match original_arenas.take_arena(&air_name) {
134132
Some(ra) => ra,
135133
None => return None, // skip this iteration, because we only have record arena for chips that are used
136134
}

openvm/src/powdr_extension/trace_generator/cuda/mod.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -219,8 +219,6 @@ impl PowdrTraceGeneratorGpu {
219219
.inventory
220220
};
221221

222-
let arenas = original_arenas.arenas_mut();
223-
224222
let dummy_trace_by_air_name: HashMap<String, DeviceMatrix<BabyBear>> = chip_inventory
225223
.chips()
226224
.iter()
@@ -230,7 +228,7 @@ impl PowdrTraceGeneratorGpu {
230228
let air_name = chip_inventory.airs().ext_airs()[insertion_idx].name();
231229

232230
let record_arena = {
233-
match arenas.remove(&air_name) {
231+
match original_arenas.take_arena(&air_name) {
234232
Some(ra) => ra,
235233
None => return None, // skip this iteration, because we only have record arena for chips that are used
236234
}

0 commit comments

Comments
 (0)