Skip to content

Commit 0491465

Browse files
authored
[GPU] 70% off APC tracegen overhead (#3436)
For GPU trace gen, we previously loop over each air per block, each `Subst` per warp, and each row per thread. This PR explores an alternative that loops over each row per thread (regardless of air or `Subst`). Surprisingly, this shaves another ~70% off the current APC trace gen overhead. The following three scenarios are: 1. Current main (APC=100, threads=256): 7874 ms in tracegen. 2. This PR (APC=100, threads=256): 7370 ms in tracegen. 3. Baseline to benchmark for dummy tracegen time (APC=0, threads=256): 7171 ms in tracegen. Therefore, this PR shaves another `(7874 - 7370) / (7874 - 7171) = 72%` off tracegen time. ``` filename num_segments app_proof_cells app_proof_cols total_proof_time_ms app_proof_time_ms app_execute_preflight_time_ms app_execute_metered_time_ms app_trace_gen_time_ms leaf_proof_time_ms inner_recursion_proof_time_ms normal_instruction_ratio openvm_precompile_ratio powdr_ratio powdr_rows /home/steve/openvm-reth-benchmark/apc_100_app_256.json 19 13856523983 354152 31156 31156 7886 703 7874 0 0 0.307127 0.540265 0.152608 14033237 /home/steve/openvm-reth-benchmark/apc_100_new.json 19 13856523983 354152 31097 31097 7851 708 7370 0 0 0.307127 0.540265 0.152608 14033237 ../openvm-reth-benchmark/metrics_apc0.json 26 20019740816 216005 42660 42660 4622 749 7171 0 0 0.612871 0.387129 0.000000 0 ``` I have some rough theories about where the diff come from: 1. In our prior strategy, because each original air is assigned to a block, there can be lopsided cases when a few original airs are "called" many times while other airs aren't. These cases should be quite common, as we can think of instructions from like the ALU chip is called way more often than other chips. 2. Lopsided cases means that some blocks can be left idle when they could have been redirected to other airs that are still processing. 3. This method does have the disadvantage of not localizing memory accesses enough (which our prior strategy optimizes for), but it has the main benefit of almost 100% utilization of all threads allocated, because each thread is assigned to an APC row.
1 parent 43adb4b commit 0491465

File tree

3 files changed

+43
-55
lines changed

3 files changed

+43
-55
lines changed

openvm/cuda/src/apc_tracegen.cu

Lines changed: 30 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,10 @@ struct OriginalAir {
1212
int height; // number of rows (Ha)
1313
const Fp* buffer; // column-major base: col*height + row
1414
int row_block_size; // stride between used rows
15-
int substitutions_offset;// offset in d_subs
16-
int substitutions_length;// count in d_subs for this AIR
1715
};
1816

1917
struct Subst {
18+
int air_index; // index into d_original_airs
2019
int col; // source column within this AIR
2120
int row; // base row offset within the row-block
2221
int apc_col; // destination APC column
@@ -30,49 +29,39 @@ extern "C" {
3029
}
3130

3231
// ============================================================================================
33-
// Kernel: one block per OriginalAir; each warp handles one substitution (APC column).
32+
// Kernel: each thread iterates rows and processes all substitutions.
3433
// ============================================================================================
3534

3635
__global__ void apc_tracegen_kernel(
3736
Fp* __restrict__ d_output, // column-major
3837
size_t H, // height of the output
3938
const OriginalAir* __restrict__ d_original_airs, // metadata per AIR
4039
const Subst* __restrict__ d_subs, // all substitutions
40+
size_t n_subs, // number of substitutions
4141
int num_apc_calls // number of APC calls
4242
) {
43-
const int air_id = blockIdx.x;
44-
const OriginalAir air = d_original_airs[air_id];
45-
46-
const Fp* __restrict__ src_base = air.buffer;
47-
const int Ha = air.height;
48-
const int RBS = air.row_block_size;
49-
50-
const int lane = threadIdx.x & 31; // 0..31
51-
const int warp = threadIdx.x >> 5; // warp index in block
52-
const int warps_per_block = blockDim.x >> 5;
53-
54-
// Process this AIR's substitutions in batches of warps_per_block
55-
for (int rel = warp; rel < air.substitutions_length; rel += warps_per_block) {
56-
57-
const Subst sub = d_subs[air.substitutions_offset + rel];
58-
59-
// Column bases (column-major)
60-
const size_t dst_col_base = (size_t)sub.apc_col * (size_t)H;
61-
const size_t src_col_base = (size_t)sub.col * (size_t)Ha;
62-
63-
// Each lane writes rows lane, lane+32, lane+64, ... (coalesced per warp)
64-
// Loop over full output height; zero-pad rows beyond `num_apc_calls`.
65-
for (size_t r = (size_t)lane; r < (size_t)H; r += 32) {
66-
if (r < (size_t)num_apc_calls) {
67-
const size_t src_r = (size_t)sub.row + r * (size_t)RBS;
68-
if (src_r < (size_t)Ha) {
69-
d_output[dst_col_base + r] = src_base[src_col_base + src_r];
70-
}
71-
} else {
72-
d_output[dst_col_base + r] = Fp(0);
43+
const size_t total_threads = (size_t)gridDim.x * (size_t)blockDim.x;
44+
const size_t tid = (size_t)blockIdx.x * (size_t)blockDim.x + (size_t)threadIdx.x;
45+
46+
for (size_t r = tid; r < H; r += total_threads) {
47+
const bool row_in_range = r < (size_t)num_apc_calls;
48+
49+
for (size_t i = 0; i < n_subs; ++i) {
50+
const Subst sub = d_subs[i];
51+
const size_t dst_idx = (size_t)sub.apc_col * H + r;
52+
53+
if (!row_in_range) {
54+
d_output[dst_idx] = Fp(0);
55+
continue;
7356
}
57+
58+
const size_t air_idx = (size_t)sub.air_index;
59+
const OriginalAir air = d_original_airs[air_idx];
60+
const Fp* __restrict__ src_base = air.buffer;
61+
const size_t src_col_base = (size_t)sub.col * (size_t)air.height;
62+
const size_t src_r = (size_t)sub.row + r * (size_t)air.row_block_size;
63+
d_output[dst_idx] = src_base[src_col_base + src_r];
7464
}
75-
// Warps are independent for different substitutions; no syncthreads needed here.
7665
}
7766
}
7867

@@ -137,19 +126,21 @@ extern "C" int _apc_apply_derived_expr(
137126
extern "C" int _apc_tracegen(
138127
Fp* d_output, // [output_height * output_width], column-major
139128
size_t output_height, // H_out
140-
const OriginalAir* d_original_airs, // device array, length = n_airs
141-
size_t n_airs, // one block per AIR
129+
const OriginalAir* d_original_airs, // device array of AIR metadata
142130
const Subst* d_subs, // device array of all substitutions
131+
size_t n_subs, // number of substitutions
143132
int num_apc_calls // number of APC calls
144133
) {
145134
assert((output_height & (output_height - 1)) == 0); // power-of-two height check
146135

147136
const int block_x = 256;
148137
const dim3 block(block_x, 1, 1);
149-
const dim3 grid((unsigned int)n_airs, 1, 1);
138+
unsigned g = (unsigned)((output_height + block_x - 1) / block_x);
139+
if (g == 0u) g = 1u;
140+
const dim3 grid(g, 1, 1);
150141

151142
apc_tracegen_kernel<<<grid, block>>>(
152-
d_output, output_height, d_original_airs, d_subs, num_apc_calls
143+
d_output, output_height, d_original_airs, d_subs, n_subs, num_apc_calls
153144
);
154145
return (int)cudaGetLastError();
155-
}
146+
}

openvm/src/cuda_abi.rs

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ extern "C" {
1212
pub fn _apc_tracegen(
1313
d_output: *mut BabyBear, // column-major
1414
output_height: usize, // H_out
15-
d_original_airs: *const OriginalAir, // device array, len = n_original_airs
16-
n_original_airs: usize, //
15+
d_original_airs: *const OriginalAir, // device array of AIR metadata
1716
d_subs: *const Subst, // device array of all substitutions
17+
n_subs: usize, // number of substitutions
1818
num_apc_calls: i32, // number of APC calls
1919
) -> i32;
2020

@@ -66,17 +66,17 @@ extern "C" {
6666
#[repr(C)]
6767
#[derive(Clone, Copy, Debug)]
6868
pub struct OriginalAir {
69-
pub width: i32, // number of columns
70-
pub height: i32, // number of rows (Ha)
71-
pub buffer: *const BabyBear, // column-major base: col*height + row (device ptr)
72-
pub row_block_size: i32, // stride between used rows
73-
pub substitutions_offset: i32, // offset in d_subs
74-
pub substitutions_length: i32, // count in d_subs for this AIR
69+
pub width: i32, // number of columns
70+
pub height: i32, // number of rows (Ha)
71+
pub buffer: *const BabyBear, // column-major base: col*height + row (device ptr)
72+
pub row_block_size: i32, // stride between used rows
7573
}
7674

7775
#[repr(C)]
7876
#[derive(Clone, Copy, Debug)]
7977
pub struct Subst {
78+
/// Index of the source AIR in `d_original_airs`
79+
pub air_index: i32,
8080
/// Source column within this AIR
8181
pub col: i32,
8282
/// Base row offset within the row-block
@@ -96,20 +96,19 @@ pub struct DerivedExprSpec {
9696

9797
pub fn apc_tracegen(
9898
output: &mut DeviceMatrix<BabyBear>, // column-major
99-
original_airs: DeviceBuffer<OriginalAir>, // device array, len = n_airs
99+
original_airs: DeviceBuffer<OriginalAir>, // device array of AIR metadata
100100
substitutions: DeviceBuffer<Subst>, // device array of all substitutions
101101
num_apc_calls: usize,
102102
) -> Result<(), CudaError> {
103103
let output_height = output.height();
104-
let n_airs = original_airs.len();
105104

106105
unsafe {
107106
CudaError::from_result(_apc_tracegen(
108107
output.buffer().as_mut_ptr(),
109108
output_height,
110109
original_airs.as_ptr(),
111-
n_airs,
112110
substitutions.as_ptr(),
111+
substitutions.len(),
113112
num_apc_calls as i32,
114113
))
115114
}

openvm/src/powdr_extension/trace_generator/cuda/mod.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -270,9 +270,10 @@ impl PowdrTraceGeneratorGpu {
270270
.into_group_map()
271271
// go through each air and its substitutions
272272
.iter()
273+
.enumerate()
273274
.fold(
274275
(Vec::new(), Vec::new()),
275-
|(mut airs, mut substitutions), (air_name, subs_by_row)| {
276+
|(mut airs, mut substitutions), (air_index, (air_name, subs_by_row))| {
276277
// Find the substitutions that map to an apc column
277278
let new_substitutions: Vec<Subst> = subs_by_row
278279
.iter()
@@ -283,13 +284,12 @@ impl PowdrTraceGeneratorGpu {
283284
subs.iter()
284285
.map(move |sub| (row, sub))
285286
.map(|(row, sub)| Subst {
287+
air_index: air_index as i32,
286288
col: sub.original_poly_index as i32,
287289
row: row as i32,
288290
apc_col: apc_poly_id_to_index[&sub.apc_poly_id] as i32,
289291
})
290292
})
291-
// sort by column so that reads to the same column are coalesced, as the table is column major
292-
.sorted_by(|left, right| left.col.cmp(&right.col))
293293
.collect();
294294

295295
// get the device dummy trace for this air
@@ -301,8 +301,6 @@ impl PowdrTraceGeneratorGpu {
301301
height: dummy_trace.height() as i32,
302302
buffer: dummy_trace.buffer().as_ptr(),
303303
row_block_size: subs_by_row.len() as i32,
304-
substitutions_offset: substitutions.len() as i32,
305-
substitutions_length: new_substitutions.len() as i32,
306304
});
307305

308306
substitutions.extend(new_substitutions);

0 commit comments

Comments
 (0)