[GPU] 70% off APC tracegen overhead (#3436)

qwang98 · web-flow · commit 0491465f6e22 · 2025-11-10T09:59:26.000Z
For GPU trace gen, we previously assigned each air to a block, each
`Subst` to a warp, and each row to a thread. This PR explores an
alternative that assigns each row to a thread, which then loops over all
substitutions (regardless of air). Surprisingly, this shaves another ~70%
off the current APC trace gen overhead.

The three benchmarked scenarios are:
1. Current main (APC=100, threads=256): 7874 ms in tracegen.
2. This PR (APC=100, threads=256): 7370 ms in tracegen.
3. Baseline to benchmark for dummy tracegen time (APC=0, threads=256):
7171 ms in tracegen.

Therefore, this PR shaves another `(7874 - 7370) / (7874 - 7171) = 72%`
off the APC tracegen overhead (tracegen time above the APC=0 baseline).

```
                                                 filename  num_segments  app_proof_cells  app_proof_cols  total_proof_time_ms  app_proof_time_ms  app_execute_preflight_time_ms  app_execute_metered_time_ms  app_trace_gen_time_ms  leaf_proof_time_ms  inner_recursion_proof_time_ms  normal_instruction_ratio  openvm_precompile_ratio  powdr_ratio  powdr_rows
   /home/steve/openvm-reth-benchmark/apc_100_app_256.json            19      13856523983          354152                31156              31156                           7886                          703                   7874                   0                              0                  0.307127                 0.540265     0.152608    14033237
       /home/steve/openvm-reth-benchmark/apc_100_new.json            19      13856523983          354152                31097              31097                           7851                          708                   7370                   0                              0                  0.307127                 0.540265     0.152608    14033237
               ../openvm-reth-benchmark/metrics_apc0.json            26      20019740816          216005                42660              42660                           4622                          749                   7171                   0                              0                  0.612871                 0.387129     0.000000           0
```

I have some rough theories about where the difference comes from:
1. In our prior strategy, because each original air is assigned to a
block, there can be lopsided cases where a few original airs are "called"
many times while other airs aren't. These cases should be quite common:
for example, a chip like the ALU chip is called far more often than
other chips.
2. Lopsided cases mean that some blocks can be left idle when they
could have been redirected to other airs that are still processing.
3. This method does have the disadvantage of not localizing memory
accesses enough (which our prior strategy optimizes for), but it has the
main benefit of almost 100% utilization of all threads allocated,
because each thread is assigned to an APC row.
diff --git a/openvm/cuda/src/apc_tracegen.cu b/openvm/cuda/src/apc_tracegen.cu
@@ -12,11 +12,10 @@ struct OriginalAir {
     int height;              // number of rows (Ha)
     const Fp* buffer;        // column-major base: col*height + row
     int row_block_size;      // stride between used rows
-    int substitutions_offset;// offset in d_subs
-    int substitutions_length;// count in d_subs for this AIR
 };
 
 struct Subst {
+    int air_index; // index into d_original_airs
     int col;      // source column within this AIR
     int row;      // base row offset within the row-block
     int apc_col;  // destination APC column
@@ -30,49 +29,39 @@ extern "C" {
 }
 
 // ============================================================================================
-// Kernel: one block per OriginalAir; each warp handles one substitution (APC column).
+// Kernel: each thread iterates rows and processes all substitutions.
 // ============================================================================================
 
 __global__ void apc_tracegen_kernel(
     Fp* __restrict__ d_output,                         // column-major
     size_t H,                                          // height of the output
     const OriginalAir* __restrict__ d_original_airs,   // metadata per AIR
     const Subst* __restrict__ d_subs,                  // all substitutions
+    size_t n_subs,                                     // number of substitutions
     int num_apc_calls                                  // number of APC calls
 ) {
-    const int air_id = blockIdx.x;
-    const OriginalAir air = d_original_airs[air_id];
-
-    const Fp* __restrict__ src_base = air.buffer;
-    const int Ha  = air.height;
-    const int RBS = air.row_block_size;
-
-    const int lane  = threadIdx.x & 31;     // 0..31
-    const int warp  = threadIdx.x >> 5;     // warp index in block
-    const int warps_per_block = blockDim.x >> 5;
-
-    // Process this AIR's substitutions in batches of warps_per_block
-    for (int rel = warp; rel < air.substitutions_length; rel += warps_per_block) {
-
-        const Subst sub = d_subs[air.substitutions_offset + rel];
-
-        // Column bases (column-major)
-        const size_t dst_col_base = (size_t)sub.apc_col * (size_t)H;
-        const size_t src_col_base = (size_t)sub.col     * (size_t)Ha;
-
-        // Each lane writes rows lane, lane+32, lane+64, ... (coalesced per warp)
-        // Loop over full output height; zero-pad rows beyond `num_apc_calls`.
-        for (size_t r = (size_t)lane; r < (size_t)H; r += 32) {
-            if (r < (size_t)num_apc_calls) {
-                const size_t src_r = (size_t)sub.row + r * (size_t)RBS;
-                if (src_r < (size_t)Ha) {
-                    d_output[dst_col_base + r] = src_base[src_col_base + src_r];
-                }
-            } else {
-                d_output[dst_col_base + r] = Fp(0);
+    const size_t total_threads = (size_t)gridDim.x * (size_t)blockDim.x;
+    const size_t tid = (size_t)blockIdx.x * (size_t)blockDim.x + (size_t)threadIdx.x;
+
+    for (size_t r = tid; r < H; r += total_threads) {
+        const bool row_in_range = r < (size_t)num_apc_calls;
+
+        for (size_t i = 0; i < n_subs; ++i) {
+            const Subst sub = d_subs[i];
+            const size_t dst_idx = (size_t)sub.apc_col * H + r;
+
+            if (!row_in_range) {
+                d_output[dst_idx] = Fp(0);
+                continue;
             }
+
+            const size_t air_idx = (size_t)sub.air_index;
+            const OriginalAir air = d_original_airs[air_idx];
+            const Fp* __restrict__ src_base = air.buffer;
+            const size_t src_col_base = (size_t)sub.col * (size_t)air.height;
+            const size_t src_r = (size_t)sub.row + r * (size_t)air.row_block_size;
+            d_output[dst_idx] = src_base[src_col_base + src_r];
         }
-        // Warps are independent for different substitutions; no syncthreads needed here.
     }
 }
 
@@ -137,19 +126,21 @@ extern "C" int _apc_apply_derived_expr(
 extern "C" int _apc_tracegen(
     Fp*                      d_output,          // [output_height * output_width], column-major
     size_t                   output_height,     // H_out
-    const OriginalAir*       d_original_airs,   // device array, length = n_airs
-    size_t                   n_airs,            // one block per AIR
+    const OriginalAir*       d_original_airs,   // device array of AIR metadata
     const Subst*             d_subs,            // device array of all substitutions
+    size_t                   n_subs,            // number of substitutions
     int                      num_apc_calls      // number of APC calls
 ) {
     assert((output_height & (output_height - 1)) == 0);  // power-of-two height check
 
     const int block_x = 256;
     const dim3 block(block_x, 1, 1);
-    const dim3 grid((unsigned int)n_airs, 1, 1);
+    unsigned g = (unsigned)((output_height + block_x - 1) / block_x);
+    if (g == 0u) g = 1u;
+    const dim3 grid(g, 1, 1);
 
     apc_tracegen_kernel<<<grid, block>>>(
-        d_output, output_height, d_original_airs, d_subs, num_apc_calls
+        d_output, output_height, d_original_airs, d_subs, n_subs, num_apc_calls
     );
     return (int)cudaGetLastError();
-}
+}
diff --git a/openvm/src/cuda_abi.rs b/openvm/src/cuda_abi.rs
@@ -12,9 +12,9 @@ extern "C" {
     pub fn _apc_tracegen(
         d_output: *mut BabyBear,             // column-major
         output_height: usize,                // H_out
-        d_original_airs: *const OriginalAir, // device array, len = n_original_airs
-        n_original_airs: usize,              //
+        d_original_airs: *const OriginalAir, // device array of AIR metadata
         d_subs: *const Subst,                // device array of all substitutions
+        n_subs: usize,                       // number of substitutions
         num_apc_calls: i32,                  // number of APC calls
     ) -> i32;
 
@@ -66,17 +66,17 @@ extern "C" {
 #[repr(C)]
 #[derive(Clone, Copy, Debug)]
 pub struct OriginalAir {
-    pub width: i32,                // number of columns
-    pub height: i32,               // number of rows (Ha)
-    pub buffer: *const BabyBear,   // column-major base: col*height + row (device ptr)
-    pub row_block_size: i32,       // stride between used rows
-    pub substitutions_offset: i32, // offset in d_subs
-    pub substitutions_length: i32, // count in d_subs for this AIR
+    pub width: i32,              // number of columns
+    pub height: i32,             // number of rows (Ha)
+    pub buffer: *const BabyBear, // column-major base: col*height + row (device ptr)
+    pub row_block_size: i32,     // stride between used rows
 }
 
 #[repr(C)]
 #[derive(Clone, Copy, Debug)]
 pub struct Subst {
+    /// Index of the source AIR in `d_original_airs`
+    pub air_index: i32,
     /// Source column within this AIR
     pub col: i32,
     /// Base row offset within the row-block
@@ -96,20 +96,19 @@ pub struct DerivedExprSpec {
 
 pub fn apc_tracegen(
     output: &mut DeviceMatrix<BabyBear>,      // column-major
-    original_airs: DeviceBuffer<OriginalAir>, // device array, len = n_airs
+    original_airs: DeviceBuffer<OriginalAir>, // device array of AIR metadata
     substitutions: DeviceBuffer<Subst>,       // device array of all substitutions
     num_apc_calls: usize,
 ) -> Result<(), CudaError> {
     let output_height = output.height();
-    let n_airs = original_airs.len();
 
     unsafe {
         CudaError::from_result(_apc_tracegen(
             output.buffer().as_mut_ptr(),
             output_height,
             original_airs.as_ptr(),
-            n_airs,
             substitutions.as_ptr(),
+            substitutions.len(),
             num_apc_calls as i32,
         ))
     }
diff --git a/openvm/src/powdr_extension/trace_generator/cuda/mod.rs b/openvm/src/powdr_extension/trace_generator/cuda/mod.rs
@@ -270,9 +270,10 @@ impl PowdrTraceGeneratorGpu {
                 .into_group_map()
                 // go through each air and its substitutions
                 .iter()
+                .enumerate()
                 .fold(
                     (Vec::new(), Vec::new()),
-                    |(mut airs, mut substitutions), (air_name, subs_by_row)| {
+                    |(mut airs, mut substitutions), (air_index, (air_name, subs_by_row))| {
                         // Find the substitutions that map to an apc column
                         let new_substitutions: Vec<Subst> = subs_by_row
                             .iter()
@@ -283,13 +284,12 @@ impl PowdrTraceGeneratorGpu {
                                 subs.iter()
                                     .map(move |sub| (row, sub))
                                     .map(|(row, sub)| Subst {
+                                        air_index: air_index as i32,
                                         col: sub.original_poly_index as i32,
                                         row: row as i32,
                                         apc_col: apc_poly_id_to_index[&sub.apc_poly_id] as i32,
                                     })
                             })
-                            // sort by column so that reads to the same column are coalesced, as the table is column major
-                            .sorted_by(|left, right| left.col.cmp(&right.col))
                             .collect();
 
                         // get the device dummy trace for this air
@@ -301,8 +301,6 @@ impl PowdrTraceGeneratorGpu {
                             height: dummy_trace.height() as i32,
                             buffer: dummy_trace.buffer().as_ptr(),
                             row_block_size: subs_by_row.len() as i32,
-                            substitutions_offset: substitutions.len() as i32,
-                            substitutions_length: new_substitutions.len() as i32,
                         });
 
                         substitutions.extend(new_substitutions);