Skip to content

Commit d52e573

Browse files
chore: migrate CUDA primitive chips + test (#1974)
1 parent ce4846f commit d52e573

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+1458
-1310
lines changed

Cargo.lock

Lines changed: 75 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ lto = "thin"
114114
# Stark Backend
115115
openvm-stark-backend = { git = "https://github.com/openvm-org/stark-backend.git", tag = "v1.2.0-rc.1", default-features = false }
116116
openvm-stark-sdk = { git = "https://github.com/openvm-org/stark-backend.git", tag = "v1.2.0-rc.1", default-features = false }
117+
# TODO[stephenh]: Replace this once open sourced
118+
stark-backend-gpu = { git = "https://github.com/axiom-crypto/axiom-gpu.git", default-features = false }
117119

118120
# OpenVM
119121
openvm-sdk = { path = "crates/sdk", default-features = false }

crates/circuits/poseidon2-air/Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ zkhash = { workspace = true }
1717

1818
openvm-stark-backend = { workspace = true }
1919
openvm-stark-sdk = { workspace = true }
20+
stark-backend-gpu = { workspace = true, optional = true }
2021

2122
rand.workspace = true
2223
lazy_static.workspace = true
@@ -26,5 +27,6 @@ derivative.workspace = true
2627
p3-symmetric = { workspace = true }
2728

2829
[features]
29-
default = ["parallel"]
30+
default = ["parallel", "cuda"]
3031
parallel = ["openvm-stark-backend/parallel"]
32+
cuda = ["dep:stark-backend-gpu"]

tracegen-gpu/cuda/src/dummy/poseidon2.cu renamed to crates/circuits/poseidon2-air/cuda/src/dummy.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ __global__ void cukernel_poseidon2_tracegen(Fp *output, Fp *inputs, uint32_t n)
2828
PARTIAL_ROUNDS>(row, state);
2929
}
3030

31-
extern "C" int _poseidon2_tracegen(Fp *output, Fp *inputs, uint32_t sbox_regs, uint32_t n) {
31+
extern "C" int _poseidon2_dummy_tracegen(Fp *output, Fp *inputs, uint32_t sbox_regs, uint32_t n) {
3232

3333
auto [grid, block] = kernel_launch_params(n);
3434
switch (sbox_regs) {
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#![allow(clippy::missing_safety_doc)]
2+
3+
use stark_backend_gpu::{
4+
cuda::{d_buffer::DeviceBuffer, error::CudaError},
5+
prelude::F,
6+
};
7+
8+
pub mod poseidon2 {
9+
/// Poseidon2 tracegen on GPU (parallelized over rows)
10+
///
11+
/// # Arguments
12+
///
13+
/// * `d_output` - DeviceBuffer for the output (column major)
14+
/// * `d_inputs` - DeviceBuffer for the inputs (column major)
15+
/// * `sbox_regs` - Number of sbox registers (0 or 1)
16+
/// * `n` - Number of rows
17+
///
18+
/// Currently only supports same constants as
19+
/// https://github.com/openvm-org/openvm/blob/08bbf79368b07437271aeacb25fb8857980ca863/crates/circuits/poseidon2-air/src/lib.rs
20+
/// so:
21+
/// * `WIDTH` - 16
22+
/// * `SBOX_DEGREE` - 7
23+
/// * `HALF_FULL_ROUNDS` - 4
24+
/// * `PARTIAL_ROUNDS` - 13
25+
use super::*;
26+
27+
extern "C" {
28+
fn _poseidon2_dummy_tracegen(output: *mut F, inputs: *mut F, sbox_regs: u32, n: u32)
29+
-> i32;
30+
}
31+
32+
pub unsafe fn dummy_tracegen(
33+
d_output: &DeviceBuffer<F>,
34+
d_inputs: &DeviceBuffer<F>,
35+
sbox_regs: u32,
36+
n: u32,
37+
) -> Result<(), CudaError> {
38+
CudaError::from_result(_poseidon2_dummy_tracegen(
39+
d_output.as_mut_ptr(),
40+
d_inputs.as_mut_ptr(),
41+
sbox_regs,
42+
n,
43+
))
44+
}
45+
}

crates/circuits/poseidon2-air/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ pub use babybear::*;
2828
pub use config::*;
2929
pub use permute::*;
3030

31+
#[cfg(all(feature = "cuda", test))]
32+
mod cuda_abi;
33+
3134
#[cfg(test)]
3235
mod tests;
3336

crates/circuits/poseidon2-air/src/tests.rs

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ use rand::{rngs::StdRng, Rng, RngCore};
1919
use super::{Poseidon2Config, Poseidon2Constants, Poseidon2SubChip};
2020
use crate::BABY_BEAR_POSEIDON2_HALF_FULL_ROUNDS;
2121

22+
#[cfg(feature = "cuda")]
23+
use {
24+
crate::cuda_abi::poseidon2,
25+
openvm_stark_backend::p3_field::PrimeField32,
26+
stark_backend_gpu::{base::DeviceMatrix, cuda::copy::MemCopyH2D as _, types::F},
27+
};
28+
2229
fn run_poseidon2_subchip_test(subchip: Arc<Poseidon2SubChip<BabyBear, 0>>, rng: &mut StdRng) {
2330
// random state and trace generation
2431
let num_rows = 1 << 4;
@@ -93,3 +100,48 @@ fn test_poseidon2_random_constants() {
93100
let poseidon2_subchip = Arc::new(Poseidon2SubChip::<BabyBear, 0>::new(constants));
94101
run_poseidon2_subchip_test(poseidon2_subchip, &mut rng);
95102
}
103+
104+
#[cfg(feature = "cuda")]
105+
#[test]
106+
fn test_cuda_tracegen_poseidon2() {
107+
const WIDTH: usize = 16; // constant for BabyBear
108+
const N: usize = 16;
109+
const SBOX_REGS: usize = 1;
110+
const HALF_FULL_ROUNDS: usize = 4; // Constant for BabyBear
111+
const PARTIAL_ROUNDS: usize = 13; // Constant for BabyBear
112+
113+
// Generate random states and prepare GPU inputs
114+
let mut rng = create_seeded_rng();
115+
let cpu_inputs: Vec<[F; WIDTH]> = (0..N)
116+
.map(|_| std::array::from_fn(|_| F::from_canonical_u32(rng.gen_range(0..F::ORDER_U32))))
117+
.collect();
118+
119+
// Flatten inputs in row-major order for GPU (same layout as cpu_inputs)
120+
let inputs_dev = cpu_inputs
121+
.iter()
122+
.flat_map(|r| r.iter().copied())
123+
.collect::<Vec<_>>()
124+
.to_device()
125+
.unwrap();
126+
127+
// Launch GPU tracegen
128+
let num_cols = 1
129+
+ WIDTH
130+
+ HALF_FULL_ROUNDS * (WIDTH * SBOX_REGS + WIDTH)
131+
+ PARTIAL_ROUNDS * (SBOX_REGS + 1)
132+
+ HALF_FULL_ROUNDS * (WIDTH * SBOX_REGS + WIDTH);
133+
134+
let gpu_mat = DeviceMatrix::<F>::with_capacity(N, num_cols);
135+
136+
unsafe {
137+
poseidon2::dummy_tracegen(gpu_mat.buffer(), &inputs_dev, SBOX_REGS as u32, N as u32)
138+
.expect("GPU tracegen failed");
139+
}
140+
141+
// Run CPU tracegen and compare results
142+
let config = Poseidon2Config::<BabyBear>::default();
143+
let chip: Poseidon2SubChip<_, SBOX_REGS> = Poseidon2SubChip::new(config.constants);
144+
let _cpu_trace = Arc::new(chip.generate_trace(cpu_inputs));
145+
// TODO[stephenh]: Uncomment this when we decide where to put it
146+
// assert_eq_cpu_and_gpu_matrix(cpu_trace, &gpu_mat);
147+
}

crates/circuits/primitives/Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ license.workspace = true
1111
[dependencies]
1212
openvm-stark-backend = { workspace = true }
1313
openvm-circuit-primitives-derive = { workspace = true }
14+
stark-backend-gpu = { workspace = true, optional = true }
1415

1516
rand.workspace = true
1617
derive-new.workspace = true
@@ -24,5 +25,6 @@ test-case.workspace = true
2425
openvm-stark-sdk = { workspace = true }
2526

2627
[features]
27-
default = ["parallel"]
28+
default = ["parallel", "cuda"]
2829
parallel = ["openvm-stark-backend/parallel"]
30+
cuda = ["dep:stark-backend-gpu"]

crates/circuits/primitives/cuda/inlcude/buffer_view.cuh renamed to crates/circuits/primitives/cuda/include/buffer_view.cuh

File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)