In this article, we'll explore how to integrate Rust and CUDA with a simple project - adding two vectors in bfloat16 format.
- Rust (tested on 1.92.0)
- CUDA Toolkit 13.1 (`nvcc` in PATH)
rustycuda
├── Cargo.toml
├── cuda-kernels
│ ├── build.rs
│ ├── Cargo.toml
│ └── src
│ ├── add.cu
│ └── lib.rs
└── src
├── kernels.rs
└── main.rs
Key idea: CUDA kernels are compiled to PTX at build time and embedded into Rust code as string constants.
A separate crate for storing and building CUDA kernels.
[package]
name = "cuda-kernels"
version = "0.1.0"
edition = "2024"
[dependencies]
[build-dependencies]
bindgen_cuda = "0.1.5"

Adds two vectors in bfloat16 format:
#include <cuda_bf16.h>
// Element-wise bf16 vector addition: out[i] = a[i] + b[i] for i in [0, n).
// Launched with one thread per element; threads past n fall out on the bounds check.
extern "C" __global__ void add_bf16(__nv_bfloat16 *out, const __nv_bfloat16 *a,
const __nv_bfloat16 *b, unsigned int n) {
// Global 1-D thread index across the grid.
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
// Widen to fp32, add, then round back to bf16.
out[i] = __float2bfloat16(__bfloat162float(a[i]) + __bfloat162float(b[i]));
}
}Describes how add.cu is compiled to PTX: optimization flags and output path.
fn main() {
    // Rebuild when the build script or the CUDA source changes. Emitting any
    // `cargo:rerun-if-changed` directive disables Cargo's default
    // "rerun when anything in the package changes" behavior, so src/add.cu
    // must be listed explicitly — otherwise editing the kernel would not
    // trigger a recompile.
    println!("cargo:rerun-if-changed=build.rs");
    println!("cargo:rerun-if-changed=src/add.cu");

    // Compile the .cu sources under src/ to PTX with nvcc.
    let builder = bindgen_cuda::Builder::default()
        .arg("-O3")
        .arg("--use_fast_math")
        .arg("-U_GNU_SOURCE")
        .arg("-U_DEFAULT_SOURCE")
        .arg("-D__STRICT_ANSI__");
    println!("cargo:info={builder:?}");

    // Generate Rust bindings (string constants embedding the PTX) into src/lib.rs.
    let bindings = builder.build_ptx().unwrap();
    bindings.write("src/lib.rs").unwrap();
}
The resulting PTX is saved as a string constant in lib.rs:
pub const ADD: &str = include_str!(concat!(env!("OUT_DIR"), "/add.ptx"));[package]
name = "rustycuda"
version = "0.1.0"
edition = "2024"
[dependencies]
anyhow = "1"
cudarc = "0.18.2"
half = "2"
cuda-kernels = { path = "cuda-kernels" }

A safe wrapper around the CUDA kernel. The cudarc crate loads the PTX module and wraps it in a typed function:
use anyhow::Result;
use cudarc::driver::{
CudaContext, CudaFunction, CudaSlice, CudaStream, LaunchConfig, PushKernelArg,
};
use cudarc::nvrtc::Ptx;
use std::sync::Arc;
/// Typed handles to the CUDA kernels loaded from the embedded PTX module.
pub struct Kernels {
// Handle to the `add_bf16` kernel compiled from add.cu.
add_bf16: CudaFunction,
}
impl Kernels {
    /// Loads the embedded PTX module into `ctx` and resolves the `add_bf16`
    /// kernel by name.
    pub fn new(ctx: Arc<CudaContext>) -> Result<Self> {
        let ptx = Ptx::from_src(cuda_kernels::ADD);
        let module = ctx.load_module(ptx)?;
        Ok(Self {
            add_bf16: module.load_function("add_bf16")?,
        })
    }

    /// Launches the element-wise bf16 addition kernel on `stream`.
    ///
    /// The slices carry raw bf16 bit patterns as `u16`; `n` is the element
    /// count and must not exceed the length of any buffer.
    pub fn add_bf16(
        &self,
        stream: &CudaStream,
        out: &mut CudaSlice<u16>,
        a: &CudaSlice<u16>,
        b: &CudaSlice<u16>,
        n: i32,
    ) -> Result<()> {
        // One thread per element; cudarc picks grid/block dims for us.
        let cfg = LaunchConfig::for_num_elems(n as u32);
        println!("config: {:?}", cfg);
        let mut launch = stream.launch_builder(&self.add_bf16);
        launch.arg(out).arg(a).arg(b).arg(&n);
        // SAFETY: argument count, order, and types match the kernel
        // signature declared in add.cu.
        unsafe { launch.launch(cfg)? };
        Ok(())
    }
}Main logic:
- Create CUDA context
- Allocate GPU memory
- Copy data from host to device
- Launch the kernel
- Copy the result back to host
mod kernels;
use half::bf16;
use kernels::Kernels;
use cudarc::driver::CudaContext;
fn main() -> anyhow::Result<()> {
    // Acquire device 0 and its default stream, then load the kernels.
    let ctx = CudaContext::new(0)?;
    let stream = ctx.default_stream();
    let kernels = Kernels::new(ctx.clone())?;

    // Host inputs: bf16 values travel as raw bit patterns (u16).
    let n: usize = 1024;
    let host_a = vec![bf16::from_f32(1.0).to_bits(); n];
    let host_b = vec![bf16::from_f32(2.0).to_bits(); n];

    // Host -> device copies plus a zeroed output buffer.
    let dev_a = stream.clone_htod(&host_a)?;
    let dev_b = stream.clone_htod(&host_b)?;
    let mut dev_out = stream.alloc_zeros::<u16>(n)?;

    kernels.add_bf16(&stream, &mut dev_out, &dev_a, &dev_b, n as i32)?;

    // Device -> host copy, then reinterpret the bits as bf16 for display.
    let raw = stream.clone_dtoh(&dev_out)?;
    let out: Vec<bf16> = raw.into_iter().map(bf16::from_bits).collect();
    println!("Result: {:?}", &out[..10]);
    Ok(())
}
cargo run --release -p rustycuda

Output:
config: LaunchConfig { grid_dim: (1, 1, 1), block_dim: (1024, 1, 1), shared_mem_bytes: 0 }
Result: [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0]
You now have a minimal working example of Rust and CUDA integration.
Source code: GitHub
Happy learning Rust and CUDA!