Skip to content

Commit aec514d

Browse files
authored
fix(cudart): recreate CUDA runtime bindings based on CUDA 13.0 (#106)
# What ❔ This PR recreates the CUDA runtime bindings based on CUDA toolkit version 13.0. ## Why ❔ The bindings based on CUDA toolkit version 12.x used the `cudaGetDeviceProperties_v2` function, which was removed in version 13; the `cudaGetDeviceProperties` function is used instead. ## Checklist - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [ ] Tests for the changes have been added / updated. - [ ] Documentation comments have been added / updated. - [x] Code has been formatted via `zk fmt` and `zk lint`.
1 parent b925169 commit aec514d

File tree

11 files changed

+65
-45
lines changed

11 files changed

+65
-45
lines changed

.github/workflows/ci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
runs-on: [ubuntu-24.04-github-hosted-32core]
1313
strategy:
1414
matrix:
15-
cuda: [ "12.0.0-devel-ubuntu20.04", "12.5.0-devel-ubuntu20.04" ]
15+
cuda: [ "12.0.0-devel-ubuntu20.04", "13.0.0-devel-ubuntu24.04" ]
1616
container:
1717
image: nvidia/cuda:${{ matrix.cuda }}
1818
env:

.github/workflows/publish-crates.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
publish-crates:
1515
runs-on: [ubuntu-24.04-github-hosted-32core]
1616
container:
17-
image: nvidia/cuda:12.5.0-devel-ubuntu20.04
17+
image: nvidia/cuda:13.0.0-devel-ubuntu24.04
1818
env:
1919
BELLMAN_CUDA_DIR: ${{ github.workspace }}/bellman-cuda
2020
CUDAARCHS: 89

.github/workflows/release-please.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
process-release:
3535
runs-on: [ubuntu-24.04-github-hosted-32core]
3636
container:
37-
image: nvidia/cuda:12.5.0-devel-ubuntu20.04
37+
image: nvidia/cuda:13.0.0-devel-ubuntu24.04
3838
env:
3939
BELLMAN_CUDA_DIR: ${{ github.workspace }}/bellman-cuda
4040
CUDAARCHS: 89

.github/workflows/test-gpu.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
runs-on: [ ubuntu-latest ]
1414
strategy:
1515
matrix:
16-
cuda: [ "12.0.0-devel-ubuntu20.04", "12.5.0-devel-ubuntu20.04" ]
16+
cuda: [ "12.0.0-devel-ubuntu20.04", "13.0.0-devel-ubuntu24.04" ]
1717
# TODO: Building the whole workspace with `--test` currently fails with link-time errors,
1818
# presumably due to either `gpu-ffi` or `gpu-prover` crates.
1919
# So for now we point at specific packages we want to test.
@@ -125,21 +125,21 @@ jobs:
125125
name: zksync-crypto-gpu-12.0.0-devel-ubuntu20.04-${{ matrix.package }}-test-binary
126126
path: zksync-crypto-gpu-test-binary/12.0/
127127

128-
- name: Download test binary built with CUDA 12.5
128+
- name: Download test binary built with CUDA 13.0
129129
uses: actions/download-artifact@v4
130130
with:
131-
name: zksync-crypto-gpu-12.5.0-devel-ubuntu20.04-${{ matrix.package }}-test-binary
132-
path: zksync-crypto-gpu-test-binary/12.5/
131+
name: zksync-crypto-gpu-13.0.0-devel-ubuntu24.04-${{ matrix.package }}-test-binary
132+
path: zksync-crypto-gpu-test-binary/13.0/
133133

134-
- name: Run test binary built with CUDA 12.5
135-
id: test_cuda_12_5
134+
- name: Run test binary built with CUDA 13.0
135+
id: test_cuda_13_0
136136
continue-on-error: true
137137
run: |
138-
chmod +x zksync-crypto-gpu-test-binary/12.5/${{ matrix.package }}
139-
zksync-crypto-gpu-test-binary/12.5/${{ matrix.package }}
138+
chmod +x zksync-crypto-gpu-test-binary/13.0/${{ matrix.package }}
139+
zksync-crypto-gpu-test-binary/13.0/${{ matrix.package }}
140140
141141
- name: Run test binary built with CUDA 12.0
142-
if: steps.test_cuda_12_5.outcome == 'failure' || steps.test_cuda_12_5.outcome == 'success'
142+
if: steps.test_cuda_13_0.outcome == 'failure' || steps.test_cuda_13_0.outcome == 'success'
143143
run: |
144144
chmod +x zksync-crypto-gpu-test-binary/12.0/${{ matrix.package }}
145145
zksync-crypto-gpu-test-binary/12.0/${{ matrix.package }}

crates/boojum-cuda/build/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ fn main() {
1919
use era_cudart_sys::{get_cuda_lib_path, get_cuda_version};
2020
let cuda_version =
2121
get_cuda_version().expect("Failed to determine the CUDA Toolkit version.");
22-
if !cuda_version.starts_with("12.") {
23-
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit 12.*.");
22+
if !(cuda_version.starts_with("12.") || cuda_version.starts_with("13.")) {
23+
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit versions 12.* and 13.*.");
2424
}
2525
let cudaarchs = std::env::var("CUDAARCHS").unwrap_or("native".to_string());
2626
let dst = cmake::Config::new("native")

crates/cudart-sys-bindings-generator/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ description = "CUDA Bindings generator for ZKsync"
1212
publish = false
1313

1414
[dependencies]
15-
bindgen = "0.69"
15+
bindgen = "0.72"
1616
era_cudart_sys.workspace = true

crates/cudart-sys-bindings-generator/src/main.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use bindgen::callbacks::{EnumVariantValue, ParseCallbacks};
1+
use bindgen::callbacks::{EnumVariantValue, ItemInfo, ParseCallbacks};
22
use bindgen::{BindgenError, Bindings};
33
use era_cudart_sys::get_cuda_include_path;
44

@@ -47,9 +47,9 @@ impl ParseCallbacks for CudaParseCallbacks {
4747
}
4848
}
4949

50-
fn item_name(&self, _original_item_name: &str) -> Option<String> {
50+
fn item_name(&self, item_info: ItemInfo) -> Option<String> {
5151
let from = |s: &str| Some(String::from(s));
52-
match _original_item_name {
52+
match item_info.name {
5353
"cudaDeviceAttr" => from("CudaDeviceAttr"),
5454
"cudaLimit" => from("CudaLimit"),
5555
"cudaError" => from("CudaError"),
@@ -106,7 +106,7 @@ fn generate_bindings<T: Into<String>>(header: T) -> Result<Bindings, BindgenErro
106106
.allowlist_function("cudaDeviceSynchronize")
107107
.allowlist_function("cudaGetDevice")
108108
.allowlist_function("cudaGetDeviceCount")
109-
.allowlist_function("cudaGetDeviceProperties_v2")
109+
.allowlist_function("cudaGetDeviceProperties")
110110
.allowlist_function("cudaSetDevice")
111111
// error handling
112112
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html

crates/cudart-sys/build.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ fn main() {
1010
} else {
1111
let cuda_version =
1212
get_cuda_version().expect("Failed to determine the CUDA Toolkit version.");
13-
if !cuda_version.starts_with("12.") {
14-
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit version 12.*.");
13+
if !(cuda_version.starts_with("12.") || cuda_version.starts_with("13.")) {
14+
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit versions 12.* and 13.*.");
1515
}
1616
let cuda_lib_path = get_cuda_lib_path().unwrap();
1717
let cuda_lib_path_str = cuda_lib_path.to_str().unwrap();

crates/cudart-sys/src/bindings.rs

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ pub enum CudaError {
100100
ErrorJitCompilationDisabled = 223,
101101
ErrorUnsupportedExecAffinity = 224,
102102
ErrorUnsupportedDevSideSync = 225,
103+
ErrorContained = 226,
103104
ErrorInvalidSource = 300,
104105
ErrorFileNotFound = 301,
105106
ErrorSharedObjectSymbolNotFound = 302,
@@ -129,6 +130,7 @@ pub enum CudaError {
129130
ErrorInvalidPc = 718,
130131
ErrorLaunchFailure = 719,
131132
ErrorCooperativeLaunchTooLarge = 720,
133+
ErrorTensorMemoryLeak = 721,
132134
ErrorNotPermitted = 800,
133135
ErrorNotSupported = 801,
134136
ErrorSystemNotReady = 802,
@@ -218,6 +220,7 @@ pub struct CudaPointerAttributes {
218220
pub device: ::std::os::raw::c_int,
219221
pub devicePointer: *mut ::std::os::raw::c_void,
220222
pub hostPointer: *mut ::std::os::raw::c_void,
223+
pub reserved: [::std::os::raw::c_long; 8usize],
221224
}
222225
#[repr(C)]
223226
#[derive(Debug, Copy, Clone)]
@@ -279,10 +282,6 @@ pub enum CudaLimit {
279282
MaxL2FetchGranularity = 5,
280283
PersistingL2CacheSize = 6,
281284
}
282-
impl CudaDeviceAttr {
283-
pub const MaxTimelineSemaphoreInteropSupported: CudaDeviceAttr =
284-
CudaDeviceAttr::TimelineSemaphoreInteropSupported;
285-
}
286285
#[repr(u32)]
287286
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
288287
pub enum CudaDeviceAttr {
@@ -380,7 +379,7 @@ pub enum CudaDeviceAttr {
380379
Reserved93 = 93,
381380
Reserved94 = 94,
382381
CooperativeLaunch = 95,
383-
CooperativeMultiDeviceLaunch = 96,
382+
Reserved96 = 96,
384383
MaxSharedMemoryPerBlockOptin = 97,
385384
CanFlushRemoteWrites = 98,
386385
HostRegisterSupported = 99,
@@ -414,7 +413,16 @@ pub enum CudaDeviceAttr {
414413
MpsEnabled = 133,
415414
HostNumaId = 134,
416415
D3D12CigSupported = 135,
417-
Max = 136,
416+
VulkanCigSupported = 138,
417+
GpuPciDeviceId = 139,
418+
GpuPciSubsystemId = 140,
419+
Reserved141 = 141,
420+
HostNumaMemoryPoolsSupported = 142,
421+
HostNumaMultinodeIpcSupported = 143,
422+
HostMemoryPoolsSupported = 144,
423+
Reserved145 = 145,
424+
OnlyPartialHostNativeAtomicSupported = 147,
425+
Max = 148,
418426
}
419427
#[repr(u32)]
420428
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
@@ -428,6 +436,9 @@ pub enum CudaMemPoolAttribute {
428436
AttrUsedMemCurrent = 7,
429437
AttrUsedMemHigh = 8,
430438
}
439+
impl CudaMemLocationType {
440+
pub const None: CudaMemLocationType = CudaMemLocationType::Invalid;
441+
}
431442
#[repr(u32)]
432443
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
433444
pub enum CudaMemLocationType {
@@ -461,6 +472,7 @@ pub struct CudaMemAccessDesc {
461472
pub enum CudaMemAllocationType {
462473
Invalid = 0,
463474
Pinned = 1,
475+
Managed = 2,
464476
Max = 2147483647,
465477
}
466478
#[repr(u32)]
@@ -504,21 +516,16 @@ pub struct CudaDeviceProperties {
504516
pub maxThreadsPerBlock: ::std::os::raw::c_int,
505517
pub maxThreadsDim: [::std::os::raw::c_int; 3usize],
506518
pub maxGridSize: [::std::os::raw::c_int; 3usize],
507-
pub clockRate: ::std::os::raw::c_int,
508519
pub totalConstMem: usize,
509520
pub major: ::std::os::raw::c_int,
510521
pub minor: ::std::os::raw::c_int,
511522
pub textureAlignment: usize,
512523
pub texturePitchAlignment: usize,
513-
pub deviceOverlap: ::std::os::raw::c_int,
514524
pub multiProcessorCount: ::std::os::raw::c_int,
515-
pub kernelExecTimeoutEnabled: ::std::os::raw::c_int,
516525
pub integrated: ::std::os::raw::c_int,
517526
pub canMapHostMemory: ::std::os::raw::c_int,
518-
pub computeMode: ::std::os::raw::c_int,
519527
pub maxTexture1D: ::std::os::raw::c_int,
520528
pub maxTexture1DMipmap: ::std::os::raw::c_int,
521-
pub maxTexture1DLinear: ::std::os::raw::c_int,
522529
pub maxTexture2D: [::std::os::raw::c_int; 2usize],
523530
pub maxTexture2DMipmap: [::std::os::raw::c_int; 2usize],
524531
pub maxTexture2DLinear: [::std::os::raw::c_int; 3usize],
@@ -545,7 +552,6 @@ pub struct CudaDeviceProperties {
545552
pub tccDriver: ::std::os::raw::c_int,
546553
pub asyncEngineCount: ::std::os::raw::c_int,
547554
pub unifiedAddressing: ::std::os::raw::c_int,
548-
pub memoryClockRate: ::std::os::raw::c_int,
549555
pub memoryBusWidth: ::std::os::raw::c_int,
550556
pub l2CacheSize: ::std::os::raw::c_int,
551557
pub persistingL2CacheMaxSize: ::std::os::raw::c_int,
@@ -559,13 +565,11 @@ pub struct CudaDeviceProperties {
559565
pub isMultiGpuBoard: ::std::os::raw::c_int,
560566
pub multiGpuBoardGroupID: ::std::os::raw::c_int,
561567
pub hostNativeAtomicSupported: ::std::os::raw::c_int,
562-
pub singleToDoublePrecisionPerfRatio: ::std::os::raw::c_int,
563568
pub pageableMemoryAccess: ::std::os::raw::c_int,
564569
pub concurrentManagedAccess: ::std::os::raw::c_int,
565570
pub computePreemptionSupported: ::std::os::raw::c_int,
566571
pub canUseHostPointerForRegisteredMem: ::std::os::raw::c_int,
567572
pub cooperativeLaunch: ::std::os::raw::c_int,
568-
pub cooperativeMultiDeviceLaunch: ::std::os::raw::c_int,
569573
pub sharedMemPerBlockOptin: usize,
570574
pub pageableMemoryAccessUsesHostPageTables: ::std::os::raw::c_int,
571575
pub directManagedMemAccessFromHost: ::std::os::raw::c_int,
@@ -585,9 +589,14 @@ pub struct CudaDeviceProperties {
585589
pub ipcEventSupported: ::std::os::raw::c_int,
586590
pub clusterLaunch: ::std::os::raw::c_int,
587591
pub unifiedFunctionPointers: ::std::os::raw::c_int,
588-
pub reserved2: [::std::os::raw::c_int; 2usize],
589-
pub reserved1: [::std::os::raw::c_int; 1usize],
590-
pub reserved: [::std::os::raw::c_int; 60usize],
592+
pub deviceNumaConfig: ::std::os::raw::c_int,
593+
pub deviceNumaId: ::std::os::raw::c_int,
594+
pub mpsEnabled: ::std::os::raw::c_int,
595+
pub hostNumaId: ::std::os::raw::c_int,
596+
pub gpuPciDeviceID: ::std::os::raw::c_uint,
597+
pub gpuPciSubsystemID: ::std::os::raw::c_uint,
598+
pub hostNumaMultinodeIpcSupported: ::std::os::raw::c_int,
599+
pub reserved: [::std::os::raw::c_int; 56usize],
591600
}
592601
pub use self::CudaError as cudaError_t;
593602
#[repr(C)]
@@ -641,9 +650,11 @@ pub enum CudaLaunchAttributeID {
641650
Priority = 8,
642651
MemSyncDomainMap = 9,
643652
MemSyncDomain = 10,
653+
PreferredClusterDimension = 11,
644654
LaunchCompletionEvent = 12,
645655
DeviceUpdatableKernelNode = 13,
646656
PreferredSharedMemoryCarveout = 14,
657+
NvlinkUtilCentricScheduling = 16,
647658
}
648659
#[repr(C)]
649660
#[derive(Copy, Clone)]
@@ -659,9 +670,11 @@ pub union CudaLaunchAttributeValue {
659670
pub priority: ::std::os::raw::c_int,
660671
pub memSyncDomainMap: cudaLaunchMemSyncDomainMap,
661672
pub memSyncDomain: CudaLaunchMemSyncDomain,
662-
pub launchCompletionEvent: cudaLaunchAttributeValue__bindgen_ty_3,
663-
pub deviceUpdatableKernelNode: cudaLaunchAttributeValue__bindgen_ty_4,
673+
pub preferredClusterDim: cudaLaunchAttributeValue__bindgen_ty_3,
674+
pub launchCompletionEvent: cudaLaunchAttributeValue__bindgen_ty_4,
675+
pub deviceUpdatableKernelNode: cudaLaunchAttributeValue__bindgen_ty_5,
664676
pub sharedMemCarveout: ::std::os::raw::c_uint,
677+
pub nvlinkUtilCentricScheduling: ::std::os::raw::c_uint,
665678
}
666679
#[repr(C)]
667680
#[derive(Debug, Copy, Clone)]
@@ -680,12 +693,19 @@ pub struct cudaLaunchAttributeValue__bindgen_ty_2 {
680693
#[repr(C)]
681694
#[derive(Debug, Copy, Clone)]
682695
pub struct cudaLaunchAttributeValue__bindgen_ty_3 {
696+
pub x: ::std::os::raw::c_uint,
697+
pub y: ::std::os::raw::c_uint,
698+
pub z: ::std::os::raw::c_uint,
699+
}
700+
#[repr(C)]
701+
#[derive(Debug, Copy, Clone)]
702+
pub struct cudaLaunchAttributeValue__bindgen_ty_4 {
683703
pub event: cudaEvent_t,
684704
pub flags: ::std::os::raw::c_int,
685705
}
686706
#[repr(C)]
687707
#[derive(Debug, Copy, Clone)]
688-
pub struct cudaLaunchAttributeValue__bindgen_ty_4 {
708+
pub struct cudaLaunchAttributeValue__bindgen_ty_5 {
689709
pub deviceUpdatable: ::std::os::raw::c_int,
690710
pub devNode: cudaGraphDeviceNode_t,
691711
}
@@ -733,7 +753,7 @@ cuda_fn_and_stub! {
733753
pub fn cudaGetDeviceCount(count: *mut ::std::os::raw::c_int) -> cudaError_t;
734754
}
735755
cuda_fn_and_stub! {
736-
pub fn cudaGetDeviceProperties_v2(
756+
pub fn cudaGetDeviceProperties(
737757
prop: *mut CudaDeviceProperties,
738758
device: ::std::os::raw::c_int,
739759
) -> cudaError_t;

crates/cudart/src/device.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ pub fn get_device() -> CudaResult<i32> {
3333

3434
pub fn get_device_properties(device_id: i32) -> CudaResult<CudaDeviceProperties> {
3535
let mut props = MaybeUninit::<CudaDeviceProperties>::uninit();
36-
unsafe { cudaGetDeviceProperties_v2(props.as_mut_ptr(), device_id).wrap_maybe_uninit(props) }
36+
unsafe { cudaGetDeviceProperties(props.as_mut_ptr(), device_id).wrap_maybe_uninit(props) }
3737
}
3838

3939
pub fn set_device(device_id: i32) -> CudaResult<()> {

0 commit comments

Comments (0)