Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: [ubuntu-24.04-github-hosted-32core]
strategy:
matrix:
cuda: [ "12.0.0-devel-ubuntu20.04", "12.5.0-devel-ubuntu20.04" ]
cuda: [ "12.0.0-devel-ubuntu20.04", "13.0.0-devel-ubuntu24.04" ]
container:
image: nvidia/cuda:${{ matrix.cuda }}
env:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/publish-crates.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
publish-crates:
runs-on: [ubuntu-24.04-github-hosted-32core]
container:
image: nvidia/cuda:12.5.0-devel-ubuntu20.04
image: nvidia/cuda:13.0.0-devel-ubuntu24.04
env:
BELLMAN_CUDA_DIR: ${{ github.workspace }}/bellman-cuda
CUDAARCHS: 89
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release-please.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
process-release:
runs-on: [ubuntu-24.04-github-hosted-32core]
container:
image: nvidia/cuda:12.5.0-devel-ubuntu20.04
image: nvidia/cuda:13.0.0-devel-ubuntu24.04
env:
BELLMAN_CUDA_DIR: ${{ github.workspace }}/bellman-cuda
CUDAARCHS: 89
Expand Down
18 changes: 9 additions & 9 deletions .github/workflows/test-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
runs-on: [ ubuntu-latest ]
strategy:
matrix:
cuda: [ "12.0.0-devel-ubuntu20.04", "12.5.0-devel-ubuntu20.04" ]
cuda: [ "12.0.0-devel-ubuntu20.04", "13.0.0-devel-ubuntu24.04" ]
# TODO: Building the whole workspace with `--test` currently fails with link-time errors,
# presumably due to either `gpu-ffi` or `gpu-prover` crates.
# So for now we point at specific packages we want to test.
Expand Down Expand Up @@ -125,21 +125,21 @@ jobs:
name: zksync-crypto-gpu-12.0.0-devel-ubuntu20.04-${{ matrix.package }}-test-binary
path: zksync-crypto-gpu-test-binary/12.0/

- name: Download test binary built with CUDA 12.5
- name: Download test binary built with CUDA 13.0
uses: actions/download-artifact@v4
with:
name: zksync-crypto-gpu-12.5.0-devel-ubuntu20.04-${{ matrix.package }}-test-binary
path: zksync-crypto-gpu-test-binary/12.5/
name: zksync-crypto-gpu-13.0.0-devel-ubuntu24.04-${{ matrix.package }}-test-binary
path: zksync-crypto-gpu-test-binary/13.0/

- name: Run test binary built with CUDA 12.5
id: test_cuda_12_5
- name: Run test binary built with CUDA 13.0
id: test_cuda_13_0
continue-on-error: true
run: |
chmod +x zksync-crypto-gpu-test-binary/12.5/${{ matrix.package }}
zksync-crypto-gpu-test-binary/12.5/${{ matrix.package }}
chmod +x zksync-crypto-gpu-test-binary/13.0/${{ matrix.package }}
zksync-crypto-gpu-test-binary/13.0/${{ matrix.package }}

- name: Run test binary built with CUDA 12.0
if: steps.test_cuda_12_5.outcome == 'failure' || steps.test_cuda_12_5.outcome == 'success'
if: steps.test_cuda_13_0.outcome == 'failure' || steps.test_cuda_13_0.outcome == 'success'
run: |
chmod +x zksync-crypto-gpu-test-binary/12.0/${{ matrix.package }}
zksync-crypto-gpu-test-binary/12.0/${{ matrix.package }}
4 changes: 2 additions & 2 deletions crates/boojum-cuda/build/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ fn main() {
use era_cudart_sys::{get_cuda_lib_path, get_cuda_version};
let cuda_version =
get_cuda_version().expect("Failed to determine the CUDA Toolkit version.");
if !cuda_version.starts_with("12.") {
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit 12.*.");
if !(cuda_version.starts_with("12.") || cuda_version.starts_with("13.")) {
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit versions 12.* and 13.*.");
}
let cudaarchs = std::env::var("CUDAARCHS").unwrap_or("native".to_string());
let dst = cmake::Config::new("native")
Expand Down
2 changes: 1 addition & 1 deletion crates/cudart-sys-bindings-generator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ description = "CUDA Bindings generator for ZKsync"
publish = false

[dependencies]
bindgen = "0.69"
bindgen = "0.72"
era_cudart_sys.workspace = true
8 changes: 4 additions & 4 deletions crates/cudart-sys-bindings-generator/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use bindgen::callbacks::{EnumVariantValue, ParseCallbacks};
use bindgen::callbacks::{EnumVariantValue, ItemInfo, ParseCallbacks};
use bindgen::{BindgenError, Bindings};
use era_cudart_sys::get_cuda_include_path;

Expand Down Expand Up @@ -47,9 +47,9 @@ impl ParseCallbacks for CudaParseCallbacks {
}
}

fn item_name(&self, _original_item_name: &str) -> Option<String> {
fn item_name(&self, item_info: ItemInfo) -> Option<String> {
let from = |s: &str| Some(String::from(s));
match _original_item_name {
match item_info.name {
"cudaDeviceAttr" => from("CudaDeviceAttr"),
"cudaLimit" => from("CudaLimit"),
"cudaError" => from("CudaError"),
Expand Down Expand Up @@ -106,7 +106,7 @@ fn generate_bindings<T: Into<String>>(header: T) -> Result<Bindings, BindgenErro
.allowlist_function("cudaDeviceSynchronize")
.allowlist_function("cudaGetDevice")
.allowlist_function("cudaGetDeviceCount")
.allowlist_function("cudaGetDeviceProperties_v2")
.allowlist_function("cudaGetDeviceProperties")
.allowlist_function("cudaSetDevice")
// error handling
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
Expand Down
4 changes: 2 additions & 2 deletions crates/cudart-sys/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ fn main() {
} else {
let cuda_version =
get_cuda_version().expect("Failed to determine the CUDA Toolkit version.");
if !cuda_version.starts_with("12.") {
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit version 12.*.");
if !(cuda_version.starts_with("12.") || cuda_version.starts_with("13.")) {
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit versions 12.* and 13.*.");
}
let cuda_lib_path = get_cuda_lib_path().unwrap();
let cuda_lib_path_str = cuda_lib_path.to_str().unwrap();
Expand Down
62 changes: 41 additions & 21 deletions crates/cudart-sys/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ pub enum CudaError {
ErrorJitCompilationDisabled = 223,
ErrorUnsupportedExecAffinity = 224,
ErrorUnsupportedDevSideSync = 225,
ErrorContained = 226,
ErrorInvalidSource = 300,
ErrorFileNotFound = 301,
ErrorSharedObjectSymbolNotFound = 302,
Expand Down Expand Up @@ -129,6 +130,7 @@ pub enum CudaError {
ErrorInvalidPc = 718,
ErrorLaunchFailure = 719,
ErrorCooperativeLaunchTooLarge = 720,
ErrorTensorMemoryLeak = 721,
ErrorNotPermitted = 800,
ErrorNotSupported = 801,
ErrorSystemNotReady = 802,
Expand Down Expand Up @@ -218,6 +220,7 @@ pub struct CudaPointerAttributes {
pub device: ::std::os::raw::c_int,
pub devicePointer: *mut ::std::os::raw::c_void,
pub hostPointer: *mut ::std::os::raw::c_void,
pub reserved: [::std::os::raw::c_long; 8usize],
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
Expand Down Expand Up @@ -279,10 +282,6 @@ pub enum CudaLimit {
MaxL2FetchGranularity = 5,
PersistingL2CacheSize = 6,
}
impl CudaDeviceAttr {
pub const MaxTimelineSemaphoreInteropSupported: CudaDeviceAttr =
CudaDeviceAttr::TimelineSemaphoreInteropSupported;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
pub enum CudaDeviceAttr {
Expand Down Expand Up @@ -380,7 +379,7 @@ pub enum CudaDeviceAttr {
Reserved93 = 93,
Reserved94 = 94,
CooperativeLaunch = 95,
CooperativeMultiDeviceLaunch = 96,
Reserved96 = 96,
MaxSharedMemoryPerBlockOptin = 97,
CanFlushRemoteWrites = 98,
HostRegisterSupported = 99,
Expand Down Expand Up @@ -414,7 +413,16 @@ pub enum CudaDeviceAttr {
MpsEnabled = 133,
HostNumaId = 134,
D3D12CigSupported = 135,
Max = 136,
VulkanCigSupported = 138,
GpuPciDeviceId = 139,
GpuPciSubsystemId = 140,
Reserved141 = 141,
HostNumaMemoryPoolsSupported = 142,
HostNumaMultinodeIpcSupported = 143,
HostMemoryPoolsSupported = 144,
Reserved145 = 145,
OnlyPartialHostNativeAtomicSupported = 147,
Max = 148,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
Expand All @@ -428,6 +436,9 @@ pub enum CudaMemPoolAttribute {
AttrUsedMemCurrent = 7,
AttrUsedMemHigh = 8,
}
impl CudaMemLocationType {
pub const None: CudaMemLocationType = CudaMemLocationType::Invalid;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
pub enum CudaMemLocationType {
Expand Down Expand Up @@ -461,6 +472,7 @@ pub struct CudaMemAccessDesc {
pub enum CudaMemAllocationType {
Invalid = 0,
Pinned = 1,
Managed = 2,
Max = 2147483647,
}
#[repr(u32)]
Expand Down Expand Up @@ -504,21 +516,16 @@ pub struct CudaDeviceProperties {
pub maxThreadsPerBlock: ::std::os::raw::c_int,
pub maxThreadsDim: [::std::os::raw::c_int; 3usize],
pub maxGridSize: [::std::os::raw::c_int; 3usize],
pub clockRate: ::std::os::raw::c_int,
pub totalConstMem: usize,
pub major: ::std::os::raw::c_int,
pub minor: ::std::os::raw::c_int,
pub textureAlignment: usize,
pub texturePitchAlignment: usize,
pub deviceOverlap: ::std::os::raw::c_int,
pub multiProcessorCount: ::std::os::raw::c_int,
pub kernelExecTimeoutEnabled: ::std::os::raw::c_int,
pub integrated: ::std::os::raw::c_int,
pub canMapHostMemory: ::std::os::raw::c_int,
pub computeMode: ::std::os::raw::c_int,
pub maxTexture1D: ::std::os::raw::c_int,
pub maxTexture1DMipmap: ::std::os::raw::c_int,
pub maxTexture1DLinear: ::std::os::raw::c_int,
pub maxTexture2D: [::std::os::raw::c_int; 2usize],
pub maxTexture2DMipmap: [::std::os::raw::c_int; 2usize],
pub maxTexture2DLinear: [::std::os::raw::c_int; 3usize],
Expand All @@ -545,7 +552,6 @@ pub struct CudaDeviceProperties {
pub tccDriver: ::std::os::raw::c_int,
pub asyncEngineCount: ::std::os::raw::c_int,
pub unifiedAddressing: ::std::os::raw::c_int,
pub memoryClockRate: ::std::os::raw::c_int,
pub memoryBusWidth: ::std::os::raw::c_int,
pub l2CacheSize: ::std::os::raw::c_int,
pub persistingL2CacheMaxSize: ::std::os::raw::c_int,
Expand All @@ -559,13 +565,11 @@ pub struct CudaDeviceProperties {
pub isMultiGpuBoard: ::std::os::raw::c_int,
pub multiGpuBoardGroupID: ::std::os::raw::c_int,
pub hostNativeAtomicSupported: ::std::os::raw::c_int,
pub singleToDoublePrecisionPerfRatio: ::std::os::raw::c_int,
pub pageableMemoryAccess: ::std::os::raw::c_int,
pub concurrentManagedAccess: ::std::os::raw::c_int,
pub computePreemptionSupported: ::std::os::raw::c_int,
pub canUseHostPointerForRegisteredMem: ::std::os::raw::c_int,
pub cooperativeLaunch: ::std::os::raw::c_int,
pub cooperativeMultiDeviceLaunch: ::std::os::raw::c_int,
pub sharedMemPerBlockOptin: usize,
pub pageableMemoryAccessUsesHostPageTables: ::std::os::raw::c_int,
pub directManagedMemAccessFromHost: ::std::os::raw::c_int,
Expand All @@ -585,9 +589,14 @@ pub struct CudaDeviceProperties {
pub ipcEventSupported: ::std::os::raw::c_int,
pub clusterLaunch: ::std::os::raw::c_int,
pub unifiedFunctionPointers: ::std::os::raw::c_int,
pub reserved2: [::std::os::raw::c_int; 2usize],
pub reserved1: [::std::os::raw::c_int; 1usize],
pub reserved: [::std::os::raw::c_int; 60usize],
pub deviceNumaConfig: ::std::os::raw::c_int,
pub deviceNumaId: ::std::os::raw::c_int,
pub mpsEnabled: ::std::os::raw::c_int,
pub hostNumaId: ::std::os::raw::c_int,
pub gpuPciDeviceID: ::std::os::raw::c_uint,
pub gpuPciSubsystemID: ::std::os::raw::c_uint,
pub hostNumaMultinodeIpcSupported: ::std::os::raw::c_int,
pub reserved: [::std::os::raw::c_int; 56usize],
}
pub use self::CudaError as cudaError_t;
#[repr(C)]
Expand Down Expand Up @@ -641,9 +650,11 @@ pub enum CudaLaunchAttributeID {
Priority = 8,
MemSyncDomainMap = 9,
MemSyncDomain = 10,
PreferredClusterDimension = 11,
LaunchCompletionEvent = 12,
DeviceUpdatableKernelNode = 13,
PreferredSharedMemoryCarveout = 14,
NvlinkUtilCentricScheduling = 16,
}
#[repr(C)]
#[derive(Copy, Clone)]
Expand All @@ -659,9 +670,11 @@ pub union CudaLaunchAttributeValue {
pub priority: ::std::os::raw::c_int,
pub memSyncDomainMap: cudaLaunchMemSyncDomainMap,
pub memSyncDomain: CudaLaunchMemSyncDomain,
pub launchCompletionEvent: cudaLaunchAttributeValue__bindgen_ty_3,
pub deviceUpdatableKernelNode: cudaLaunchAttributeValue__bindgen_ty_4,
pub preferredClusterDim: cudaLaunchAttributeValue__bindgen_ty_3,
pub launchCompletionEvent: cudaLaunchAttributeValue__bindgen_ty_4,
pub deviceUpdatableKernelNode: cudaLaunchAttributeValue__bindgen_ty_5,
pub sharedMemCarveout: ::std::os::raw::c_uint,
pub nvlinkUtilCentricScheduling: ::std::os::raw::c_uint,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
Expand All @@ -680,12 +693,19 @@ pub struct cudaLaunchAttributeValue__bindgen_ty_2 {
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudaLaunchAttributeValue__bindgen_ty_3 {
pub x: ::std::os::raw::c_uint,
pub y: ::std::os::raw::c_uint,
pub z: ::std::os::raw::c_uint,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudaLaunchAttributeValue__bindgen_ty_4 {
pub event: cudaEvent_t,
pub flags: ::std::os::raw::c_int,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudaLaunchAttributeValue__bindgen_ty_4 {
pub struct cudaLaunchAttributeValue__bindgen_ty_5 {
pub deviceUpdatable: ::std::os::raw::c_int,
pub devNode: cudaGraphDeviceNode_t,
}
Expand Down Expand Up @@ -733,7 +753,7 @@ cuda_fn_and_stub! {
pub fn cudaGetDeviceCount(count: *mut ::std::os::raw::c_int) -> cudaError_t;
}
cuda_fn_and_stub! {
pub fn cudaGetDeviceProperties_v2(
pub fn cudaGetDeviceProperties(
prop: *mut CudaDeviceProperties,
device: ::std::os::raw::c_int,
) -> cudaError_t;
Expand Down
2 changes: 1 addition & 1 deletion crates/cudart/src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pub fn get_device() -> CudaResult<i32> {

pub fn get_device_properties(device_id: i32) -> CudaResult<CudaDeviceProperties> {
let mut props = MaybeUninit::<CudaDeviceProperties>::uninit();
unsafe { cudaGetDeviceProperties_v2(props.as_mut_ptr(), device_id).wrap_maybe_uninit(props) }
unsafe { cudaGetDeviceProperties(props.as_mut_ptr(), device_id).wrap_maybe_uninit(props) }
}

pub fn set_device(device_id: i32) -> CudaResult<()> {
Expand Down
4 changes: 2 additions & 2 deletions crates/gpu-ffi/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ fn main() {
} else {
let cuda_version =
get_cuda_version().expect("Failed to determine the CUDA Toolkit version.");
if !cuda_version.starts_with("12.") {
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit version 12.*.");
if !(cuda_version.starts_with("12.") || cuda_version.starts_with("13.")) {
println!("cargo::warning=CUDA Toolkit version {cuda_version} detected. This crate is only tested with CUDA Toolkit versions 12.* and 13.*.");
}
let bellman_cuda_dir = var("BELLMAN_CUDA_DIR").unwrap();
let bellman_cuda_path = Path::new(&bellman_cuda_dir);
Expand Down
Loading