compute_120?

brandonros · brandonros · commit ad4181e2771e · 2025-06-15T16:55:47.000Z
diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs
@@ -163,7 +163,7 @@ impl CudaBuilder {
             generate_line_info: true,
             nvvm_opts: true,
             arch: if cfg!(feature = "nvvm-v19") {
-                NvvmArch::Compute100
+                NvvmArch::Compute120
             } else if cfg!(feature = "nvvm-v7") {
                 NvvmArch::default()
             } else {
diff --git a/crates/cuda_std/src/cfg.rs b/crates/cuda_std/src/cfg.rs
@@ -20,7 +20,8 @@ pub enum ComputeCapability {
     Compute87,
     Compute89,
     Compute90,
-    Compute100
+    Compute100,
+    Compute120,
 }
 
 impl ComputeCapability {
@@ -51,7 +52,8 @@ impl ComputeCapability {
             "870" => ComputeCapability::Compute87,  // Ampere (Jetson AGX Orin)
             "890" => ComputeCapability::Compute89,  // Ada Lovelace (RTX 40 series)
             "900" => ComputeCapability::Compute90,  // Hopper (H100)
-            "1000" => ComputeCapability::Compute100, // Blackwell (RTX 50 series, H200, B100)
+            "1000" => ComputeCapability::Compute100, // Blackwell data center (B100, B200; CUDA 12.8 and later)
+            "1200" => ComputeCapability::Compute120, // Blackwell consumer (RTX 50 series; CUDA 12.8 and later)
             _ => panic!("CUDA_ARCH had an invalid value"),
         }
     }
diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs
@@ -60,6 +60,7 @@ pub enum JitTarget {
     Compute89 = 89,
     Compute90 = 90,
     Compute100 = 100,
+    Compute120 = 120,
 }
 
 /// How to handle cases where a loaded module's data does not contain an exact match for the
diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs
@@ -259,6 +259,7 @@ impl FromStr for NvvmOption {
                     "89" => NvvmArch::Compute89,
                     "90" => NvvmArch::Compute90,
                     "100" => NvvmArch::Compute100,
+                    "120" => NvvmArch::Compute120,
                     _ => return Err("unknown arch"),
                 };
                 Self::Arch(arch)
@@ -288,6 +289,7 @@ pub enum NvvmArch {
     Compute89,
     Compute90,
     Compute100,
+    Compute120,
 }
 
 impl Display for NvvmArch {
@@ -460,6 +462,7 @@ mod tests {
             "-arch=compute_89",
             "-arch=compute_90",
             "-arch=compute_100",
+            "-arch=compute_120",
             "-ftz=1",
             "-prec-sqrt=0",
             "-prec-div=0",
@@ -486,6 +489,7 @@ mod tests {
             Arch(Compute89),
             Arch(Compute90),
             Arch(Compute100),
+            Arch(Compute120),
             Ftz,
             FastSqrt,
             FastDiv,
diff --git a/crates/rustc_codegen_nvvm_v19/build.rs b/crates/rustc_codegen_nvvm_v19/build.rs
@@ -19,9 +19,9 @@ fn main() {
     rustc_llvm_build();
 
     // this is set by cuda_builder, but in case somebody is using the codegen
-    // manually, default to 1000 (which is what nvvm defaults to).
+    // manually, default to 1200.
     if option_env!("CUDA_ARCH").is_none() {
-        println!("cargo:rustc-env=CUDA_ARCH=1000")
+        println!("cargo:rustc-env=CUDA_ARCH=1200")
     }
 }
 
diff --git a/crates/rustc_codegen_nvvm_v19/src/back.rs b/crates/rustc_codegen_nvvm_v19/src/back.rs
@@ -101,7 +101,7 @@ pub fn target_machine_factory(
     let triple = sess.target.llvm_target.clone().to_string();
     let cpu_string = sess.opts.cg.target_cpu
         .as_deref()
-        .unwrap_or("sm_100") // Use a more compatible target
+        .unwrap_or("sm_120")
         .to_string();
     let features_string = "".to_string();
     let trap_unreachable = sess
diff --git a/crates/rustc_codegen_nvvm_v19/src/target.rs b/crates/rustc_codegen_nvvm_v19/src/target.rs
@@ -22,7 +22,7 @@ pub fn target() -> Target {
     options.linker_flavor = LinkerFlavor::Ptx;
     // nvvm does all the linking for us, but technically its not a linker
     options.linker = None;
-    options.cpu = "sm_100".into();
+    options.cpu = "sm_120".into();
     options.max_atomic_width = Some(64);
     // Unwinding on CUDA is neither feasible nor useful.
     options.panic_strategy = PanicStrategy::Abort;
diff --git a/examples/cuda/vecadd/kernels/Cargo.toml b/examples/cuda/vecadd/kernels/Cargo.toml
@@ -5,6 +5,8 @@ edition = "2024"
 
 [dependencies]
 cuda_std = { path = "../../../../crates/cuda_std" }
+rand_core = { version = "0.9.3" }
+rand_xoshiro = { version = "0.7.0", default-features = false }
 
 [lib]
 crate-type = ["cdylib", "rlib"]
diff --git a/examples/cuda/vecadd/kernels/src/lib.rs b/examples/cuda/vecadd/kernels/src/lib.rs
@@ -1,5 +1,34 @@
 use cuda_std::prelude::*;
 
+use rand_core::{SeedableRng, RngCore};
+use rand_xoshiro::Xoroshiro128StarStar;
+
+const BASE64_CHARS: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+fn splitmix64(mut x: u64) -> u64 {
+    x = x.wrapping_add(0x9e3779b97f4a7c15u64);
+    x = (x ^ (x >> 30)).wrapping_mul(0xbf58476d1ce4e5b9u64);
+    x = (x ^ (x >> 27)).wrapping_mul(0x94d049bb133111ebu64);
+    x ^ (x >> 31)
+}
+
+pub fn generate_random_private_key(thread_idx: usize, rng_seed: u64) -> [u8; 32] {
+    let mixed_seed = splitmix64(rng_seed.wrapping_add(thread_idx as u64));
+    let mut private_key = [0u8; 32];
+    let mut rng = Xoroshiro128StarStar::seed_from_u64(mixed_seed);
+    rng.fill_bytes(&mut private_key);
+    private_key
+}
+
+pub fn generate_base64_nonce(thread_idx: usize, rng_seed: u64, nonce: &mut [u8]) {
+    let mixed_seed = splitmix64(rng_seed.wrapping_add(thread_idx as u64));
+    let mut rng = Xoroshiro128StarStar::seed_from_u64(mixed_seed);
+    for byte in nonce.iter_mut() {
+        let idx = (rng.next_u32() % 64) as usize;
+        *byte = BASE64_CHARS[idx];
+    }
+}
+
 #[kernel]
 #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
 pub unsafe fn vecadd(a: &[f32], b: &[f32], c: *mut f32) {

Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,8 @@ pub enum ComputeCapability {`
`20`	`20`	`Compute87,`
`21`	`21`	`Compute89,`
`22`	`22`	`Compute90,`
`23`		`- Compute100`
	`23`	`+ Compute100,`
	`24`	`+ Compute120,`
`24`	`25`	`}`
`25`	`26`
`26`	`27`	`impl ComputeCapability {`
`@@ -51,7 +52,8 @@ impl ComputeCapability {`
`51`	`52`	`"870" => ComputeCapability::Compute87, // Ampere (Jetson AGX Orin)`
`52`	`53`	`"890" => ComputeCapability::Compute89, // Ada Lovelace (RTX 40 series)`
`53`	`54`	`"900" => ComputeCapability::Compute90, // Hopper (H100)`
`54`		`- "1000" => ComputeCapability::Compute100, // Blackwell (RTX 50 series, H200, B100)`
	`55`	`+ "1000" => ComputeCapability::Compute100, // Blackwell data center (B100, B200; CUDA 12.8 and later)`
	`56`	`+ "1200" => ComputeCapability::Compute120, // Blackwell consumer (RTX 50 series; CUDA 12.8 and later)`
`55`	`57`	`_ => panic!("CUDA_ARCH had an invalid value"),`
`56`	`58`	`}`
`57`	`59`	`}`
Original file line number	Diff line number	Diff line change
`@@ -60,6 +60,7 @@ pub enum JitTarget {`
`60`	`60`	`Compute89 = 89,`
`61`	`61`	`Compute90 = 90,`
`62`	`62`	`Compute100 = 100,`
	`63`	`+ Compute120 = 120,`
`63`	`64`	`}`
`64`	`65`
`65`	`66`	`/// How to handle cases where a loaded module's data does not contain an exact match for the`
Original file line number	Diff line number	Diff line change
`@@ -19,9 +19,9 @@ fn main() {`
`19`	`19`	`rustc_llvm_build();`
`20`	`20`
`21`	`21`	`// this is set by cuda_builder, but in case somebody is using the codegen`
`22`		`- // manually, default to 1000 (which is what nvvm defaults to).`
	`22`	`+ // manually, default to 1200.`
`23`	`23`	`if option_env!("CUDA_ARCH").is_none() {`
`24`		`- println!("cargo:rustc-env=CUDA_ARCH=1000")`
	`24`	`+ println!("cargo:rustc-env=CUDA_ARCH=1200")`
`25`	`25`	`}`
`26`	`26`	`}`
`27`	`27`