Feat: WIP module rework (cubins dont work lol)

RDambrosio016 · RDambrosio016 · commit e97db1e5a3b8 · 2022-01-19T13:01:16.000-05:00
diff --git a/crates/cuda_std/src/atomic/intrinsics.rs b/crates/cuda_std/src/atomic/intrinsics.rs
@@ -49,6 +49,7 @@ pub unsafe fn fence_acqrel_system() {
     asm!("fence.acq_rel.sys;");
 }
 
+#[allow(unused_macros)]
 macro_rules! load_scope {
     (volatile, $scope:ident) => {
         ""
diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml
@@ -19,6 +19,7 @@ cust_derive = { path = "../cust_derive", version = "0.1" }
 num-complex = { version = "0.4", optional = true }
 vek = { version = "0.15.1", optional = true, default-features = false }
 bytemuck = { version = "1.7.3", optional = true }
+goblin = { version = "0.4.3", default-features = false, features = ["elf32", "elf64", "std", "endian_fd"] }
 
 [features]
 default = ["bytemuck"]
diff --git a/crates/cust/resources/add.cubin b/crates/cust/resources/add.cubin
diff --git a/crates/cust/resources/add.fatbin b/crates/cust/resources/add.fatbin
diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs
@@ -8,6 +8,7 @@ use std::ffi::{c_void, CStr, CString};
 use std::fmt;
 use std::marker::PhantomData;
 use std::mem;
+use std::os::raw::c_uint;
 use std::path::Path;
 use std::ptr;
 
@@ -17,6 +18,114 @@ pub struct Module {
     inner: cuda::CUmodule,
 }
 
+/// The possible optimization levels when JIT compiling a PTX module. `O4` by default (most optimized).
+#[repr(u32)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum OptLevel {
+    O0 = 0,
+    O1 = 1,
+    O2 = 2,
+    O3 = 3,
+    O4 = 4,
+}
+
+/// The possible targets when JIT compiling a PTX module.
+#[non_exhaustive]
+#[repr(u32)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum JitTarget {
+    Compute20 = 20,
+    Compute21 = 21,
+    Compute30 = 30,
+    Compute32 = 32,
+    Compute35 = 35,
+    Compute37 = 37,
+    Compute50 = 50,
+    Compute52 = 52,
+    Compute53 = 53,
+    Compute60 = 60,
+    Compute61 = 61,
+    Compute62 = 62,
+    Compute70 = 70,
+    Compute72 = 72,
+    Compute75 = 75,
+    Compute80 = 80,
+    Compute86 = 86,
+}
+
+/// How to handle cases where a loaded module's data does not contain an exact match for the
+/// specified architecture.
+#[repr(u32)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum JitFallback {
+    /// Prefer to compile PTX, if present, when an exact binary match is not found.
+    PreferPtx = 0,
+    /// Prefer to fall back to a compatible binary code match if an exact match is not found.
+    /// This means the driver may pick binary code for `7.0` if your device is `7.2`, for example.
+    PreferCompatibleBinary = 1,
+}
+
+/// Different options that could be applied when loading a module.
+#[non_exhaustive]
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum ModuleJitOption {
+    /// Specifies the maximum number of registers any compiled PTX is allowed to use.
+    MaxRegisters(u32),
+    /// Specifies the optimization level for the JIT compiler.
+    OptLevel(OptLevel),
+    /// Determines the PTX target from the current context's architecture. Cannot be combined with
+    /// [`ModuleJitOption::Target`].
+    DetermineTargetFromContext,
+    /// Specifies the target for the JIT compiler. Cannot be combined with [`ModuleJitOption::DetermineTargetFromContext`].
+    Target(JitTarget),
+    /// Specifies how to handle cases where a loaded module's data does not have an exact match for the specified
+    /// architecture.
+    Fallback(JitFallback),
+    /// Generates debug info in the compiled binary.
+    GenenerateDebugInfo(bool),
+    /// Generates line info in the compiled binary.
+    GenerateLineInfo(bool),
+}
+
+impl ModuleJitOption {
+    pub fn into_raw(opts: &[Self]) -> (Vec<cuda::CUjit_option>, Vec<*mut c_void>) {
+        let mut raw_opts = Vec::with_capacity(opts.len());
+        let mut raw_vals = Vec::with_capacity(opts.len());
+        for opt in opts {
+            match opt {
+                Self::MaxRegisters(regs) => {
+                    raw_opts.push(cuda::CUjit_option::CU_JIT_MAX_REGISTERS);
+                    raw_vals.push(regs as *const u32 as *mut _);
+                }
+                Self::OptLevel(level) => {
+                    raw_opts.push(cuda::CUjit_option::CU_JIT_OPTIMIZATION_LEVEL);
+                    raw_vals.push(level as *const OptLevel as *mut _);
+                }
+                Self::DetermineTargetFromContext => {
+                    raw_opts.push(cuda::CUjit_option::CU_JIT_TARGET_FROM_CUCONTEXT);
+                }
+                Self::Target(target) => {
+                    raw_opts.push(cuda::CUjit_option::CU_JIT_TARGET);
+                    raw_vals.push(target as *const JitTarget as *mut _);
+                }
+                Self::Fallback(fallback) => {
+                    raw_opts.push(cuda::CUjit_option::CU_JIT_FALLBACK_STRATEGY);
+                    raw_vals.push(fallback as *const JitFallback as *mut _);
+                }
+                Self::GenenerateDebugInfo(gen) => {
+                    raw_opts.push(cuda::CUjit_option::CU_JIT_GENERATE_DEBUG_INFO);
+                    raw_vals.push(gen as *const bool as *mut _);
+                }
+                Self::GenerateLineInfo(gen) => {
+                    raw_opts.push(cuda::CUjit_option::CU_JIT_GENERATE_LINE_INFO);
+                    raw_vals.push(gen as *const bool as *mut _)
+                }
+            }
+        }
+        (raw_opts, raw_vals)
+    }
+}
+
 #[cfg(unix)]
 fn path_to_bytes<P: AsRef<Path>>(path: P) -> Vec<u8> {
     use std::os::unix::ffi::OsStrExt;
@@ -66,12 +175,106 @@ impl Module {
         }
     }
 
+    /// Creates a new module by loading a fatbin (fat binary) file.
+    ///
+    /// Fatbin (fat binary) files bundle multiple PTX or cubin files. The driver will choose an
+    /// already-built cubin if one is present, and otherwise JIT compile any PTX in the file to cubin.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use cust::*;
+    /// # use std::error::Error;
+    /// # fn main() -> Result<(), Box<dyn Error>> {
+    /// # let _ctx = quick_init()?;
+    /// use cust::module::Module;
+    /// let fatbin_bytes = std::fs::read("./resources/add.cubin")?;
+    /// assert!(fatbin_bytes.contains(&0));
+    /// let module = Module::from_cubin(&fatbin_bytes, &[])?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn from_fatbin<T: AsRef<[u8]>>(
+        bytes: T,
+        options: &[ModuleJitOption],
+    ) -> CudaResult<Module> {
+        let mut bytes = bytes.as_ref().to_vec();
+        bytes.push(0);
+        // fatbins are just ELF files like cubins, and cuModuleLoadDataEx accepts ptx, cubin, and fatbin.
+        // We make the distinction in case we want to do anything extra in the future, and to
+        // keep things explicit for anyone reading the code.
+        Self::from_cubin(bytes, options)
+    }
+
+    pub unsafe fn from_fatbin_unchecked<T: AsRef<[u8]>>(
+        bytes: T,
+        options: &[ModuleJitOption],
+    ) -> CudaResult<Module> {
+        Self::from_cubin_unchecked(bytes, options)
+    }
+
+    pub fn from_cubin<T: AsRef<[u8]>>(bytes: T, options: &[ModuleJitOption]) -> CudaResult<Module> {
+        let bytes = bytes.as_ref();
+        goblin::elf::Elf::parse(bytes).expect("Cubin/Fatbin was not valid ELF!");
+        // SAFETY: we verified the bytes were valid ELF
+        unsafe { Self::from_cubin_unchecked(bytes, options) }
+    }
+
+    pub unsafe fn from_cubin_unchecked<T: AsRef<[u8]>>(
+        bytes: T,
+        options: &[ModuleJitOption],
+    ) -> CudaResult<Module> {
+        let bytes = bytes.as_ref();
+        let mut module = Module {
+            inner: ptr::null_mut(),
+        };
+        let (mut options, mut option_values) = ModuleJitOption::into_raw(options);
+        cuda::cuModuleLoadDataEx(
+            &mut module.inner as *mut cuda::CUmodule,
+            bytes.as_ptr() as *const c_void,
+            options.len() as c_uint,
+            options.as_mut_ptr(),
+            option_values.as_mut_ptr(),
+        )
+        .to_result()?;
+        Ok(module)
+    }
+
+    pub fn from_ptx_cstr(cstr: &CStr, options: &[ModuleJitOption]) -> CudaResult<Module> {
+        unsafe {
+            let mut module = Module {
+                inner: ptr::null_mut(),
+            };
+            let (mut options, mut option_values) = ModuleJitOption::into_raw(options);
+            cuda::cuModuleLoadDataEx(
+                &mut module.inner as *mut cuda::CUmodule,
+                cstr.as_ptr() as *const c_void,
+                options.len() as c_uint,
+                options.as_mut_ptr(),
+                option_values.as_mut_ptr(),
+            )
+            .to_result()?;
+            Ok(module)
+        }
+    }
+
+    pub fn from_ptx<T: AsRef<str>>(string: T, options: &[ModuleJitOption]) -> CudaResult<Module> {
+        let cstr = CString::new(string.as_ref())
+            .expect("string given to Module::from_str contained nul bytes");
+        Self::from_ptx_cstr(cstr.as_c_str(), options)
+    }
+
     /// Load a module from a normal (rust) string, implicitly making it into
     /// a cstring.
+    #[deprecated(
+        since = "0.3.0",
+        note = "from_str was too generic of a name, use from_ptx instead, passing an empty slice of options (usually)"
+    )]
     #[allow(clippy::should_implement_trait)]
     pub fn from_str<T: AsRef<str>>(string: T) -> CudaResult<Module> {
         let cstr = CString::new(string.as_ref())
             .expect("string given to Module::from_str contained nul bytes");
+        #[allow(deprecated)]
         Self::load_from_string(cstr.as_c_str())
     }
 
@@ -98,6 +301,12 @@ impl Module {
     /// # Ok(())
     /// # }
     /// ```
+    #[deprecated(
+        since = "0.3.0",
+        note = "load_from_string was an inconsistent name with inconsistent params, use from_ptx/from_ptx_cstr, passing 
+    an empty slice of options (usually)
+    "
+    )]
     pub fn load_from_string(image: &CStr) -> CudaResult<Module> {
         unsafe {
             let mut module = Module {
diff --git a/examples/cuda/cpu/add/src/main.rs b/examples/cuda/cpu/add/src/main.rs
@@ -22,7 +22,7 @@ fn main() -> Result<(), Box<dyn Error>> {
 
     // Make the CUDA module, modules just house the GPU code for the kernels we created.
     // they can be made from PTX code, cubins, or fatbins.
-    let module = Module::from_str(PTX)?;
+    let module = Module::from_ptx(PTX, &[])?;
 
     // make a CUDA stream to issue calls to. You can think of this as an OS thread but for dispatching
     // GPU calls.
diff --git a/examples/cuda/cpu/path_tracer/src/cuda/mod.rs b/examples/cuda/cpu/path_tracer/src/cuda/mod.rs
@@ -47,7 +47,7 @@ impl CudaRenderer {
 
         let optix_context = OptixContext::new(&context).unwrap();
 
-        let module = Module::from_str(PTX)?;
+        let module = Module::from_ptx(PTX, &[])?;
         let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
         let mut denoiser =
             Denoiser::new(&optix_context, DenoiserModelKind::Ldr, Default::default()).unwrap();

Original file line number	Diff line number	Diff line change
`@@ -49,6 +49,7 @@ pub unsafe fn fence_acqrel_system() {`
`49`	`49`	`asm!("fence.acq_rel.sys;");`
`50`	`50`	`}`
`51`	`51`
	`52`	`+#[allow(unused_macros)]`
`52`	`53`	`macro_rules! load_scope {`
`53`	`54`	`(volatile, $scope:ident) => {`
`54`	`55`	`""`