Feat: finalize module changes

RDambrosio016 · RDambrosio016 · commit 3dd86d9a8a5d · 2022-01-22T00:22:54.000-05:00
diff --git a/crates/cust/CHANGELOG.md b/crates/cust/CHANGELOG.md
@@ -74,9 +74,9 @@ Instead you can now use `DeviceSlice::index` which behaves the same.
 - Added `ModuleJitOption`, `JitFallback`, `JitTarget`, and `OptLevel` for specifying options when loading a module. Note that
 `ModuleJitOption::MaxRegisters` does not seem to work currently, but NVIDIA is looking into it.
 You can achieve the same goal by compiling the ptx to cubin using nvcc then loading that: `nvcc --cubin foo.ptx -maxrregcount=REGS`
-- Added `Module::from_fatbin` and `Module::from_fatbin_unchecked`.
-- Added `Module::from_cubin` and `Module::from_cubin_unchecked`.
-- Added `Module::from_ptr` and `Module::from_ptx_cstr`.
+- Added `Module::from_fatbin`.
+- Added `Module::from_cubin`.
+- Added `Module::from_ptx` and `Module::from_ptx_cstr`.
 - `Stream`, `Module`, `Linker`, `Function`, `Event`, `UnifiedBox`, `ArrayObject`, `LockedBuffer`, `LockedBox`, `DeviceSlice`, `DeviceBuffer`, and `DeviceBox` all now impl `Send` and `Sync`, this makes
 it much easier to write multigpu code. The CUDA API is fully thread-safe except for graph objects.
 
@@ -98,6 +98,7 @@ it much easier to write multigpu code. The CUDA API is fully thread-safe except
 - `DeviceSlice::as_ptr` and `DeviceSlice::as_ptr_mut` now both return a `DevicePointer<T>`.
 - `DeviceSlice` is now `Clone` and `Copy`.
 - `DevicePointer::as_raw` now returns a `CUdeviceptr`, not a `*const T` (use `DevicePointer::as_ptr`).
+- Fixed typo in `CudaError`, `InvalidSouce` is now `InvalidSource`, no more invalid sauce 🍅🥣
 
 ## 0.2.2 - 12/5/21
 
diff --git a/crates/cust/resources/add.cubin b/crates/cust/resources/add.cubin
diff --git a/crates/cust/resources/add.fatbin b/crates/cust/resources/add.fatbin
diff --git a/crates/cust/src/error.rs b/crates/cust/src/error.rs
@@ -52,7 +52,7 @@ pub enum CudaError {
     InvalidPtx = 218,
     InvalidGraphicsContext = 219,
     NvlinkUncorrectable = 220,
-    InvalidSouce = 300,
+    InvalidSource = 300,
     FileNotFound = 301,
     SharedObjectSymbolNotFound = 302,
     SharedObjectInitFailed = 303,
@@ -165,7 +165,7 @@ impl ToResult for cudaError_enum {
                 Err(CudaError::InvalidGraphicsContext)
             }
             cudaError_enum::CUDA_ERROR_NVLINK_UNCORRECTABLE => Err(CudaError::NvlinkUncorrectable),
-            cudaError_enum::CUDA_ERROR_INVALID_SOURCE => Err(CudaError::InvalidSouce),
+            cudaError_enum::CUDA_ERROR_INVALID_SOURCE => Err(CudaError::InvalidSource),
             cudaError_enum::CUDA_ERROR_FILE_NOT_FOUND => Err(CudaError::FileNotFound),
             cudaError_enum::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND => {
                 Err(CudaError::SharedObjectSymbolNotFound)
diff --git a/crates/cust/src/lib.rs b/crates/cust/src/lib.rs
@@ -75,7 +75,6 @@ mod surface;
 mod texture;
 pub mod util;
 
-pub use cust_core;
 pub use cust_derive::DeviceCopy;
 pub use cust_raw as sys;
 
diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs
@@ -178,91 +178,111 @@ impl Module {
         }
     }
 
-    // TODO(RDambrosio016): figure out why the heck cuda rejects cubins literally made by nvcc and loaded by fs::read
-
-    // /// Creates a new module by loading a fatbin (fat binary) file.
-    // ///
-    // /// Fatbinary files are files that contain multiple ptx or cubin files. The driver will choose already-built
-    // /// cubin if it is present, and otherwise JIT compile any PTX in the file to cubin.
-    // ///
-    // /// # Example
-    // ///
-    // /// ```
-    // /// # use cust::*;
-    // /// # use std::error::Error;
-    // /// # fn main() -> Result<(), Box<dyn Error>> {
-    // /// # let _ctx = quick_init()?;
-    // /// use cust::module::Module;
-    // /// let fatbin_bytes = std::fs::read("./resources/add.cubin")?;
-    // /// assert!(fatbin_bytes.contains(&0));
-    // /// let module = Module::from_cubin(&fatbin_bytes, &[])?;
-    // /// # Ok(())
-    // /// # }
-    // /// ```
-    // pub fn from_fatbin<T: AsRef<[u8]>>(
-    //     bytes: T,
-    //     options: &[ModuleJitOption],
-    // ) -> CudaResult<Module> {
-    //     let mut bytes = bytes.as_ref().to_vec();
-    //     bytes.push(0);
-    //     // fatbins are just ELF files like cubins, and cuModuleLoadDataEx accepts ptx, cubin, and fatbin.
-    //     // We just make the distinction in case we want to do anything extra in the future. As well
-    //     // as keep things explicit to anyone reading the code.
-    //     Self::from_cubin(bytes, options)
-    // }
-
-    // pub unsafe fn from_fatbin_unchecked<T: AsRef<[u8]>>(
-    //     bytes: T,
-    //     options: &[ModuleJitOption],
-    // ) -> CudaResult<Module> {
-    //     Self::from_cubin_unchecked(bytes, options)
-    // }
+    /// Creates a new module by loading a fatbin (fat binary) file.
+    ///
+    /// Fatbinary files are files that contain multiple ptx or cubin files. The driver will choose already-built
+    /// cubin if it is present, and otherwise JIT compile any PTX in the file to cubin.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use cust::*;
+    /// # use std::error::Error;
+    /// # fn main() -> Result<(), Box<dyn Error>> {
+    /// # let _ctx = quick_init()?;
+    /// use cust::module::Module;
+    /// let fatbin_bytes = std::fs::read("./resources/add.fatbin")?;
+    /// // will return InvalidSource if the fatbin does not contain any compatible code, meaning, either
+    /// // cubin compiled for the same device architecture OR PTX that can be JITted into valid code.
+    /// let module = Module::from_fatbin(&fatbin_bytes, &[])?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn from_fatbin<T: AsRef<[u8]>>(
+        bytes: T,
+        options: &[ModuleJitOption],
+    ) -> CudaResult<Module> {
+        // fatbins can be loaded just like cubins, we just use different methods so it's explicit.
+        // please don't use from_cubin for fatbins, that is pure chaos and ferris will come to your house
+        Self::from_cubin(bytes, options)
+    }
 
-    // pub fn from_cubin<T: AsRef<[u8]>>(bytes: T, options: &[ModuleJitOption]) -> CudaResult<Module> {
-    //     let bytes = bytes.as_ref();
-    //     goblin::elf::Elf::parse(bytes).expect("Cubin/Fatbin was not valid ELF!");
-    //     // SAFETY: we verified the bytes were valid ELF
-    //     unsafe { Self::from_cubin_unchecked(bytes, options) }
-    // }
+    /// Creates a new module by loading a cubin (CUDA Binary) file.
+    ///
+    /// Cubins are architecture/compute-capability specific files generated as the final step of the CUDA compilation
+    /// process. They cannot be interchanged across compute capabilities unlike PTX (to some degree). You can create one
+    /// using the PTX compiler APIs, the cust [`Linker`](crate::link::Linker), or nvcc (`nvcc a.ptx --cubin -arch=sm_XX`).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use cust::*;
+    /// # use std::error::Error;
+    /// # fn main() -> Result<(), Box<dyn Error>> {
+    /// # let _ctx = quick_init()?;
+    /// use cust::module::Module;
+    /// let cubin_bytes = std::fs::read("./resources/add.cubin")?;
+    /// // will return InvalidSource if the cubin arch doesn't match the context's device arch!
+    /// let module = Module::from_cubin(&cubin_bytes, &[])?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn from_cubin<T: AsRef<[u8]>>(bytes: T, options: &[ModuleJitOption]) -> CudaResult<Module> {
+        // it is very unclear whether cuda wants or doesn't want a null terminator. The method works
+        // whether you have one or not. So for safety we just add one. In theory you can figure out the
+        // length of an ELF image without a null terminator. But the docs are confusing, so we add one just
+        // to be sure.
+        let mut bytes = bytes.as_ref().to_vec();
+        bytes.push(0);
+        // SAFETY: the image is known to be dereferenceable
+        unsafe { Self::load_module(bytes.as_ptr() as *const c_void, options) }
+    }
 
-    // pub unsafe fn from_cubin_unchecked<T: AsRef<[u8]>>(
-    //     bytes: T,
-    //     options: &[ModuleJitOption],
-    // ) -> CudaResult<Module> {
-    //     let bytes = bytes.as_ref();
-    //     let mut module = Module {
-    //         inner: ptr::null_mut(),
-    //     };
-    //     let (mut options, mut option_values) = ModuleJitOption::into_raw(options);
-    //     cuda::cuModuleLoadDataEx(
-    //         &mut module.inner as *mut cuda::CUmodule,
-    //         bytes.as_ptr() as *const c_void,
-    //         options.len() as c_uint,
-    //         options.as_mut_ptr(),
-    //         option_values.as_mut_ptr(),
-    //     )
-    //     .to_result()?;
-    //     Ok(module)
-    // }
+    unsafe fn load_module(image: *const c_void, options: &[ModuleJitOption]) -> CudaResult<Module> {
+        let mut module = Module {
+            inner: ptr::null_mut(),
+        };
+        let (mut options, mut option_values) = ModuleJitOption::into_raw(options);
+        cuda::cuModuleLoadDataEx(
+            &mut module.inner as *mut cuda::CUmodule,
+            image,
+            options.len() as c_uint,
+            options.as_mut_ptr(),
+            option_values.as_mut_ptr(),
+        )
+        .to_result()?;
+        Ok(module)
+    }
 
+    /// Creates a new module from a [`CStr`] pointing to PTX code.
+    ///
+    /// The driver will JIT the PTX into arch-specific cubin or pick already-cached cubin if available.
     pub fn from_ptx_cstr(cstr: &CStr, options: &[ModuleJitOption]) -> CudaResult<Module> {
-        unsafe {
-            let mut module = Module {
-                inner: ptr::null_mut(),
-            };
-            let (mut options, mut option_values) = ModuleJitOption::into_raw(options);
-            cuda::cuModuleLoadDataEx(
-                &mut module.inner as *mut cuda::CUmodule,
-                cstr.as_ptr() as *const c_void,
-                options.len() as c_uint,
-                options.as_mut_ptr(),
-                option_values.as_mut_ptr(),
-            )
-            .to_result()?;
-            Ok(module)
-        }
+        // SAFETY: the image is known to be dereferenceable
+        unsafe { Self::load_module(cstr.as_ptr() as *const c_void, options) }
     }
 
+    /// Creates a new module from a PTX string, allocating an intermediate buffer for the [`CString`].
+    ///
+    /// The driver will JIT the PTX into arch-specific cubin or pick already-cached cubin if available.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `string` contains a nul byte.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use cust::*;
+    /// # use std::error::Error;
+    /// # fn main() -> Result<(), Box<dyn Error>> {
+    /// # let _ctx = quick_init()?;
+    /// use cust::module::Module;
+    /// let ptx = std::fs::read_to_string("./resources/add.ptx")?;
+    /// let module = Module::from_ptx(&ptx, &[])?;
+    /// # Ok(())
+    /// # }
+    /// ```
     pub fn from_ptx<T: AsRef<str>>(string: T, options: &[ModuleJitOption]) -> CudaResult<Module> {
         let cstr = CString::new(string.as_ref())
             .expect("string given to Module::from_str contained nul bytes");