Add: gpu_specs and cuda_status_t

ashvardanian · ashvardanian · commit c36d2b8fc363 · 2025-04-09T08:08:51.000Z
diff --git a/include/stringcuzilla/types.cuh b/include/stringcuzilla/types.cuh
@@ -11,10 +11,11 @@
 #ifndef STRINGZILLA_TYPES_CUH_
 #define STRINGZILLA_TYPES_CUH_
 
-#include <cuda_runtime.h> // `cudaMallocManaged`, `cudaFree`, `cudaSuccess`, `cudaGetErrorString`
-
 #include "stringzilla/types.hpp"
 
+#include <cuda_runtime.h> // `cudaMallocManaged`, `cudaFree`, `cudaSuccess`, `cudaGetErrorString`
+#include <optional>       // `std::optional`
+
 #if !defined(SZ_USE_HOPPER)
 #if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ < 11)
 #define SZ_USE_HOPPER (1)
@@ -78,6 +79,35 @@ struct unified_alloc {
     }
 };
 
+/**
+ *  @brief Reads the properties of CUDA @p device into `gpu_specs_t`; yields `std::nullopt` if the query fails.
+ */
+inline std::optional<gpu_specs_t> gpu_specs(int device = 0) noexcept {
+    gpu_specs_t out;
+    cudaDeviceProp props;
+    if (cudaGetDeviceProperties(&props, device) != cudaSuccess) return std::nullopt; // ! Query failed
+
+    // Fields copied straight from the reported device properties
+    out.streaming_multiprocessors = props.multiProcessorCount;
+    out.constant_memory_bytes = props.totalConstMem;
+    out.vram_bytes = props.totalGlobalMem;
+    out.max_blocks_per_multiprocessor = props.maxBlocksPerMultiProcessor;
+    out.reserved_memory_per_block = props.reservedSharedMemPerBlock;
+
+    // Device-wide totals, scaled up from per-multiprocessor figures by the multiprocessor count
+    out.shared_memory_bytes = props.sharedMemPerMultiprocessor * props.multiProcessorCount;
+    out.cuda_cores = gpu_specs_t::cores_per_multiprocessor(props.major, props.minor) * out.streaming_multiprocessors;
+    return out;
+}
+
+/** @brief Bundles a library `status_t` with the raw CUDA error code and an elapsed time in milliseconds. */
+struct cuda_status_t {
+    status_t status {status_t::success_k}; // Library-level outcome; this is what the implicit conversion returns
+    cudaError_t cuda_error {cudaSuccess};  // CUDA runtime error code carried alongside `status`
+    float elapsed_milliseconds {0.0f};     // NOTE(review): presumably the timed duration of the GPU work - confirm
+    operator status_t() const noexcept { return status; }
+};
+
 } // namespace stringzilla
 } // namespace ashvardanian
 
diff --git a/scripts/test_stringcuzilla.cuh b/scripts/test_stringcuzilla.cuh
@@ -21,30 +21,6 @@ namespace ashvardanian {
 namespace stringzilla {
 namespace scripts {
 
-inline gpu_specs_t gpu_specs(int device = 0) noexcept(false) {
-    gpu_specs_t specs;
-#if SZ_USE_CUDA
-    cudaDeviceProp prop;
-    cudaError_t cuda_error = cudaGetDeviceProperties(&prop, device);
-    if (cuda_error != cudaSuccess)
-        throw std::runtime_error(std::string("Error retrieving device properties: ") + cudaGetErrorString(cuda_error));
-
-    // Set the GPU specs
-    specs.streaming_multiprocessors = prop.multiProcessorCount;
-    specs.constant_memory_bytes = prop.totalConstMem;
-    specs.vram_bytes = prop.totalGlobalMem;
-
-    // Infer other global settings, that CUDA doesn't expose directly
-    specs.shared_memory_bytes = prop.sharedMemPerMultiprocessor * prop.multiProcessorCount;
-    specs.cuda_cores = gpu_specs_t::cores_per_multiprocessor(prop.major, prop.minor) * specs.streaming_multiprocessors;
-
-    // Scheduling-related constants
-    specs.max_blocks_per_multiprocessor = prop.maxBlocksPerMultiProcessor;
-    specs.reserved_memory_per_block = prop.reservedSharedMemPerBlock;
-#endif
-    return specs;
-}
-
 int log_environment() {
     std::printf("- Uses Haswell: %s \n", SZ_USE_HASWELL ? "yes" : "no");
     std::printf("- Uses Skylake: %s \n", SZ_USE_SKYLAKE ? "yes" : "no");
@@ -568,7 +544,7 @@ void test_similarity_scores_memory_usage() {
         {.batch_size = 10, .min_string_length = 1, .max_string_length = 131072, .iterations = 1},
     };
 
-    gpu_specs_t first_gpu_specs = gpu_specs();
+    gpu_specs_t first_gpu_specs = *gpu_specs();
 
     // Progress until something fails
     for (fuzzy_config_t const &experiment : experiments) {