Fix: Check CUDA in szs_capabilities

ashvardanian · ashvardanian · commit 5deaa1b0b8dc · 2025-08-20T11:25:52.000Z
diff --git a/c/stringzillas.cuh b/c/stringzillas.cuh
@@ -755,12 +755,34 @@ SZ_DYNAMIC int szs_version_minor(void) { return STRINGZILLA_H_VERSION_MINOR; }
 SZ_DYNAMIC int szs_version_patch(void) { return STRINGZILLA_H_VERSION_PATCH; }
 
+/**
+ *  @brief Reports the CPU capabilities combined with those of the first CUDA device.
+ *
+ *  The mask is computed on the first call and cached in a function-local static. CPU capabilities
+ *  are reported even when the GPU specs can't be fetched or when CUDA support is compiled out.
+ *
+ *  @return Union of the `sz_capabilities_implementation_()` flags and the detected GPU flags.
+ */
 SZ_DYNAMIC sz_capability_t szs_capabilities(void) {
-    sz_capability_t cpu_capabilities = sz_capabilities_implementation_();
+    // Preserve the static capabilities
+    static sz_capability_t static_caps = static_cast<sz_capability_t>(0);
+    if (static_caps == 0) {
+        sz_capability_t cpu_caps = sz_capabilities_implementation_();
+        sz_capability_t gpu_caps = static_cast<sz_capability_t>(0);
 #if SZ_USE_CUDA
-    return static_cast<sz_capability_t>(cpu_capabilities | sz_caps_ckh_k);
-#else
-    return cpu_capabilities;
+        sz::gpu_specs_t first_gpu_specs;
+        auto specs_status = static_cast<sz::status_t>(szs::gpu_specs_fetch(first_gpu_specs));
+        // A missing or failing GPU must not hide the CPU capabilities, so only the GPU flags are skipped
+        if (specs_status == sz::status_t::success_k) {
+            gpu_caps = static_cast<sz_capability_t>(gpu_caps | sz_cap_cuda_k);
+            if (first_gpu_specs.sm_code >= 30) gpu_caps = static_cast<sz_capability_t>(gpu_caps | sz_cap_kepler_k);
+            if (first_gpu_specs.sm_code >= 90) gpu_caps = static_cast<sz_capability_t>(gpu_caps | sz_cap_hopper_k);
+        }
 #endif // SZ_USE_CUDA
+        // Cache outside of the CUDA-only section, so CPU-only builds also get their capabilities
+        static_caps = static_cast<sz_capability_t>(cpu_caps | gpu_caps);
+    }
+
+    return static_caps;
 }
 
 SZ_DYNAMIC sz_status_t sz_memory_allocator_init_unified(sz_memory_allocator_t *alloc) {
diff --git a/include/stringzilla/types.hpp b/include/stringzilla/types.hpp
@@ -741,6 +741,22 @@ struct is_same_type {
     static constexpr bool value = false;
 };
 
+struct cpu_specs_t { // ? CPU cache and topology description; the defaults below are typical values
+    size_t l1_bytes = 32 * 1024;       // ? typically around 32 KB
+    size_t l2_bytes = 256 * 1024;      // ? typically around 256 KB
+    size_t l3_bytes = 8 * 1024 * 1024; // ? typically around 8 MB
+    size_t cache_line_width = 64;      // ? 64 bytes on x86, sometimes 128 on ARM
+    size_t cores_per_socket = 1;       // ? at least 1 core
+    size_t sockets = 1;                // ? at least 1 socket
+
+    size_t cores_total() const noexcept { return cores_per_socket * sockets; } // ? cores across all sockets
+};
+
+/**
+ *  @brief Specifications of a typical NVIDIA GPU, such as A100 or H100.
+ *  @sa pack_sm_code, cores_per_multiprocessor helpers.
+ *  @note We recommend compiling the code for the 90a compute capability, the newest with specialized optimizations.
+ */
 struct gpu_specs_t {
     size_t vram_bytes = 40ul * 1024 * 1024 * 1024; // ? On A100 it's 40 GB
     size_t constant_memory_bytes = 64 * 1024;      // ? On A100 it's 64 KB
@@ -749,43 +765,83 @@ struct gpu_specs_t {
     size_t cuda_cores = 6912;                      // ? On A100 for f32/i32 logic
     size_t reserved_memory_per_block = 1024;       // ? Typically, 1 KB per block is reserved for bookkeeping
     size_t warp_size = 32;                         // ? Warp size is 32 threads on practically all GPUs
-    size_t max_blocks_per_multiprocessor = 0;
+    size_t max_blocks_per_multiprocessor = 0;      // ? Maximum number of blocks per SM
+    size_t sm_code = 0;                            // ? Packed compute capability, e.g. 90 for Hopper (H100)
 
     inline size_t shared_memory_per_multiprocessor() const noexcept {
         return shared_memory_bytes / streaming_multiprocessors;
     }
 
-    inline static size_t cores_per_multiprocessor(int major, int minor) noexcept {
+    /**
+     *  @brief Converts a compute capability (major, minor) to a single numeric code: `major * 10 + minor`.
+     *
+     *  - 7.0, 7.2 is Volta, like V100                  - maps to 70, 72
+     *  - 7.5 is Turing, like RTX 2080 Ti               - maps to 75
+     *  - 8.0, 8.6, 8.7 is Ampere, like A100, RTX 3090  - maps to 80, 86, 87
+     *  - 8.9 is Ada Lovelace, like RTX 4090            - maps to 89
+     *  - 9.0 is Hopper, like H100                      - maps to 90
+     *  - 10.0, 12.0 is Blackwell, like B200, RTX 5090  - maps to 100, 120
+     */
+    inline static size_t pack_sm_code(int major, int minor) noexcept { return ((major * 10) + minor); }
+
+    /**
+     *  @brief Looks up the number of CUDA cores per multiprocessor for a given compute capability code.
+     *  @param[in] sm The compute capability code obtained from `pack_sm_code(major, minor)`.
+     *  @return Cores per multiprocessor; unlisted codes get the last table entry. Used for `cuda_cores`.
+     */
+    inline static size_t cores_per_multiprocessor(size_t sm) noexcept {
         typedef struct {
             size_t sm;
             size_t cores;
         } generation_to_core_count;
         generation_to_core_count generations_to_core_counts[] = {
-            {(7 << 4) + 0, 64},  // Compute Capability 7.0 (V100)
-            {(7 << 4) + 5, 64},  // Compute Capability 7.5 (RTX 2080 Ti)
-            {(8 << 4) + 0, 64},  // Compute Capability 8.0 (A100)
-            {(8 << 4) + 6, 128}, // Compute Capability 8.6 (RTX 3090)
-            {(9 << 4) + 0, 128}, // Compute Capability 9.0 (H100)
+            // Kepler architecture (2012-2014)
+            {pack_sm_code(3, 0), 192}, // Capability 3.0 (GK104 - GTX 680, GTX 770)
+            {pack_sm_code(3, 5), 192}, // Capability 3.5 (GK110 - GTX 780 Ti, GTX Titan, Tesla K20/K40)
+            {pack_sm_code(3, 7), 192}, // Capability 3.7 (GK210 - Tesla K80)
+
+            // Maxwell architecture (2014-2016)
+            {pack_sm_code(5, 0), 128}, // Capability 5.0 (GM107/GM108 - GTX 750/750 Ti, GTX 850M/860M)
+            {pack_sm_code(5, 2), 128}, // Capability 5.2 (GM200/GM204/GM206 - GTX 980/970, Titan X)
+            {pack_sm_code(5, 3), 128}, // Capability 5.3 (GM20B - Jetson TX1, Tegra X1)
+
+            // Pascal architecture (2016-2018)
+            {pack_sm_code(6, 0), 64},  // Capability 6.0 (GP100 - Tesla P100) - HPC focused, different SM design
+            {pack_sm_code(6, 1), 128}, // Capability 6.1 (GP102/GP104/GP106/GP107 - GTX 1080/1070/1060/1050, Titan X/Xp)
+            {pack_sm_code(6, 2), 128}, // Capability 6.2 (GP10B - Jetson TX2, Tegra X2)
+
+            // Volta architecture (2017-2018)
+            {pack_sm_code(7, 0), 64}, // Capability 7.0 (GV100 - Tesla V100, Titan V) - Tensor Core architecture
+            {pack_sm_code(7, 2), 64}, // Capability 7.2 (GV11B - Jetson AGX Xavier, Tegra Xavier)
+
+            // Turing architecture (2018-2020)
+            {pack_sm_code(7, 5), 64}, // Capability 7.5 (TU102/TU104/TU106/TU116/TU117 - RTX 20xx, GTX 16xx)
+
+            // Ampere architecture (2020-2022)
+            {pack_sm_code(8, 0), 64},  // Capability 8.0 (GA100 - A100) - HPC focused
+            {pack_sm_code(8, 6), 128}, // Capability 8.6 (GA102/GA104/GA106/GA107 - RTX 3090/3080/3070/3060)
+            {pack_sm_code(8, 7), 128}, // Capability 8.7 (GA10B - Jetson AGX Orin, Tegra Orin)
+
+            // Ada Lovelace architecture (2022-2023)
+            {pack_sm_code(8, 9), 128}, // Capability 8.9 (AD102/AD103/AD104/AD106/AD107 - RTX 40xx)
+
+            // Hopper architecture (2022-2024)
+            {pack_sm_code(9, 0), 128}, // Capability 9.0 (GH100 - H100, H200)
+
+            // Blackwell architecture (2024+) - datacenter 10.x parts (B200) are not listed and use the fallback
+            {pack_sm_code(12, 0), 128}, // Capability 12.0 (GB202/GB203/GB205/GB206 - RTX 50xx)
+            {pack_sm_code(12, 1), 128}, // Capability 12.1 (GB10 - DGX Spark)
+
             {0, 0}};
 
-        // Create a numeric code: for SM 3.5, SM = (3 << 4 + 5) = 0x35.
-        size_t sm = ((major << 4) + minor);
         size_t index = 0;
         for (; generations_to_core_counts[index].sm != 0; ++index)
             if (generations_to_core_counts[index].sm == sm) return generations_to_core_counts[index].cores;
-        return generations_to_core_counts[index - 1].cores;
-    }
-};
 
-struct cpu_specs_t {
-    size_t l1_bytes = 32 * 1024;       // ? typically around 32 KB
-    size_t l2_bytes = 256 * 1024;      // ? typically around 256 KB
-    size_t l3_bytes = 8 * 1024 * 1024; // ? typically around 8 MB
-    size_t cache_line_width = 64;      // ? 64 bytes on x86, sometimes 128 on ARM
-    size_t cores_per_socket = 1;       // ? at least 1 core
-    size_t sockets = 1;                // ? at least 1 socket
-
-    size_t cores_total() const noexcept { return cores_per_socket * sockets; }
+        // No exact match: fall back to the last table entry (the newest known architecture), which
+        // keeps unlisted and future devices working; the literal 128 only guards an empty table.
+        return (index > 0) ? generations_to_core_counts[index - 1].cores : 128;
+    }
 };
 
 /**
diff --git a/include/stringzillas/types.cuh b/include/stringzillas/types.cuh
@@ -114,7 +114,8 @@ inline cuda_status_t gpu_specs_fetch(gpu_specs_t &specs, int device_id = 0) noex
 
     // Infer other global settings, that CUDA doesn't expose directly
     specs.shared_memory_bytes = prop.sharedMemPerMultiprocessor * prop.multiProcessorCount;
-    specs.cuda_cores = gpu_specs_t::cores_per_multiprocessor(prop.major, prop.minor) * specs.streaming_multiprocessors;
+    specs.sm_code = gpu_specs_t::pack_sm_code(prop.major, prop.minor);
+    specs.cuda_cores = gpu_specs_t::cores_per_multiprocessor(specs.sm_code) * specs.streaming_multiprocessors;
 
     // Scheduling-related constants
     specs.max_blocks_per_multiprocessor = prop.maxBlocksPerMultiProcessor;