Skip to content

Commit 5deaa1b

Browse files
committed
Fix: Check CUDA in szs_capabilities
1 parent 446e14e commit 5deaa1b

File tree

3 files changed

+94
-26
lines changed

3 files changed

+94
-26
lines changed

c/stringzillas.cuh

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -755,12 +755,23 @@ SZ_DYNAMIC int szs_version_minor(void) { return STRINGZILLA_H_VERSION_MINOR; }
755755
SZ_DYNAMIC int szs_version_patch(void) { return STRINGZILLA_H_VERSION_PATCH; }
756756

757757
SZ_DYNAMIC sz_capability_t szs_capabilities(void) {
    // Cache the detected capabilities: neither the CPU features nor the set of
    // visible GPUs changes during the lifetime of the process.
    static sz_capability_t static_caps = static_cast<sz_capability_t>(0);
    if (static_caps == 0) {
        sz_capability_t cpu_caps = sz_capabilities_implementation_();
        sz_capability_t gpu_caps = static_cast<sz_capability_t>(0);
#if SZ_USE_CUDA
        // Probe the first GPU. On failure we still report the CPU capabilities
        // below, instead of returning an empty capability set.
        sz::gpu_specs_t first_gpu_specs;
        auto specs_status = static_cast<sz::status_t>(szs::gpu_specs_fetch(first_gpu_specs));
        if (specs_status == sz::status_t::success_k) {
            gpu_caps = static_cast<sz_capability_t>(gpu_caps | sz_cap_cuda_k);
            // `sm_code` packs (major, minor) as `major * 10 + minor`, so 30 is
            // SM 3.0 (Kepler) and 90 is SM 9.0 (Hopper).
            if (first_gpu_specs.sm_code >= 30) gpu_caps = static_cast<sz_capability_t>(gpu_caps | sz_cap_kepler_k);
            if (first_gpu_specs.sm_code >= 90) gpu_caps = static_cast<sz_capability_t>(gpu_caps | sz_cap_hopper_k);
        }
#endif // SZ_USE_CUDA
        // Fold the CPU capabilities in unconditionally. Previously this assignment
        // lived inside the `#if SZ_USE_CUDA` branch, so CPU-only builds (and failed
        // GPU probes) cached nothing and reported zero capabilities.
        static_caps = static_cast<sz_capability_t>(cpu_caps | gpu_caps);
    }
    return static_caps;
}
765776

766777
SZ_DYNAMIC sz_status_t sz_memory_allocator_init_unified(sz_memory_allocator_t *alloc) {

include/stringzilla/types.hpp

Lines changed: 77 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,22 @@ struct is_same_type {
741741
static constexpr bool value = false;
742742
};
743743

744+
/**
 *  @brief  Specifications of the host CPU: cache sizes and core topology.
 *  @note   Defaults are conservative, commonly-seen values — override them with detected numbers.
 */
struct cpu_specs_t {
    size_t l1_bytes = 32 * 1024;       // ? typically around 32 KB
    size_t l2_bytes = 256 * 1024;      // ? typically around 256 KB
    size_t l3_bytes = 8 * 1024 * 1024; // ? typically around 8 MB
    size_t cache_line_width = 64;      // ? 64 bytes on x86, sometimes 128 on ARM
    size_t cores_per_socket = 1;       // ? at least 1 core
    size_t sockets = 1;                // ? at least 1 socket

    /** @brief Total core count across all sockets. */
    size_t cores_total() const noexcept {
        size_t const total_cores = cores_per_socket * sockets;
        return total_cores;
    }
};
754+
755+
/**
756+
* @brief Specifications of a typical NVIDIA GPU, such as A100 or H100.
757+
* @sa pack_sm_code, cores_per_multiprocessor helpers.
758+
* @note We recommend compiling the code for the 90a compute capability, the newest with specialized optimizations.
759+
*/
744760
struct gpu_specs_t {
745761
size_t vram_bytes = 40ul * 1024 * 1024 * 1024; // ? On A100 it's 40 GB
746762
size_t constant_memory_bytes = 64 * 1024; // ? On A100 it's 64 KB
@@ -749,43 +765,83 @@ struct gpu_specs_t {
749765
size_t cuda_cores = 6912; // ? On A100 for f32/i32 logic
750766
size_t reserved_memory_per_block = 1024; // ? Typically, 1 KB per block is reserved for bookkeeping
751767
size_t warp_size = 32; // ? Warp size is 32 threads on practically all GPUs
752-
size_t max_blocks_per_multiprocessor = 0;
768+
size_t max_blocks_per_multiprocessor = 0; // ? Maximum number of blocks per SM
769+
size_t sm_code = 0; // ? Compute capability code, e.g. 90a for Hopper (H100)
753770

754771
/** @brief Average shared-memory budget per SM, derived from the device-wide totals. */
inline size_t shared_memory_per_multiprocessor() const noexcept {
    size_t const per_sm_bytes = shared_memory_bytes / streaming_multiprocessors;
    return per_sm_bytes;
}
757774

758-
inline static size_t cores_per_multiprocessor(int major, int minor) noexcept {
775+
/**
 *  @brief  Packs a CUDA compute capability (major, minor) into one numeric code: `major * 10 + minor`.
 *
 *  Examples:
 *  - 7.0, 7.2 (Volta, e.g. V100) map to 70, 72
 *  - 7.5 (Turing, e.g. RTX 2080 Ti) maps to 75
 *  - 8.0, 8.6, 8.7 (Ampere, e.g. A100, RTX 3090) map to 80, 86, 87
 *  - 8.9 (Ada Lovelace, e.g. RTX 4090) maps to 89
 *  - 9.0 (Hopper, e.g. H100) maps to 90
 *  - 12.0, 12.1 (Blackwell) map to 120, 121
 */
inline static size_t pack_sm_code(int major, int minor) noexcept {
    return static_cast<size_t>(major) * 10 + static_cast<size_t>(minor);
}
786+
787+
/**
 *  @brief  Looks up the CUDA-core count per Streaming Multiprocessor for a compute capability.
 *  @param[in] sm Packed capability code, as produced by `pack_sm_code(major, minor)`.
 *  @return Cores per SM for known architectures; for an unknown (presumably newer) code,
 *          the count of the last table entry as a forward-compatibility fallback.
 *  @sa Used to populate the `cuda_cores` property.
 */
inline static size_t cores_per_multiprocessor(size_t sm) noexcept {
    struct sm_cores_t {
        size_t sm;    // pre-packed capability code, `major * 10 + minor`
        size_t cores; // CUDA cores per SM
    };
    // Table of known architectures, ordered oldest to newest, terminated by a {0, 0} sentinel.
    sm_cores_t const known_architectures[] = {
        // Kepler (2012-2014)
        {30, 192},  // 3.0 (GK104 - GTX 680, GTX 770)
        {35, 192},  // 3.5 (GK110 - GTX 780 Ti, GTX Titan, Tesla K20/K40)
        {37, 192},  // 3.7 (GK210 - Tesla K80)
        // Maxwell (2014-2016)
        {50, 128},  // 5.0 (GM107/GM108 - GTX 750/750 Ti, GTX 850M/860M)
        {52, 128},  // 5.2 (GM200/GM204/GM206 - GTX 980/970, Titan X)
        {53, 128},  // 5.3 (GM20B - Jetson TX1, Tegra X1)
        // Pascal (2016-2018)
        {60, 64},   // 6.0 (GP100 - Tesla P100) - HPC-focused, different SM design
        {61, 128},  // 6.1 (GP102/GP104/GP106/GP107 - GTX 10xx, Titan X/Xp)
        {62, 128},  // 6.2 (GP10B - Jetson TX2, Tegra X2)
        // Volta (2017-2018)
        {70, 64},   // 7.0 (GV100 - Tesla V100, Titan V) - Tensor Core architecture
        {72, 64},   // 7.2 (GV11B - Jetson AGX Xavier, Tegra Xavier)
        // Turing (2018-2020)
        {75, 64},   // 7.5 (TU102/TU104/TU106/TU116/TU117 - RTX 20xx, GTX 16xx)
        // Ampere (2020-2022)
        {80, 64},   // 8.0 (GA100 - A100) - HPC focused
        {86, 128},  // 8.6 (GA102/GA104/GA106/GA107 - RTX 3090/3080/3070/3060)
        {87, 128},  // 8.7 (GA10B - Jetson AGX Orin, Tegra Orin)
        // Ada Lovelace (2022-2023)
        {89, 128},  // 8.9 (AD102/AD103/AD104/AD106/AD107 - RTX 40xx)
        // Hopper (2022-2024)
        {90, 128},  // 9.0 (GH100 - H100, H200)
        // Blackwell (2024+)
        {120, 128}, // 12.0
        {121, 128}, // 12.1
        {0, 0},     // sentinel
    };

    size_t last_known = 0;
    for (size_t i = 0; known_architectures[i].sm != 0; ++i) {
        if (known_architectures[i].sm == sm) return known_architectures[i].cores;
        last_known = i;
    }
    // No exact match: assume the newest known architecture's core count,
    // which gives reasonable forward compatibility for future GPUs.
    return known_architectures[last_known].cores;
}
789845
};
790846

791847
/**

include/stringzillas/types.cuh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,8 @@ inline cuda_status_t gpu_specs_fetch(gpu_specs_t &specs, int device_id = 0) noex
114114

115115
// Infer other global settings, that CUDA doesn't expose directly
116116
specs.shared_memory_bytes = prop.sharedMemPerMultiprocessor * prop.multiProcessorCount;
117-
specs.cuda_cores = gpu_specs_t::cores_per_multiprocessor(prop.major, prop.minor) * specs.streaming_multiprocessors;
117+
specs.sm_code = gpu_specs_t::pack_sm_code(prop.major, prop.minor);
118+
specs.cuda_cores = gpu_specs_t::cores_per_multiprocessor(specs.sm_code) * specs.streaming_multiprocessors;
118119

119120
// Scheduling-related constants
120121
specs.max_blocks_per_multiprocessor = prop.maxBlocksPerMultiProcessor;

0 commit comments

Comments
 (0)