@@ -741,6 +741,22 @@ struct is_same_type {
741741 static constexpr bool value = false ;
742742};
743743
/**
 *  @brief Cache sizes and core topology of the host CPU, pre-filled with conservative defaults.
 *
 *  Every field can be overwritten after construction; the defaults only describe
 *  a "typical" machine and are not detected from the hardware by this struct.
 */
struct cpu_specs_t {
    size_t l1_bytes = 32 * 1024;        // ? L1 cache capacity, typically around 32 KB
    size_t l2_bytes = 256 * 1024;       // ? L2 cache capacity, typically around 256 KB
    size_t l3_bytes = 8 * 1024 * 1024;  // ? L3 cache capacity, typically around 8 MB
    size_t cache_line_width = 64;       // ? In bytes: 64 on x86, sometimes 128 on ARM
    size_t cores_per_socket = 1;        // ? Never fewer than 1 core
    size_t sockets = 1;                 // ? Never fewer than 1 socket

    /** @brief Number of cores across all sockets: `cores_per_socket * sockets`. */
    size_t cores_total() const noexcept {
        size_t const across_all_sockets = cores_per_socket * sockets;
        return across_all_sockets;
    }
};
754+
755+ /* *
756+ * @brief Specifications of a typical NVIDIA GPU, such as A100 or H100.
757+ * @sa pack_sm_code, cores_per_multiprocessor helpers.
758+ * @note We recommend compiling the code for the 90a compute capability, the newest with specialized optimizations.
759+ */
744760struct gpu_specs_t {
745761 size_t vram_bytes = 40ul * 1024 * 1024 * 1024 ; // ? On A100 it's 40 GB
746762 size_t constant_memory_bytes = 64 * 1024 ; // ? On A100 it's 64 KB
@@ -749,43 +765,83 @@ struct gpu_specs_t {
749765 size_t cuda_cores = 6912 ; // ? On A100 for f32/i32 logic
750766 size_t reserved_memory_per_block = 1024 ; // ? Typically, 1 KB per block is reserved for bookkeeping
751767 size_t warp_size = 32 ; // ? Warp size is 32 threads on practically all GPUs
752- size_t max_blocks_per_multiprocessor = 0 ;
768+ size_t max_blocks_per_multiprocessor = 0 ; // ? Maximum number of blocks per SM
769+ size_t sm_code = 0 ; // ? Compute capability code as produced by `pack_sm_code`, e.g. 90 for Hopper (H100)
753770
754771 inline size_t shared_memory_per_multiprocessor () const noexcept {
755772 return shared_memory_bytes / streaming_multiprocessors;
756773 }
757774
/**
 *  @brief Converts a compute capability (major, minor) to a single decimal code: `major * 10 + minor`.
 *
 *  - 7.0, 7.2 is Volta, like V100 - maps to 70, 72
 *  - 7.5 is Turing, like RTX 2080 Ti - maps to 75
 *  - 8.0, 8.6, 8.7 is Ampere, like A100, RTX 3090 - maps to 80, 86, 87
 *  - 8.9 is Ada Lovelace, like RTX 4090 - maps to 89
 *  - 9.0 is Hopper, like H100 - maps to 90
 *  - 10.0, 10.3 is data-center Blackwell, like B200, B300 - maps to 100, 103
 *  - 12.0, 12.1 is consumer/workstation Blackwell, like RTX 5090 - maps to 120, 121
 *
 *  @param[in] major Major compute capability version.
 *  @param[in] minor Minor compute capability version; must stay below 10 for codes to be unique.
 *  @return The packed code, e.g. 86 for compute capability 8.6.
 */
inline static constexpr size_t pack_sm_code(int major, int minor) noexcept {
    // Convert each part explicitly, so the signed-to-unsigned step is visible and warning-free.
    return static_cast<size_t>(major) * 10 + static_cast<size_t>(minor);
}
786+
787+ /* *
788+ * @brief Looks up hardware specs for a given compute capability (major, minor).
789+ * @param[in] sm The compute capability code obtained from `pack_sm_code(major, minor)`.
790+ * @sa Used to populate the `cuda_cores` property.
791+ */
792+ inline static size_t cores_per_multiprocessor (size_t sm) noexcept {
759793 typedef struct {
760794 size_t sm;
761795 size_t cores;
762796 } generation_to_core_count;
763797 generation_to_core_count generations_to_core_counts[] = {
764- {(7 << 4 ) + 0 , 64 }, // Compute Capability 7.0 (V100)
765- {(7 << 4 ) + 5 , 64 }, // Compute Capability 7.5 (RTX 2080 Ti)
766- {(8 << 4 ) + 0 , 64 }, // Compute Capability 8.0 (A100)
767- {(8 << 4 ) + 6 , 128 }, // Compute Capability 8.6 (RTX 3090)
768- {(9 << 4 ) + 0 , 128 }, // Compute Capability 9.0 (H100)
798+ // Kepler architecture (2012-2014)
799+ {pack_sm_code (3 , 0 ), 192 }, // Capability 3.0 (GK104 - GTX 680, GTX 770)
800+ {pack_sm_code (3 , 5 ), 192 }, // Capability 3.5 (GK110 - GTX 780 Ti, GTX Titan, Tesla K20/K40)
801+ {pack_sm_code (3 , 7 ), 192 }, // Capability 3.7 (GK210 - Tesla K80)
802+
803+ // Maxwell architecture (2014-2016)
804+ {pack_sm_code (5 , 0 ), 128 }, // Capability 5.0 (GM107/GM108 - GTX 750/750 Ti, GTX 850M/860M)
805+ {pack_sm_code (5 , 2 ), 128 }, // Capability 5.2 (GM200/GM204/GM206 - GTX 980/970, Titan X)
806+ {pack_sm_code (5 , 3 ), 128 }, // Capability 5.3 (GM20B - Jetson TX1, Tegra X1)
807+
808+ // Pascal architecture (2016-2018)
809+ {pack_sm_code (6 , 0 ), 64 }, // Capability 6.0 (GP100 - Tesla P100) - HPC focused, different SM design
810+ {pack_sm_code (6 , 1 ), 128 }, // Capability 6.1 (GP102/GP104/GP106/GP107 - GTX 1080/1070/1060/1050, Titan X/Xp)
811+ {pack_sm_code (6 , 2 ), 128 }, // Capability 6.2 (GP10B - Jetson TX2, Tegra X2)
812+
813+ // Volta architecture (2017-2018)
814+ {pack_sm_code (7 , 0 ), 64 }, // Capability 7.0 (GV100 - Tesla V100, Titan V) - Tensor Core architecture
815+ {pack_sm_code (7 , 2 ), 64 }, // Capability 7.2 (GV11B - Jetson AGX Xavier, Tegra Xavier)
816+
817+ // Turing architecture (2018-2020)
818+ {pack_sm_code (7 , 5 ), 64 }, // Capability 7.5 (TU102/TU104/TU106/TU116/TU117 - RTX 20xx, GTX 16xx)
819+
820+ // Ampere architecture (2020-2022)
821+ {pack_sm_code (8 , 0 ), 64 }, // Capability 8.0 (GA100 - A100) - HPC focused
822+ {pack_sm_code (8 , 6 ), 128 }, // Capability 8.6 (GA102/GA104/GA106/GA107 - RTX 3090/3080/3070/3060)
823+ {pack_sm_code (8 , 7 ), 128 }, // Capability 8.7 (GA10B - Jetson AGX Orin, Tegra Orin)
824+
825+ // Ada Lovelace architecture (2022-2023)
826+ {pack_sm_code (8 , 9 ), 128 }, // Capability 8.9 (AD102/AD103/AD104/AD106/AD107 - RTX 40xx)
827+
828+ // Hopper architecture (2022-2024)
829+ {pack_sm_code (9 , 0 ), 128 }, // Capability 9.0 (GH100 - H100, H200)
830+
831+ // Blackwell architecture (2024+)
832+ {pack_sm_code (12 , 0 ), 128 }, // Capability 12.0 (GB100 - B100)
833+ {pack_sm_code (12 , 1 ), 128 }, // Capability 12.1 (GB200 - B200)
834+
769835 {0 , 0 }};
770836
771- // Create a numeric code: for SM 3.5, SM = (3 << 4 + 5) = 0x35.
772- size_t sm = ((major << 4 ) + minor);
773837 size_t index = 0 ;
774838 for (; generations_to_core_counts[index].sm != 0 ; ++index)
775839 if (generations_to_core_counts[index].sm == sm) return generations_to_core_counts[index].cores ;
776- return generations_to_core_counts[index - 1 ].cores ;
777- }
778- };
779840
780- struct cpu_specs_t {
781- size_t l1_bytes = 32 * 1024 ; // ? typically around 32 KB
782- size_t l2_bytes = 256 * 1024 ; // ? typically around 256 KB
783- size_t l3_bytes = 8 * 1024 * 1024 ; // ? typically around 8 MB
784- size_t cache_line_width = 64 ; // ? 64 bytes on x86, sometimes 128 on ARM
785- size_t cores_per_socket = 1 ; // ? at least 1 core
786- size_t sockets = 1 ; // ? at least 1 socket
787-
788- size_t cores_total () const noexcept { return cores_per_socket * sockets; }
841+ // If exact match not found, return the most recent known architecture's core count
842+ // This provides forward compatibility for newer architectures
843+ return (index > 0 ) ? generations_to_core_counts[index - 1 ].cores : 128 ;
844+ }
789845};
790846
791847/* *
0 commit comments