|
8 | 8 | #define VLLM_LAUNCH_BLOCKS_CAP 4
|
9 | 9 | #endif
|
10 | 10 |
|
11 |
| -// compile-time estimate of max threads per SM for launch bounds. |
| 11 | +// Compile-time estimate of max threads per SM for launch bounds. |
| 12 | +// Listed architectures fall into three families: 1024, 1536, or 2048 threads/SM. |
12 | 13 | #ifndef VLLM_MAX_THREADS_PER_SM
|
13 |
| - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 |
14 |
| - #define VLLM_MAX_THREADS_PER_SM 1536 |
| 14 | + #ifdef __CUDA_ARCH__ |
| 15 | + |
| 16 | + /* 1024 thr/SM: Turing (sm_75) */ |
| 17 | + #if (__CUDA_ARCH__ == 750) |
| 18 | + #define VLLM_MAX_THREADS_PER_SM 1024 |
| 19 | + |
| 20 | + /* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89), |
| 21 | + GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */ |
| 22 | + #elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \ |
| 23 | + (__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) || \ |
| 24 | + (__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \ |
| 25 | + (__CUDA_ARCH__ == 1210) |
| 26 | + #define VLLM_MAX_THREADS_PER_SM 1536 |
| 27 | + |
| 28 | + /* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80), |
| 29 | + Hopper (sm_90), Blackwell (sm_100/103) */ |
| 30 | + #elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \ |
| 31 | + (__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) || \ |
| 32 | + (__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030) |
| 33 | + #define VLLM_MAX_THREADS_PER_SM 2048 |
| 34 | + |
| 35 | + /* Fallback: use 2048 for any CC not listed above (older or future) */ |
| 36 | + #else |
| 37 | + #define VLLM_MAX_THREADS_PER_SM 2048 |
| 38 | + #endif |
| 39 | + |
15 | 40 | #else
|
| 41 | + /* Host pass (__CUDA_ARCH__ undefined): default to 2048, same as the device fallback */ |
16 | 42 | #define VLLM_MAX_THREADS_PER_SM 2048
|
17 | 43 | #endif
|
18 | 44 | #endif
|
|
0 commit comments