@@ -19,8 +19,12 @@ module Foreign.CUDA.Analysis.Device (
1919
2020#include "cbits/stubs.h"
2121
22+ import qualified Data.Set as Set
23+ import Data.Set (Set )
2224import Data.Int
25+ import Data.IORef
2326import Text.Show.Describe
27+ import System.IO.Unsafe
2428
2529import Debug.Trace
2630
@@ -179,7 +183,17 @@ data DeviceResources = DeviceResources
179183deviceResources :: DeviceProperties -> DeviceResources
180184deviceResources = resources . computeCapability
181185 where
182- -- This is mostly extracted from tables in the CUDA occupancy calculator.
186+ -- Sources:
187+ -- [1] https://github.com/NVIDIA/cuda-samples/blob/7b60178984e96bc09d066077d5455df71fee2a9f/Common/helper_cuda.h
188+ -- - for: coresPerMP (line 643 _ConvertSMVer2Cores)
189+ -- - for: architecture names (line 695 _ConvertSMVer2ArchName)
190+ -- [2] https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
191+ -- - for: maxGridsPerDevice
192+ -- - archived here: https://web.archive.org/web/20250409220108/https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
193+ -- - reproduced here: https://en.wikipedia.org/w/index.php?title=CUDA&oldid=1285775690#Technical_specification (note: link to specific page version)
194+ -- [3] NVIDIA Nsight Compute
195+ -- - for: the other fields
196+ -- - left top "Start Activity" -> "Occupancy Calculator" -> "Launch"; tab "GPU Data"
183197 --
184198 resources compute = case compute of
185199 Compute 1 0 -> resources (Compute 1 1 ) -- Tesla G80
@@ -283,7 +297,7 @@ deviceResources = resources . computeCapability
283297 }
284298 Compute 5 2 -> (resources (Compute 5 0 )) -- Maxwell GM20x
285299 { sharedMemPerMP = 98304
286- , maxRegPerBlock = 32768
300+ , maxRegPerBlock = 32768 -- value from [3], wrong in [2]?
287301 , warpAllocUnit = 2
288302 }
289303 Compute 5 3 -> (resources (Compute 5 0 )) -- Maxwell GM20B
@@ -318,9 +332,15 @@ deviceResources = resources . computeCapability
318332 }
319333 Compute 6 2 -> (resources (Compute 6 0 )) -- Pascal GP10B
320334 { coresPerMP = 128
321- , warpsPerMP = 128
322- , threadBlocksPerMP = 4096
323- , maxRegPerBlock = 32768
335+ -- Commit 4f75ea889c2ade2bd3eab377b51bb5bbd28bfbae changed warpsPerMP
336+ -- to 128, but [2] and [3] say 64 like CC 6.0; reverted back to 64 to
337+ -- match NVIDIA documentation.
338+ -- That commit also changed threadsPerMP (later mistakenly translated
339+ -- to threadBlocksPerMP in 9df19adec8efc9df761deab40cf04d27810d97d3)
340+ -- from 2048 to 4096, but again [2] and [3] retain 2048, so we keep
341+ -- it at 2048.
342+ , warpsPerMP = 64
343+ , maxRegPerBlock = 32768 -- value from [2], wrong in [3]?
324344 , warpAllocUnit = 4
325345 , maxGridsPerDevice = 16
326346 }
@@ -346,7 +366,7 @@ deviceResources = resources . computeCapability
346366
347367 Compute 7 2 -> (resources (Compute 7 0 )) -- Volta GV10B
348368 { maxGridsPerDevice = 16
349- , maxSharedMemPerBlock = 49152
369+ , maxSharedMemPerBlock = 49152 -- unsure why this is here; [2] and [3] say still 98304
350370 }
351371
352372 Compute 7 5 -> (resources (Compute 7 0 )) -- Turing TU1xx
@@ -376,14 +396,91 @@ deviceResources = resources . computeCapability
376396 , warpRegAllocUnit = 256
377397 , maxGridsPerDevice = 128
378398 }
379-
380399 Compute 8 6 -> (resources (Compute 8 0 )) -- Ampere GA102
381- { warpsPerMP = 48
400+ { coresPerMP = 128
401+ , warpsPerMP = 48
382402 , threadsPerMP = 1536
383403 , threadBlocksPerMP = 16
384404 , sharedMemPerMP = 102400
385405 , maxSharedMemPerBlock = 102400
386406 }
407+ Compute 8 7 -> (resources (Compute 8 0 )) -- Ampere
408+ { coresPerMP = 128
409+ , warpsPerMP = 48
410+ , threadsPerMP = 1536
411+ , threadBlocksPerMP = 16
412+ }
413+ Compute 8 9 -> (resources (Compute 8 0 )) -- Ada
414+ { coresPerMP = 128
415+ , warpsPerMP = 48
416+ , threadsPerMP = 1536
417+ , threadBlocksPerMP = 24
418+ , sharedMemPerMP = 102400
419+ , maxSharedMemPerBlock = 102400
420+ }
421+
422+ Compute 9 0 -> DeviceResources -- Hopper
423+ { threadsPerWarp = 32
424+ , coresPerMP = 128
425+ , warpsPerMP = 64
426+ , threadsPerMP = 2048
427+ , threadBlocksPerMP = 32
428+ , sharedMemPerMP = 233472
429+ , maxSharedMemPerBlock = 233472
430+ , regFileSizePerMP = 65536
431+ , maxRegPerBlock = 65536
432+ , regAllocUnit = 256
433+ , regAllocationStyle = Warp
434+ , maxRegPerThread = 255
435+ , sharedMemAllocUnit = 128
436+ , warpAllocUnit = 4
437+ , warpRegAllocUnit = 256
438+ , maxGridsPerDevice = 128
439+ }
440+
441+ Compute 10 0 -> DeviceResources -- Blackwell
442+ { threadsPerWarp = 32
443+ , coresPerMP = 128
444+ , warpsPerMP = 64
445+ , threadsPerMP = 2048
446+ , threadBlocksPerMP = 32
447+ , sharedMemPerMP = 233472
448+ , maxSharedMemPerBlock = 233472
449+ , regFileSizePerMP = 65536
450+ , maxRegPerBlock = 65536
451+ , regAllocUnit = 256
452+ , regAllocationStyle = Warp
453+ , maxRegPerThread = 255
454+ , sharedMemAllocUnit = 128
455+ , warpAllocUnit = 4
456+ , warpRegAllocUnit = 256
457+ , maxGridsPerDevice = 128
458+ }
459+ Compute 10 1 -> (resources (Compute 10 0 )) -- Blackwell
460+ { warpsPerMP = 48
461+ , threadsPerMP = 1536
462+ , threadBlocksPerMP = 24
463+ }
464+
465+ Compute 12 0 -> DeviceResources -- Blackwell
466+ { threadsPerWarp = 32
467+ , coresPerMP = 128
468+ , warpsPerMP = 48
469+ , threadsPerMP = 1536
470+ , threadBlocksPerMP = 24
471+ , sharedMemPerMP = 102400
472+ , maxSharedMemPerBlock = 102400
473+ , regFileSizePerMP = 65536
474+ , maxRegPerBlock = 65536
475+ , regAllocUnit = 256
476+ , regAllocationStyle = Warp
477+ , maxRegPerThread = 255
478+ , sharedMemAllocUnit = 128
479+ , warpAllocUnit = 4
480+ , warpRegAllocUnit = 256
481+ , maxGridsPerDevice = 128
482+ }
483+
387484
388485 -- Something might have gone wrong, or the library just needs to be
389486 -- updated for the next generation of hardware, in which case we just want
@@ -393,7 +490,30 @@ deviceResources = resources . computeCapability
393490 -- However, it should be OK because all library functions run in IO, so it
394491 -- is likely the user code is as well.
395492 --
396- _ -> trace warning $ resources (Compute 6 0 )
397- where warning = unlines [ " *** Warning: Unknown CUDA device compute capability: " ++ show compute
398- , " *** Please submit a bug report at https://github.com/tmcdonell/cuda/issues" ]
399-
493+ _ -> case warningForCC compute of
494+ Just warning -> trace warning defaultResources
495+ Nothing -> defaultResources
496+
497+ defaultResources = resources (Compute 6 0 )
498+
499+ -- All this logic is to ensure the warning is only shown once per unknown
500+ -- compute capability. This may not sound worth the effort, but in practice it is:
501+ -- empirically, an unknown compute capability often leads to /screenfuls/
502+ -- of warnings in accelerate-llvm-ptx otherwise.
503+ {-# NOINLINE warningForCC #-}
504+ warningForCC :: Compute -> Maybe String
505+ warningForCC compute = unsafePerformIO $ do
506+ unseen <- atomicModifyIORef' warningShown $ \ seen ->
507+ -- This is just one tree traversal; lookup-insert would be two traversals.
508+ let seen' = Set. insert compute seen
509+ in (seen', Set. size seen' > Set. size seen)
510+ return $ if unseen
511+ then Just $ unlines
512+ [ " *** Warning: Unknown CUDA device compute capability: " ++ show compute
513+ , " *** Please submit a bug report at https://github.com/tmcdonell/cuda/issues"
514+ , " *** (This warning will only be shown once for this compute capability)" ]
515+ else Nothing
516+
517+ {-# NOINLINE warningShown #-}
518+ warningShown :: IORef (Set Compute )
519+ warningShown = unsafePerformIO $ newIORef mempty
0 commit comments