@@ -19,8 +19,12 @@ module Foreign.CUDA.Analysis.Device (
1919
2020#include "cbits/stubs.h"
2121
22+ import qualified Data.Set as Set
23+ import Data.Set (Set )
2224import Data.Int
25+ import Data.IORef
2326import Text.Show.Describe
27+ import System.IO.Unsafe
2428
2529import Debug.Trace
2630
@@ -179,7 +183,17 @@ data DeviceResources = DeviceResources
179183deviceResources :: DeviceProperties -> DeviceResources
180184deviceResources = resources . computeCapability
181185 where
182- -- This is mostly extracted from tables in the CUDA occupancy calculator.
186+ -- Sources:
187+ -- [1] https://github.com/NVIDIA/cuda-samples/blob/7b60178984e96bc09d066077d5455df71fee2a9f/Common/helper_cuda.h
188+ -- - for: coresPerMP (line 643 _ConvertSMVer2Cores)
189+ -- - for: architecture names (line 695 _ConvertSMVer2ArchName)
190+ -- [2] https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
191+ -- - for: maxGridsPerDevice
192+ -- - archived here: https://web.archive.org/web/20250409220108/https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
193+ -- - reproduced here: https://en.wikipedia.org/w/index.php?title=CUDA&oldid=1285775690#Technical_specification (note: link to specific page version)
194+ -- [3] NVIDIA Nsight Compute
195+ -- - for: the other fields
196+ -- - left top "Start Activity" -> "Occupancy Calculator" -> "Launch"; tab "GPU Data"
183197 --
184198 resources compute = case compute of
185199 Compute 1 0 -> resources (Compute 1 1 ) -- Tesla G80
@@ -283,7 +297,7 @@ deviceResources = resources . computeCapability
283297 }
284298 Compute 5 2 -> (resources (Compute 5 0 )) -- Maxwell GM20x
285299 { sharedMemPerMP = 98304
286- , maxRegPerBlock = 32768
300+ , maxRegPerBlock = 32768 -- value from [3], wrong in [2]?
287301 , warpAllocUnit = 2
288302 }
289303 Compute 5 3 -> (resources (Compute 5 0 )) -- Maxwell GM20B
@@ -318,9 +332,15 @@ deviceResources = resources . computeCapability
318332 }
319333 Compute 6 2 -> (resources (Compute 6 0 )) -- Pascal GP10B
320334 { coresPerMP = 128
321- , warpsPerMP = 128
322- , threadBlocksPerMP = 4096
323- , maxRegPerBlock = 32768
335+ -- Commit 4f75ea889c2ade2bd3eab377b51bb5bbd28bfbae changed warpsPerMP
336+ -- to 128, but [2] and [3] say 64 like CC 6.0; reverted back to 64 to
337+ -- match NVIDIA documentation.
338+ -- That commit also changed threadsPerMP (later mistakenly translated
339+ -- to threadBlocksPerMP in 9df19adec8efc9df761deab40cf04d27810d97d3)
340+ -- from 2048 to 4096, but again [2] and [3] retain 2048, so we keep
341+ -- it at 2048.
342+ , warpsPerMP = 64
343+ , maxRegPerBlock = 32768 -- value from [2], wrong in [3]?
324344 , warpAllocUnit = 4
325345 , maxGridsPerDevice = 16
326346 }
@@ -346,7 +366,7 @@ deviceResources = resources . computeCapability
346366
347367 Compute 7 2 -> (resources (Compute 7 0 )) -- Volta GV10B
348368 { maxGridsPerDevice = 16
349- , maxSharedMemPerBlock = 49152
369+ , maxSharedMemPerBlock = 49152 -- unsure why this is here; [2] and [3] say still 98304
350370 }
351371
352372 Compute 7 5 -> (resources (Compute 7 0 )) -- Turing TU1xx
@@ -376,14 +396,91 @@ deviceResources = resources . computeCapability
376396 , warpRegAllocUnit = 256
377397 , maxGridsPerDevice = 128
378398 }
379-
380399 Compute 8 6 -> (resources (Compute 8 0 )) -- Ampere GA102
381- { warpsPerMP = 48
400+ { coresPerMP = 128
401+ , warpsPerMP = 48
382402 , threadsPerMP = 1536
383403 , threadBlocksPerMP = 16
384404 , sharedMemPerMP = 102400
385405 , maxSharedMemPerBlock = 102400
386406 }
407+ Compute 8 7 -> (resources (Compute 8 0 )) -- Ampere
408+ { coresPerMP = 128
409+ , warpsPerMP = 48
410+ , threadsPerMP = 1536
411+ , threadBlocksPerMP = 16
412+ }
413+ Compute 8 9 -> (resources (Compute 8 0 )) -- Ada
414+ { coresPerMP = 128
415+ , warpsPerMP = 48
416+ , threadsPerMP = 1536
417+ , threadBlocksPerMP = 24
418+ , sharedMemPerMP = 102400
419+ , maxSharedMemPerBlock = 102400
420+ }
421+
422+ Compute 9 0 -> DeviceResources -- Hopper
423+ { threadsPerWarp = 32
424+ , coresPerMP = 128
425+ , warpsPerMP = 64
426+ , threadsPerMP = 2048
427+ , threadBlocksPerMP = 32
428+ , sharedMemPerMP = 233472
429+ , maxSharedMemPerBlock = 233472
430+ , regFileSizePerMP = 65536
431+ , maxRegPerBlock = 65536
432+ , regAllocUnit = 256
433+ , regAllocationStyle = Warp
434+ , maxRegPerThread = 255
435+ , sharedMemAllocUnit = 128
436+ , warpAllocUnit = 4
437+ , warpRegAllocUnit = 256
438+ , maxGridsPerDevice = 128
439+ }
440+
441+ Compute 10 0 -> DeviceResources -- Blackwell
442+ { threadsPerWarp = 32
443+ , coresPerMP = 128
444+ , warpsPerMP = 64
445+ , threadsPerMP = 2048
446+ , threadBlocksPerMP = 32
447+ , sharedMemPerMP = 233472
448+ , maxSharedMemPerBlock = 233472
449+ , regFileSizePerMP = 65536
450+ , maxRegPerBlock = 65536
451+ , regAllocUnit = 256
452+ , regAllocationStyle = Warp
453+ , maxRegPerThread = 255
454+ , sharedMemAllocUnit = 128
455+ , warpAllocUnit = 4
456+ , warpRegAllocUnit = 256
457+ , maxGridsPerDevice = 128
458+ }
459+ Compute 10 1 -> (resources (Compute 10 0 )) -- Blackwell
460+ { warpsPerMP = 48
461+ , threadsPerMP = 1536
462+ , threadBlocksPerMP = 24
463+ }
464+
465+ Compute 12 0 -> DeviceResources -- Blackwell
466+ { threadsPerWarp = 32
467+ , coresPerMP = 128
468+ , warpsPerMP = 48
469+ , threadsPerMP = 1536
470+ , threadBlocksPerMP = 24
471+ , sharedMemPerMP = 102400
472+ , maxSharedMemPerBlock = 102400
473+ , regFileSizePerMP = 65536
474+ , maxRegPerBlock = 65536
475+ , regAllocUnit = 256
476+ , regAllocationStyle = Warp
477+ , maxRegPerThread = 255
478+ , sharedMemAllocUnit = 128
479+ , warpAllocUnit = 4
480+ , warpRegAllocUnit = 256
481+ , maxGridsPerDevice = 128
482+ }
483+
387484
388485 -- Something might have gone wrong, or the library just needs to be
389486 -- updated for the next generation of hardware, in which case we just want
@@ -393,7 +490,30 @@ deviceResources = resources . computeCapability
393490 -- However, it should be OK because all library functions run in IO, so it
394491 -- is likely the user code is as well.
395492 --
396- _ -> trace warning $ resources (Compute 6 0 )
397- where warning = unlines [ " *** Warning: Unknown CUDA device compute capability: " ++ show compute
398- , " *** Please submit a bug report at https://github.com/tmcdonell/cuda/issues" ]
399-
493+ _ -> case warningForCC compute of
494+ Just warning -> trace warning defaultResources
495+ Nothing -> defaultResources
496+
497+ defaultResources = resources (Compute 6 0 )
498+
499+ -- All this logic is to ensure the warning is only shown once per unknown
500+ -- compute capability. This may not sound worth the effort, but in practice it is:
501+ -- empirically, an unknown compute capability often leads to /screenfuls/
502+ -- of warnings in accelerate-llvm-ptx otherwise.
503+ {-# NOINLINE warningForCC #-}
504+ warningForCC :: Compute -> Maybe String
505+ warningForCC compute = unsafePerformIO $ do
506+ unseen <- atomicModifyIORef' warningShown $ \ seen ->
507+ -- This is just one tree traversal; lookup-insert would be two traversals.
508+ let seen' = Set. insert compute seen
509+ in (seen', Set. size seen' > Set. size seen)
510+ return $ if unseen
511+ then Just $ unlines
512+ [ " *** Warning: Unknown CUDA device compute capability: " ++ show compute
513+ , " *** Please submit a bug report at https://github.com/tmcdonell/cuda/issues"
514+ , " *** (This warning will only be shown once for this compute capability)" ]
515+ else Nothing
516+
517+ {-# NOINLINE warningShown #-}
518+ warningShown :: IORef (Set Compute )
519+ warningShown = unsafePerformIO $ newIORef mempty
0 commit comments